1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
18 #include <sys/types.h>
23 #include <sys/param.h>
27 #include <sys/utsname.h>
31 #include <boost/lexical_cast.hpp>
32 #include <boost/fusion/include/std_pair.hpp>
34 #include "common/async/waiter.h"
36 #if defined(__FreeBSD__) || defined(_WIN32)
37 #define XATTR_CREATE 0x1
38 #define XATTR_REPLACE 0x2
40 #include <sys/xattr.h>
43 #if defined(__linux__)
44 #include <linux/falloc.h>
47 #include <sys/statvfs.h>
49 #include "common/config.h"
50 #include "common/version.h"
51 #include "common/async/blocked_completion.h"
53 #include "mon/MonClient.h"
55 #include "messages/MClientCaps.h"
56 #include "messages/MClientLease.h"
57 #include "messages/MClientQuota.h"
58 #include "messages/MClientReclaim.h"
59 #include "messages/MClientReclaimReply.h"
60 #include "messages/MClientReconnect.h"
61 #include "messages/MClientReply.h"
62 #include "messages/MClientRequest.h"
63 #include "messages/MClientRequestForward.h"
64 #include "messages/MClientSession.h"
65 #include "messages/MClientSnap.h"
66 #include "messages/MClientMetrics.h"
67 #include "messages/MCommandReply.h"
68 #include "messages/MFSMap.h"
69 #include "messages/MFSMapUser.h"
70 #include "messages/MMDSMap.h"
71 #include "messages/MOSDMap.h"
73 #include "mds/flock.h"
74 #include "mds/cephfs_features.h"
75 #include "osd/OSDMap.h"
76 #include "osdc/Filer.h"
78 #include "common/Cond.h"
79 #include "common/perf_counters.h"
80 #include "common/admin_socket.h"
81 #include "common/errno.h"
82 #include "include/str_list.h"
84 #define dout_subsys ceph_subsys_client
86 #include "include/lru.h"
87 #include "include/compat.h"
88 #include "include/stringify.h"
89 #include "include/random.h"
94 #include "Delegation.h"
96 #include "ClientSnapRealm.h"
98 #include "MetaSession.h"
99 #include "MetaRequest.h"
100 #include "ObjecterWriteback.h"
101 #include "posix_acl.h"
103 #include "include/ceph_assert.h"
104 #include "include/stat.h"
106 #include "include/cephfs/ceph_ll_client.h"
108 #if HAVE_GETGROUPLIST
115 #define dout_prefix *_dout << "client." << whoami << " "
117 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
119 // FreeBSD fails to define this
123 // Darwin fails to define this
132 // Windows doesn't define those values. While the Posix compatibilty layer
133 // doesn't support those values, the Windows native functions do provide
134 // similar flags. Special care should be taken if we're going to use those
135 // flags in ceph-dokan. The current values are no-ops, while propagating
136 // them to the rest of the code might cause the Windows functions to reject
139 #define O_NOFOLLOW 0x0
146 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
149 #define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH)
160 using namespace TOPNSPC::common
;
162 namespace bs
= boost::system
;
163 namespace ca
= ceph::async
;
165 void client_flush_set_callback(void *p
, ObjectCacher::ObjectSet
*oset
)
167 Client
*client
= static_cast<Client
*>(p
);
168 client
->flush_set_callback(oset
);
171 bool Client::is_reserved_vino(vinodeno_t
&vino
) {
172 if (MDS_IS_PRIVATE_INO(vino
.ino
)) {
173 ldout(cct
, -1) << __func__
<< " attempt to access reserved inode number " << vino
<< dendl
;
179 // running average and standard deviation -- presented in
180 // Donald Knuth's TAoCP, Volume II.
// Incremental (running) mean update, per Knuth TAoCP Vol. II:
//   avg_n = avg_{n-1} + (x_n - avg_{n-1}) / n
// `count` is the total number of samples seen, including `value`.
double calc_average(double old_avg, double value, uint64_t count) {
  if (count == 1) {
    // First sample: the running average is the sample itself.
    return value;
  }
  return old_avg + ((value - old_avg) / count);
}
// Incremental sum-of-squared-deviations update (Welford-style recurrence,
// Knuth TAoCP Vol. II):
//   S_n = S_{n-1} + (x_n - mean_{n-1}) * (x_n - mean_n)
// Used together with calc_average() to derive a running variance/stdev.
double calc_sq_sum(double old_sq_sum, double old_mean, double new_mean,
                   double value, uint64_t count) {
  if (count == 1) {
    // A single sample has zero deviation from its own mean.
    return 0.0;
  }
  return old_sq_sum + (value - old_mean) * (value - new_mean);
}
206 Client::CommandHook::CommandHook(Client
*client
) :
211 int Client::CommandHook::call(
212 std::string_view command
,
213 const cmdmap_t
& cmdmap
,
218 f
->open_object_section("result");
220 std::scoped_lock l
{m_client
->client_lock
};
221 if (command
== "mds_requests")
222 m_client
->dump_mds_requests(f
);
223 else if (command
== "mds_sessions") {
224 bool cap_dump
= false;
225 cmd_getval(cmdmap
, "cap_dump", cap_dump
);
226 m_client
->dump_mds_sessions(f
, cap_dump
);
227 } else if (command
== "dump_cache")
228 m_client
->dump_cache(f
);
229 else if (command
== "kick_stale_sessions")
230 m_client
->_kick_stale_sessions();
231 else if (command
== "status")
232 m_client
->dump_status(f
);
234 ceph_abort_msg("bad command registered");
243 int Client::get_fd_inode(int fd
, InodeRef
*in
) {
245 if (fd
== CEPHFS_AT_FDCWD
) {
248 Fh
*f
= get_filehandle(fd
);
258 dir_result_t::dir_result_t(Inode
*in
, const UserPerm
& perms
)
259 : inode(in
), offset(0), next_offset(2),
260 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
264 void Client::_reset_faked_inos()
267 free_faked_inos
.clear();
268 free_faked_inos
.insert(start
, (uint32_t)-1 - start
+ 1);
269 last_used_faked_ino
= 0;
270 last_used_faked_root
= 0;
272 // On Windows, sizeof(ino_t) is just 2. Despite that, most "native"
273 // Windows structures, including Dokan ones, are using 64B identifiers.
274 _use_faked_inos
= false;
276 _use_faked_inos
= sizeof(ino_t
) < 8 || cct
->_conf
->client_use_faked_inos
;
280 void Client::_assign_faked_ino(Inode
*in
)
282 if (0 == last_used_faked_ino
)
283 last_used_faked_ino
= last_used_faked_ino
+ 2048; // start(1024)~2048 reserved for _assign_faked_root
284 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
285 if (it
== free_faked_inos
.end() && last_used_faked_ino
> 0) {
286 last_used_faked_ino
= 2048;
287 it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
289 ceph_assert(it
!= free_faked_inos
.end());
290 if (last_used_faked_ino
< it
.get_start()) {
291 ceph_assert(it
.get_len() > 0);
292 last_used_faked_ino
= it
.get_start();
294 ++last_used_faked_ino
;
295 ceph_assert(it
.get_start() + it
.get_len() > last_used_faked_ino
);
297 in
->faked_ino
= last_used_faked_ino
;
298 free_faked_inos
.erase(in
->faked_ino
);
299 faked_ino_map
[in
->faked_ino
] = in
->vino();
303 * In the faked mode, if you export multiple subdirectories,
304 * you will see that the inode numbers of the exported subdirectories
305 * are the same. so we distinguish the mount point by reserving
306 * the "fake ids" between "1024~2048" and combining the last
307 * 10bits(0x3ff) of the "root inodes".
309 void Client::_assign_faked_root(Inode
*in
)
311 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
312 if (it
== free_faked_inos
.end() && last_used_faked_root
> 0) {
313 last_used_faked_root
= 0;
314 it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
316 ceph_assert(it
!= free_faked_inos
.end());
317 vinodeno_t inode_info
= in
->vino();
318 uint64_t inode_num
= (uint64_t)inode_info
.ino
;
319 ldout(cct
, 10) << "inode_num " << inode_num
<< "inode_num & 0x3ff=" << (inode_num
& 0x3ff)<< dendl
;
320 last_used_faked_root
= it
.get_start() + (inode_num
& 0x3ff); // 0x3ff mask and get_start will not exceed 2048
321 ceph_assert(it
.get_start() + it
.get_len() > last_used_faked_root
);
323 in
->faked_ino
= last_used_faked_root
;
324 free_faked_inos
.erase(in
->faked_ino
);
325 faked_ino_map
[in
->faked_ino
] = in
->vino();
328 void Client::_release_faked_ino(Inode
*in
)
330 free_faked_inos
.insert(in
->faked_ino
);
331 faked_ino_map
.erase(in
->faked_ino
);
334 vinodeno_t
Client::_map_faked_ino(ino_t ino
)
339 else if (faked_ino_map
.count(ino
))
340 vino
= faked_ino_map
[ino
];
342 vino
= vinodeno_t(0, CEPH_NOSNAP
);
343 ldout(cct
, 10) << __func__
<< " " << ino
<< " -> " << vino
<< dendl
;
347 vinodeno_t
Client::map_faked_ino(ino_t ino
)
349 std::scoped_lock
lock(client_lock
);
350 return _map_faked_ino(ino
);
355 Client::Client(Messenger
*m
, MonClient
*mc
, Objecter
*objecter_
)
356 : Dispatcher(m
->cct
->get()),
357 timer(m
->cct
, timer_lock
, false),
361 whoami(mc
->get_global_id()),
362 mount_state(CLIENT_UNMOUNTED
, "Client::mountstate_lock"),
363 initialize_state(CLIENT_NEW
, "Client::initstate_lock"),
364 cct_deleter
{m
->cct
, [](CephContext
*p
) {p
->put();}},
365 async_ino_invalidator(m
->cct
),
366 async_dentry_invalidator(m
->cct
),
367 interrupt_finisher(m
->cct
),
368 remount_finisher(m
->cct
),
369 async_ino_releasor(m
->cct
),
370 objecter_finisher(m
->cct
),
371 m_command_hook(this),
376 user_id
= cct
->_conf
->client_mount_uid
;
377 group_id
= cct
->_conf
->client_mount_gid
;
378 fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
379 "fuse_default_permissions");
381 _collect_and_send_global_metrics
= cct
->_conf
.get_val
<bool>(
382 "client_collect_and_send_global_metrics");
384 mount_timeout
= cct
->_conf
.get_val
<std::chrono::seconds
>(
385 "client_mount_timeout");
387 caps_release_delay
= cct
->_conf
.get_val
<std::chrono::seconds
>(
388 "client_caps_release_delay");
390 if (cct
->_conf
->client_acl_type
== "posix_acl")
391 acl_type
= POSIX_ACL
;
393 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
396 free_fd_set
.insert(10, 1<<30);
398 mdsmap
.reset(new MDSMap
);
401 writeback_handler
.reset(new ObjecterWriteback(objecter
, &objecter_finisher
,
403 objectcacher
.reset(new ObjectCacher(cct
, "libcephfs", *writeback_handler
, client_lock
,
404 client_flush_set_callback
, // all commit callback
406 cct
->_conf
->client_oc_size
,
407 cct
->_conf
->client_oc_max_objects
,
408 cct
->_conf
->client_oc_max_dirty
,
409 cct
->_conf
->client_oc_target_dirty
,
410 cct
->_conf
->client_oc_max_dirty_age
,
417 ceph_assert(ceph_mutex_is_not_locked(client_lock
));
419 // If the task is crashed or aborted and doesn't
420 // get any chance to run the umount and shutdow.
422 std::scoped_lock l
{client_lock
};
423 tick_thread_stopped
= true;
424 upkeep_cond
.notify_one();
427 if (upkeeper
.joinable())
430 // It is necessary to hold client_lock, because any inode destruction
431 // may call into ObjectCacher, which asserts that it's lock (which is
432 // client_lock) is held.
433 std::scoped_lock l
{client_lock
};
437 void Client::tear_down_cache()
440 for (auto &[fd
, fh
] : fd_map
) {
441 ldout(cct
, 1) << __func__
<< " forcing close of fh " << fd
<< " ino " << fh
->inode
->ino
<< dendl
;
446 while (!opened_dirs
.empty()) {
447 dir_result_t
*dirp
= *opened_dirs
.begin();
448 ldout(cct
, 1) << __func__
<< " forcing close of dir " << dirp
<< " ino " << dirp
->inode
->ino
<< dendl
;
457 ceph_assert(lru
.lru_get_size() == 0);
460 ceph_assert(inode_map
.size() <= 1 + root_parents
.size());
461 if (root
&& inode_map
.size() == 1 + root_parents
.size()) {
465 ceph_assert(inode_map
.empty());
468 inodeno_t
Client::get_root_ino()
470 std::scoped_lock
l(client_lock
);
471 if (use_faked_inos())
472 return root
->faked_ino
;
477 Inode
*Client::get_root()
479 std::scoped_lock
l(client_lock
);
487 void Client::dump_inode(Formatter
*f
, Inode
*in
, set
<Inode
*>& did
, bool disconnected
)
490 in
->make_long_path(path
);
491 ldout(cct
, 1) << "dump_inode: "
492 << (disconnected
? "DISCONNECTED ":"")
493 << "inode " << in
->ino
495 << " ref " << in
->get_nref()
496 << " " << *in
<< dendl
;
499 f
->open_object_section("inode");
500 f
->dump_stream("path") << path
;
502 f
->dump_int("disconnected", 1);
509 ldout(cct
, 1) << " dir " << in
->dir
<< " size " << in
->dir
->dentries
.size() << dendl
;
510 for (ceph::unordered_map
<string
, Dentry
*>::iterator it
= in
->dir
->dentries
.begin();
511 it
!= in
->dir
->dentries
.end();
513 ldout(cct
, 1) << " " << in
->ino
<< " dn " << it
->first
<< " " << it
->second
<< " ref " << it
->second
->ref
<< dendl
;
515 f
->open_object_section("dentry");
519 if (it
->second
->inode
)
520 dump_inode(f
, it
->second
->inode
.get(), did
, false);
525 void Client::dump_cache(Formatter
*f
)
529 ldout(cct
, 1) << __func__
<< dendl
;
532 f
->open_array_section("cache");
535 dump_inode(f
, root
.get(), did
, true);
537 // make a second pass to catch anything disconnected
538 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
539 it
!= inode_map
.end();
541 if (did
.count(it
->second
))
543 dump_inode(f
, it
->second
, did
, true);
550 void Client::dump_status(Formatter
*f
)
552 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
554 ldout(cct
, 1) << __func__
<< dendl
;
556 const epoch_t osd_epoch
557 = objecter
->with_osdmap(std::mem_fn(&OSDMap::get_epoch
));
560 f
->open_object_section("metadata");
561 for (const auto& kv
: metadata
)
562 f
->dump_string(kv
.first
.c_str(), kv
.second
);
565 f
->dump_int("dentry_count", lru
.lru_get_size());
566 f
->dump_int("dentry_pinned_count", lru
.lru_get_num_pinned());
567 f
->dump_int("id", get_nodeid().v
);
568 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr_legacy());
569 f
->dump_object("inst", inst
);
570 f
->dump_object("addr", inst
.addr
);
571 f
->dump_stream("inst_str") << inst
.name
<< " " << inst
.addr
.get_legacy_str();
572 f
->dump_string("addr_str", inst
.addr
.get_legacy_str());
573 f
->dump_int("inode_count", inode_map
.size());
574 f
->dump_int("mds_epoch", mdsmap
->get_epoch());
575 f
->dump_int("osd_epoch", osd_epoch
);
576 f
->dump_int("osd_epoch_barrier", cap_epoch_barrier
);
577 f
->dump_bool("blocklisted", blocklisted
);
578 f
->dump_string("fs_name", mdsmap
->get_fs_name());
582 void Client::_pre_init()
586 objecter_finisher
.start();
587 filer
.reset(new Filer(objecter
, &objecter_finisher
));
589 objectcacher
->start();
594 RWRef_t
iref_writer(initialize_state
, CLIENT_INITIALIZING
, false);
595 ceph_assert(iref_writer
.is_first_writer());
599 std::scoped_lock l
{client_lock
};
600 messenger
->add_dispatcher_tail(this);
603 iref_writer
.update_state(CLIENT_INITIALIZED
);
607 void Client::_finish_init()
610 std::scoped_lock l
{client_lock
};
612 PerfCountersBuilder
plb(cct
, "client", l_c_first
, l_c_last
);
613 plb
.add_time_avg(l_c_reply
, "reply", "Latency of receiving a reply on metadata request");
614 plb
.add_time_avg(l_c_lat
, "lat", "Latency of processing a metadata request");
615 plb
.add_time_avg(l_c_wrlat
, "wrlat", "Latency of a file data write operation");
616 plb
.add_time_avg(l_c_read
, "rdlat", "Latency of a file data read operation");
617 plb
.add_time_avg(l_c_fsync
, "fsync", "Latency of a file sync operation");
618 // average, standard deviation mds/r/w/ latencies
619 plb
.add_time(l_c_md_avg
, "mdavg", "Average latency for processing metadata requests");
620 plb
.add_u64(l_c_md_sqsum
, "mdsqsum", "Sum of squares (to calculate variability/stdev) for metadata requests");
621 plb
.add_u64(l_c_md_ops
, "mdops", "Total metadata IO operations");
622 plb
.add_time(l_c_rd_avg
, "readavg", "Average latency for processing read requests");
623 plb
.add_u64(l_c_rd_sqsum
, "readsqsum", "Sum of squares ((to calculate variability/stdev) for read requests");
624 plb
.add_u64(l_c_rd_ops
, "rdops", "Total read IO operations");
625 plb
.add_time(l_c_wr_avg
, "writeavg", "Average latency for processing write requests");
626 plb
.add_u64(l_c_wr_sqsum
, "writesqsum", "Sum of squares ((to calculate variability/stdev) for write requests");
627 plb
.add_u64(l_c_wr_ops
, "rdops", "Total write IO operations");
628 logger
.reset(plb
.create_perf_counters());
629 cct
->get_perfcounters_collection()->add(logger
.get());
632 cct
->_conf
.add_observer(this);
634 AdminSocket
* admin_socket
= cct
->get_admin_socket();
635 int ret
= admin_socket
->register_command("mds_requests",
637 "show in-progress mds requests");
639 lderr(cct
) << "error registering admin socket command: "
640 << cpp_strerror(-ret
) << dendl
;
642 ret
= admin_socket
->register_command("mds_sessions "
643 "name=cap_dump,type=CephBool,req=false",
645 "show mds session state");
647 lderr(cct
) << "error registering admin socket command: "
648 << cpp_strerror(-ret
) << dendl
;
650 ret
= admin_socket
->register_command("dump_cache",
652 "show in-memory metadata cache contents");
654 lderr(cct
) << "error registering admin socket command: "
655 << cpp_strerror(-ret
) << dendl
;
657 ret
= admin_socket
->register_command("kick_stale_sessions",
659 "kick sessions that were remote reset");
661 lderr(cct
) << "error registering admin socket command: "
662 << cpp_strerror(-ret
) << dendl
;
664 ret
= admin_socket
->register_command("status",
666 "show overall client status");
668 lderr(cct
) << "error registering admin socket command: "
669 << cpp_strerror(-ret
) << dendl
;
673 void Client::shutdown()
675 ldout(cct
, 1) << __func__
<< dendl
;
677 // If we were not mounted, but were being used for sending
678 // MDS commands, we may have sessions that need closing.
680 std::scoped_lock l
{client_lock
};
682 // To make sure the tick thread will be stoppped before
683 // destructing the Client, just in case like the _mount()
684 // failed but didn't not get a chance to stop the tick
686 tick_thread_stopped
= true;
687 upkeep_cond
.notify_one();
691 cct
->_conf
.remove_observer(this);
693 cct
->get_admin_socket()->unregister_commands(&m_command_hook
);
695 if (ino_invalidate_cb
) {
696 ldout(cct
, 10) << "shutdown stopping cache invalidator finisher" << dendl
;
697 async_ino_invalidator
.wait_for_empty();
698 async_ino_invalidator
.stop();
701 if (dentry_invalidate_cb
) {
702 ldout(cct
, 10) << "shutdown stopping dentry invalidator finisher" << dendl
;
703 async_dentry_invalidator
.wait_for_empty();
704 async_dentry_invalidator
.stop();
707 if (switch_interrupt_cb
) {
708 ldout(cct
, 10) << "shutdown stopping interrupt finisher" << dendl
;
709 interrupt_finisher
.wait_for_empty();
710 interrupt_finisher
.stop();
714 ldout(cct
, 10) << "shutdown stopping remount finisher" << dendl
;
715 remount_finisher
.wait_for_empty();
716 remount_finisher
.stop();
719 if (ino_release_cb
) {
720 ldout(cct
, 10) << "shutdown stopping inode release finisher" << dendl
;
721 async_ino_releasor
.wait_for_empty();
722 async_ino_releasor
.stop();
725 objectcacher
->stop(); // outside of client_lock! this does a join.
728 * We are shuting down the client.
730 * Just declare the state to CLIENT_NEW to block and fail any
731 * new comming "reader" and then try to wait all the in-flight
732 * "readers" to finish.
734 RWRef_t
iref_writer(initialize_state
, CLIENT_NEW
, false);
735 if (!iref_writer
.is_first_writer())
737 iref_writer
.wait_readers_done();
740 std::scoped_lock
l(timer_lock
);
744 objecter_finisher
.wait_for_empty();
745 objecter_finisher
.stop();
748 cct
->get_perfcounters_collection()->remove(logger
.get());
753 void Client::update_io_stat_metadata(utime_t latency
) {
754 auto lat_nsec
= latency
.to_nsec();
755 // old values are used to compute new ones
756 auto o_avg
= logger
->tget(l_c_md_avg
).to_nsec();
757 auto o_sqsum
= logger
->get(l_c_md_sqsum
);
759 auto n_avg
= calc_average(o_avg
, lat_nsec
, nr_metadata_request
);
760 auto n_sqsum
= calc_sq_sum(o_sqsum
, o_avg
, n_avg
, lat_nsec
,
761 nr_metadata_request
);
763 logger
->tinc(l_c_lat
, latency
);
764 logger
->tinc(l_c_reply
, latency
);
767 avg
.set_from_double(n_avg
/ 1000000000);
768 logger
->tset(l_c_md_avg
, avg
);
769 logger
->set(l_c_md_sqsum
, n_sqsum
);
770 logger
->set(l_c_md_ops
, nr_metadata_request
);
773 void Client::update_io_stat_read(utime_t latency
) {
774 auto lat_nsec
= latency
.to_nsec();
775 // old values are used to compute new ones
776 auto o_avg
= logger
->tget(l_c_rd_avg
).to_nsec();
777 auto o_sqsum
= logger
->get(l_c_rd_sqsum
);
779 auto n_avg
= calc_average(o_avg
, lat_nsec
, nr_read_request
);
780 auto n_sqsum
= calc_sq_sum(o_sqsum
, o_avg
, n_avg
, lat_nsec
,
783 logger
->tinc(l_c_read
, latency
);
786 avg
.set_from_double(n_avg
/ 1000000000);
787 logger
->tset(l_c_rd_avg
, avg
);
788 logger
->set(l_c_rd_sqsum
, n_sqsum
);
789 logger
->set(l_c_rd_ops
, nr_read_request
);
792 void Client::update_io_stat_write(utime_t latency
) {
793 auto lat_nsec
= latency
.to_nsec();
794 // old values are used to compute new ones
795 auto o_avg
= logger
->tget(l_c_wr_avg
).to_nsec();
796 auto o_sqsum
= logger
->get(l_c_wr_sqsum
);
798 auto n_avg
= calc_average(o_avg
, lat_nsec
, nr_write_request
);
799 auto n_sqsum
= calc_sq_sum(o_sqsum
, o_avg
, n_avg
, lat_nsec
,
802 logger
->tinc(l_c_wrlat
, latency
);
805 avg
.set_from_double(n_avg
/ 1000000000);
806 logger
->tset(l_c_wr_avg
, avg
);
807 logger
->set(l_c_wr_sqsum
, n_sqsum
);
808 logger
->set(l_c_wr_ops
, nr_write_request
);
811 // ===================
812 // metadata cache stuff
814 void Client::trim_cache(bool trim_kernel_dcache
)
816 uint64_t max
= cct
->_conf
->client_cache_size
;
817 ldout(cct
, 20) << "trim_cache size " << lru
.lru_get_size() << " max " << max
<< dendl
;
819 while (lru
.lru_get_size() != last
) {
820 last
= lru
.lru_get_size();
822 if (!is_unmounting() && lru
.lru_get_size() <= max
) break;
825 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_get_next_expire());
832 if (trim_kernel_dcache
&& lru
.lru_get_size() > max
)
833 _invalidate_kernel_dcache();
836 if (lru
.lru_get_size() == 0 && root
&& root
->get_nref() == 1 && inode_map
.size() == 1 + root_parents
.size()) {
837 ldout(cct
, 15) << "trim_cache trimmed root " << root
<< dendl
;
842 void Client::trim_cache_for_reconnect(MetaSession
*s
)
844 mds_rank_t mds
= s
->mds_num
;
845 ldout(cct
, 20) << __func__
<< " mds." << mds
<< dendl
;
848 list
<Dentry
*> skipped
;
849 while (lru
.lru_get_size() > 0) {
850 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_expire());
854 if ((dn
->inode
&& dn
->inode
->caps
.count(mds
)) ||
855 dn
->dir
->parent_inode
->caps
.count(mds
)) {
859 skipped
.push_back(dn
);
862 for(list
<Dentry
*>::iterator p
= skipped
.begin(); p
!= skipped
.end(); ++p
)
863 lru
.lru_insert_mid(*p
);
865 ldout(cct
, 20) << __func__
<< " mds." << mds
866 << " trimmed " << trimmed
<< " dentries" << dendl
;
868 if (s
->caps
.size() > 0)
869 _invalidate_kernel_dcache();
872 void Client::trim_dentry(Dentry
*dn
)
874 ldout(cct
, 15) << "trim_dentry unlinking dn " << dn
->name
876 << std::hex
<< dn
->dir
->parent_inode
->ino
<< std::dec
879 Inode
*diri
= dn
->dir
->parent_inode
;
880 clear_dir_complete_and_ordered(diri
, true);
882 unlink(dn
, false, false); // drop dir, drop dentry
886 void Client::update_inode_file_size(Inode
*in
, int issued
, uint64_t size
,
887 uint64_t truncate_seq
, uint64_t truncate_size
)
889 uint64_t prior_size
= in
->size
;
891 if (truncate_seq
> in
->truncate_seq
||
892 (truncate_seq
== in
->truncate_seq
&& size
> in
->size
)) {
893 ldout(cct
, 10) << "size " << in
->size
<< " -> " << size
<< dendl
;
895 in
->reported_size
= size
;
896 if (truncate_seq
!= in
->truncate_seq
) {
897 ldout(cct
, 10) << "truncate_seq " << in
->truncate_seq
<< " -> "
898 << truncate_seq
<< dendl
;
899 in
->truncate_seq
= truncate_seq
;
900 in
->oset
.truncate_seq
= truncate_seq
;
902 // truncate cached file data
903 if (prior_size
> size
) {
904 _invalidate_inode_cache(in
, truncate_size
, prior_size
- truncate_size
);
908 // truncate inline data
909 if (in
->inline_version
< CEPH_INLINE_NONE
) {
910 uint32_t len
= in
->inline_data
.length();
912 in
->inline_data
.splice(size
, len
- size
);
915 if (truncate_seq
>= in
->truncate_seq
&&
916 in
->truncate_size
!= truncate_size
) {
918 ldout(cct
, 10) << "truncate_size " << in
->truncate_size
<< " -> "
919 << truncate_size
<< dendl
;
920 in
->truncate_size
= truncate_size
;
921 in
->oset
.truncate_size
= truncate_size
;
923 ldout(cct
, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl
;
928 void Client::update_inode_file_time(Inode
*in
, int issued
, uint64_t time_warp_seq
,
929 utime_t ctime
, utime_t mtime
, utime_t atime
)
931 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << ccap_string(issued
)
932 << " ctime " << ctime
<< " mtime " << mtime
<< dendl
;
934 if (time_warp_seq
> in
->time_warp_seq
)
935 ldout(cct
, 10) << " mds time_warp_seq " << time_warp_seq
936 << " is higher than local time_warp_seq "
937 << in
->time_warp_seq
<< dendl
;
940 // be careful with size, mtime, atime
941 if (issued
& (CEPH_CAP_FILE_EXCL
|
943 CEPH_CAP_FILE_BUFFER
|
945 CEPH_CAP_XATTR_EXCL
)) {
946 ldout(cct
, 30) << "Yay have enough caps to look at our times" << dendl
;
947 if (ctime
> in
->ctime
)
949 if (time_warp_seq
> in
->time_warp_seq
) {
950 //the mds updated times, so take those!
953 in
->time_warp_seq
= time_warp_seq
;
954 } else if (time_warp_seq
== in
->time_warp_seq
) {
956 if (mtime
> in
->mtime
)
958 if (atime
> in
->atime
)
960 } else if (issued
& CEPH_CAP_FILE_EXCL
) {
961 //ignore mds values as we have a higher seq
964 ldout(cct
, 30) << "Don't have enough caps, just taking mds' time values" << dendl
;
965 if (time_warp_seq
>= in
->time_warp_seq
) {
969 in
->time_warp_seq
= time_warp_seq
;
973 ldout(cct
, 0) << "WARNING: " << *in
<< " mds time_warp_seq "
974 << time_warp_seq
<< " is lower than local time_warp_seq "
980 void Client::_fragmap_remove_non_leaves(Inode
*in
)
982 for (map
<frag_t
,int>::iterator p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
983 if (!in
->dirfragtree
.is_leaf(p
->first
))
984 in
->fragmap
.erase(p
++);
989 void Client::_fragmap_remove_stopped_mds(Inode
*in
, mds_rank_t mds
)
991 for (auto p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
992 if (p
->second
== mds
)
993 in
->fragmap
.erase(p
++);
998 Inode
* Client::add_update_inode(InodeStat
*st
, utime_t from
,
999 MetaSession
*session
,
1000 const UserPerm
& request_perms
)
1003 bool was_new
= false;
1004 if (inode_map
.count(st
->vino
)) {
1005 in
= inode_map
[st
->vino
];
1006 ldout(cct
, 12) << __func__
<< " had " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
1008 in
= new Inode(this, st
->vino
, &st
->layout
);
1009 inode_map
[st
->vino
] = in
;
1011 if (use_faked_inos())
1012 _assign_faked_ino(in
);
1016 if (use_faked_inos())
1017 _assign_faked_root(root
.get());
1020 } else if (is_mounting()) {
1021 root_parents
[root_ancestor
] = in
;
1026 in
->ino
= st
->vino
.ino
;
1027 in
->snapid
= st
->vino
.snapid
;
1028 in
->mode
= st
->mode
& S_IFMT
;
1032 in
->rdev
= st
->rdev
;
1033 if (in
->is_symlink())
1034 in
->symlink
= st
->symlink
;
1036 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
1037 bool new_version
= false;
1038 if (in
->version
== 0 ||
1039 ((st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
) &&
1040 (in
->version
& ~1) < st
->version
))
1044 in
->caps_issued(&issued
);
1045 issued
|= in
->caps_dirty();
1046 int new_issued
= ~issued
& (int)st
->cap
.caps
;
1048 if ((new_version
|| (new_issued
& CEPH_CAP_AUTH_SHARED
)) &&
1049 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
1050 in
->mode
= st
->mode
;
1053 in
->btime
= st
->btime
;
1054 in
->snap_btime
= st
->snap_btime
;
1055 in
->snap_metadata
= st
->snap_metadata
;
1058 if ((new_version
|| (new_issued
& CEPH_CAP_LINK_SHARED
)) &&
1059 !(issued
& CEPH_CAP_LINK_EXCL
)) {
1060 in
->nlink
= st
->nlink
;
1063 if (new_version
|| (new_issued
& CEPH_CAP_ANY_RD
)) {
1064 update_inode_file_time(in
, issued
, st
->time_warp_seq
,
1065 st
->ctime
, st
->mtime
, st
->atime
);
1069 (new_issued
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
))) {
1070 in
->layout
= st
->layout
;
1071 update_inode_file_size(in
, issued
, st
->size
, st
->truncate_seq
, st
->truncate_size
);
1075 if (new_version
|| (new_issued
& CEPH_CAP_FILE_SHARED
)) {
1076 in
->dirstat
= st
->dirstat
;
1078 // dir_layout/rstat/quota are not tracked by capability, update them only if
1079 // the inode stat is from auth mds
1080 if (new_version
|| (st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
)) {
1081 in
->dir_layout
= st
->dir_layout
;
1082 ldout(cct
, 20) << " dir hash is " << (int)in
->dir_layout
.dl_dir_hash
<< dendl
;
1083 in
->rstat
= st
->rstat
;
1084 in
->quota
= st
->quota
;
1085 in
->dir_pin
= st
->dir_pin
;
1087 // move me if/when version reflects fragtree changes.
1088 if (in
->dirfragtree
!= st
->dirfragtree
) {
1089 in
->dirfragtree
= st
->dirfragtree
;
1090 _fragmap_remove_non_leaves(in
);
1094 if ((in
->xattr_version
== 0 || !(issued
& CEPH_CAP_XATTR_EXCL
)) &&
1095 st
->xattrbl
.length() &&
1096 st
->xattr_version
> in
->xattr_version
) {
1097 auto p
= st
->xattrbl
.cbegin();
1098 decode(in
->xattrs
, p
);
1099 in
->xattr_version
= st
->xattr_version
;
1102 if (st
->inline_version
> in
->inline_version
) {
1103 in
->inline_data
= st
->inline_data
;
1104 in
->inline_version
= st
->inline_version
;
1107 /* always take a newer change attr */
1108 if (st
->change_attr
> in
->change_attr
)
1109 in
->change_attr
= st
->change_attr
;
1111 if (st
->version
> in
->version
)
1112 in
->version
= st
->version
;
1115 ldout(cct
, 12) << __func__
<< " adding " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
1118 return in
; // as with readdir returning indoes in different snaprealms (no caps!)
1120 if (in
->snapid
== CEPH_NOSNAP
) {
1121 add_update_cap(in
, session
, st
->cap
.cap_id
, st
->cap
.caps
, st
->cap
.wanted
,
1122 st
->cap
.seq
, st
->cap
.mseq
, inodeno_t(st
->cap
.realm
),
1123 st
->cap
.flags
, request_perms
);
1124 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
1125 in
->max_size
= st
->max_size
;
1126 in
->rstat
= st
->rstat
;
1129 // setting I_COMPLETE needs to happen after adding the cap
1131 (st
->cap
.caps
& CEPH_CAP_FILE_SHARED
) &&
1132 (issued
& CEPH_CAP_FILE_EXCL
) == 0 &&
1133 in
->dirstat
.nfiles
== 0 &&
1134 in
->dirstat
.nsubdirs
== 0) {
1135 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in
<< dendl
;
1136 in
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
1138 ldout(cct
, 10) << " dir is open on empty dir " << in
->ino
<< " with "
1139 << in
->dir
->dentries
.size() << " entries, marking all dentries null" << dendl
;
1140 in
->dir
->readdir_cache
.clear();
1141 for (const auto& p
: in
->dir
->dentries
) {
1142 unlink(p
.second
, true, true); // keep dir, keep dentry
1144 if (in
->dir
->dentries
.empty())
1149 in
->snap_caps
|= st
->cap
.caps
;
1152 in
->fscrypt
= st
->fscrypt
;
1158 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
1160 Dentry
*Client::insert_dentry_inode(Dir
*dir
, const string
& dname
, LeaseStat
*dlease
,
1161 Inode
*in
, utime_t from
, MetaSession
*session
,
1165 if (dir
->dentries
.count(dname
))
1166 dn
= dir
->dentries
[dname
];
1168 ldout(cct
, 12) << __func__
<< " '" << dname
<< "' vino " << in
->vino()
1169 << " in dir " << dir
->parent_inode
->vino() << " dn " << dn
1172 if (dn
&& dn
->inode
) {
1173 if (dn
->inode
->vino() == in
->vino()) {
1175 ldout(cct
, 12) << " had dentry " << dname
1176 << " with correct vino " << dn
->inode
->vino()
1179 ldout(cct
, 12) << " had dentry " << dname
1180 << " with WRONG vino " << dn
->inode
->vino()
1182 unlink(dn
, true, true); // keep dir, keep dentry
1186 if (!dn
|| !dn
->inode
) {
1187 InodeRef
tmp_ref(in
);
1189 if (old_dentry
->dir
!= dir
) {
1190 Inode
*old_diri
= old_dentry
->dir
->parent_inode
;
1191 clear_dir_complete_and_ordered(old_diri
, false);
1193 unlink(old_dentry
, dir
== old_dentry
->dir
, false); // drop dentry, keep dir open if its the same dir
1195 Inode
*diri
= dir
->parent_inode
;
1196 clear_dir_complete_and_ordered(diri
, false);
1197 dn
= link(dir
, dname
, in
, dn
);
1200 update_dentry_lease(dn
, dlease
, from
, session
);
1204 void Client::update_dentry_lease(Dentry
*dn
, LeaseStat
*dlease
, utime_t from
, MetaSession
*session
)
1206 utime_t dttl
= from
;
1207 dttl
+= (float)dlease
->duration_ms
/ 1000.0;
1209 ldout(cct
, 15) << __func__
<< " " << *dn
<< " " << *dlease
<< " from " << from
<< dendl
;
1213 if (dlease
->mask
& CEPH_LEASE_VALID
) {
1214 if (dttl
> dn
->lease_ttl
) {
1215 ldout(cct
, 10) << "got dentry lease on " << dn
->name
1216 << " dur " << dlease
->duration_ms
<< "ms ttl " << dttl
<< dendl
;
1217 dn
->lease_ttl
= dttl
;
1218 dn
->lease_mds
= session
->mds_num
;
1219 dn
->lease_seq
= dlease
->seq
;
1220 dn
->lease_gen
= session
->cap_gen
;
1223 dn
->cap_shared_gen
= dn
->dir
->parent_inode
->shared_gen
;
1224 if (dlease
->mask
& CEPH_LEASE_PRIMARY_LINK
)
1226 dn
->alternate_name
= std::move(dlease
->alternate_name
);
1231 * update MDS location cache for a single inode
1233 void Client::update_dir_dist(Inode
*in
, DirStat
*dst
, mds_rank_t from
)
1236 ldout(cct
, 20) << "got dirfrag map for " << in
->ino
<< " frag " << dst
->frag
<< " to mds " << dst
->auth
<< dendl
;
1237 if (dst
->auth
>= 0) {
1238 in
->fragmap
[dst
->frag
] = dst
->auth
;
1240 in
->fragmap
.erase(dst
->frag
);
1242 if (!in
->dirfragtree
.is_leaf(dst
->frag
)) {
1243 in
->dirfragtree
.force_to_leaf(cct
, dst
->frag
);
1244 _fragmap_remove_non_leaves(in
);
1247 // replicated, only update from auth mds reply
1248 if (from
== dst
->auth
) {
1249 in
->dir_replicated
= !dst
->dist
.empty();
1250 if (!dst
->dist
.empty())
1251 in
->frag_repmap
[dst
->frag
].assign(dst
->dist
.begin(), dst
->dist
.end()) ;
1253 in
->frag_repmap
.erase(dst
->frag
);
1257 void Client::clear_dir_complete_and_ordered(Inode
*diri
, bool complete
)
1260 diri
->dir_release_count
++;
1262 diri
->dir_ordered_count
++;
1263 if (diri
->flags
& I_COMPLETE
) {
1265 ldout(cct
, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
1266 diri
->flags
&= ~(I_COMPLETE
| I_DIR_ORDERED
);
1268 if (diri
->flags
& I_DIR_ORDERED
) {
1269 ldout(cct
, 10) << " clearing I_DIR_ORDERED on " << *diri
<< dendl
;
1270 diri
->flags
&= ~I_DIR_ORDERED
;
1274 diri
->dir
->readdir_cache
.clear();
1279 * insert results from readdir or lssnap into the metadata cache.
1281 void Client::insert_readdir_results(MetaRequest
*request
, MetaSession
*session
, Inode
*diri
) {
1283 auto& reply
= request
->reply
;
1284 ConnectionRef con
= request
->reply
->get_connection();
1286 if(session
->mds_features
.test(CEPHFS_FEATURE_REPLY_ENCODING
)) {
1287 features
= (uint64_t)-1;
1290 features
= con
->get_features();
1293 dir_result_t
*dirp
= request
->dirp
;
1296 // the extra buffer list is only set for readdir and lssnap replies
1297 auto p
= reply
->get_extra_bl().cbegin();
1300 if (request
->head
.op
== CEPH_MDS_OP_LSSNAP
) {
1302 diri
= open_snapdir(diri
);
1305 // only open dir if we're actually adding stuff to it!
1306 Dir
*dir
= diri
->open_dir();
1310 DirStat
dst(p
, features
);
1316 bool end
= ((unsigned)flags
& CEPH_READDIR_FRAG_END
);
1317 bool hash_order
= ((unsigned)flags
& CEPH_READDIR_HASH_ORDER
);
1319 frag_t fg
= (unsigned)request
->head
.args
.readdir
.frag
;
1320 unsigned readdir_offset
= dirp
->next_offset
;
1321 string readdir_start
= dirp
->last_name
;
1322 ceph_assert(!readdir_start
.empty() || readdir_offset
== 2);
1324 unsigned last_hash
= 0;
1326 if (!readdir_start
.empty()) {
1327 last_hash
= ceph_frag_value(diri
->hash_dentry_name(readdir_start
));
1328 } else if (flags
& CEPH_READDIR_OFFSET_HASH
) {
1329 /* mds understands offset_hash */
1330 last_hash
= (unsigned)request
->head
.args
.readdir
.offset_hash
;
1334 if (fg
!= dst
.frag
) {
1335 ldout(cct
, 10) << "insert_trace got new frag " << fg
<< " -> " << dst
.frag
<< dendl
;
1339 readdir_start
.clear();
1340 dirp
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
, false);
1344 ldout(cct
, 10) << __func__
<< " " << numdn
<< " readdir items, end=" << end
1345 << ", hash_order=" << hash_order
1346 << ", readdir_start " << readdir_start
1347 << ", last_hash " << last_hash
1348 << ", next_offset " << readdir_offset
<< dendl
;
1350 if (diri
->snapid
!= CEPH_SNAPDIR
&&
1351 fg
.is_leftmost() && readdir_offset
== 2 &&
1352 !(hash_order
&& last_hash
)) {
1353 dirp
->release_count
= diri
->dir_release_count
;
1354 dirp
->ordered_count
= diri
->dir_ordered_count
;
1355 dirp
->start_shared_gen
= diri
->shared_gen
;
1356 dirp
->cache_index
= 0;
1359 dirp
->buffer_frag
= fg
;
1361 _readdir_drop_dirp_buffer(dirp
);
1362 dirp
->buffer
.reserve(numdn
);
1366 for (unsigned i
=0; i
<numdn
; i
++) {
1368 dlease
.decode(p
, features
);
1369 InodeStat
ist(p
, features
);
1371 ldout(cct
, 15) << "" << i
<< ": '" << dname
<< "'" << dendl
;
1373 Inode
*in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1376 if (diri
->dir
->dentries
.count(dname
)) {
1377 Dentry
*olddn
= diri
->dir
->dentries
[dname
];
1378 if (olddn
->inode
!= in
) {
1379 // replace incorrect dentry
1380 unlink(olddn
, true, true); // keep dir, dentry
1381 dn
= link(dir
, dname
, in
, olddn
);
1382 ceph_assert(dn
== olddn
);
1390 dn
= link(dir
, dname
, in
, NULL
);
1392 dn
->alternate_name
= std::move(dlease
.alternate_name
);
1394 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
1396 unsigned hash
= ceph_frag_value(diri
->hash_dentry_name(dname
));
1397 if (hash
!= last_hash
)
1400 dn
->offset
= dir_result_t::make_fpos(hash
, readdir_offset
++, true);
1402 dn
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
++, false);
1404 // add to readdir cache
1405 if (dirp
->release_count
== diri
->dir_release_count
&&
1406 dirp
->ordered_count
== diri
->dir_ordered_count
&&
1407 dirp
->start_shared_gen
== diri
->shared_gen
) {
1408 if (dirp
->cache_index
== dir
->readdir_cache
.size()) {
1410 ceph_assert(!dirp
->inode
->is_complete_and_ordered());
1411 dir
->readdir_cache
.reserve(dirp
->cache_index
+ numdn
);
1413 dir
->readdir_cache
.push_back(dn
);
1414 } else if (dirp
->cache_index
< dir
->readdir_cache
.size()) {
1415 if (dirp
->inode
->is_complete_and_ordered())
1416 ceph_assert(dir
->readdir_cache
[dirp
->cache_index
] == dn
);
1418 dir
->readdir_cache
[dirp
->cache_index
] = dn
;
1420 ceph_abort_msg("unexpected readdir buffer idx");
1422 dirp
->cache_index
++;
1424 // add to cached result list
1425 dirp
->buffer
.push_back(dir_result_t::dentry(dn
->offset
, dname
, dn
->alternate_name
, in
));
1426 ldout(cct
, 15) << __func__
<< " " << hex
<< dn
->offset
<< dec
<< ": '" << dname
<< "' -> " << in
->ino
<< dendl
;
1430 dirp
->last_name
= dname
;
1432 dirp
->next_offset
= 2;
1434 dirp
->next_offset
= readdir_offset
;
1436 if (dir
->is_empty())
1443 * insert a trace from a MDS reply into the cache.
1445 Inode
* Client::insert_trace(MetaRequest
*request
, MetaSession
*session
)
1447 auto& reply
= request
->reply
;
1448 int op
= request
->get_op();
1450 ldout(cct
, 10) << "insert_trace from " << request
->sent_stamp
<< " mds." << session
->mds_num
1451 << " is_target=" << (int)reply
->head
.is_target
1452 << " is_dentry=" << (int)reply
->head
.is_dentry
1455 auto p
= reply
->get_trace_bl().cbegin();
1456 if (request
->got_unsafe
) {
1457 ldout(cct
, 10) << "insert_trace -- already got unsafe; ignoring" << dendl
;
1458 ceph_assert(p
.end());
1463 ldout(cct
, 10) << "insert_trace -- no trace" << dendl
;
1465 Dentry
*d
= request
->dentry();
1467 Inode
*diri
= d
->dir
->parent_inode
;
1468 clear_dir_complete_and_ordered(diri
, true);
1471 if (d
&& reply
->get_result() == 0) {
1472 if (op
== CEPH_MDS_OP_RENAME
) {
1474 Dentry
*od
= request
->old_dentry();
1475 ldout(cct
, 10) << " unlinking rename src dn " << od
<< " for traceless reply" << dendl
;
1477 unlink(od
, true, true); // keep dir, dentry
1478 } else if (op
== CEPH_MDS_OP_RMDIR
||
1479 op
== CEPH_MDS_OP_UNLINK
) {
1481 ldout(cct
, 10) << " unlinking unlink/rmdir dn " << d
<< " for traceless reply" << dendl
;
1482 unlink(d
, true, true); // keep dir, dentry
1488 ConnectionRef con
= request
->reply
->get_connection();
1490 if (session
->mds_features
.test(CEPHFS_FEATURE_REPLY_ENCODING
)) {
1491 features
= (uint64_t)-1;
1494 features
= con
->get_features();
1496 ldout(cct
, 10) << " features 0x" << hex
<< features
<< dec
<< dendl
;
1499 SnapRealm
*realm
= NULL
;
1500 if (reply
->snapbl
.length())
1501 update_snap_trace(reply
->snapbl
, &realm
);
1503 ldout(cct
, 10) << " hrm "
1504 << " is_target=" << (int)reply
->head
.is_target
1505 << " is_dentry=" << (int)reply
->head
.is_dentry
1514 if (reply
->head
.is_dentry
) {
1515 dirst
.decode(p
, features
);
1516 dst
.decode(p
, features
);
1518 dlease
.decode(p
, features
);
1522 if (reply
->head
.is_target
) {
1523 ist
.decode(p
, features
);
1524 if (cct
->_conf
->client_debug_getattr_caps
) {
1525 unsigned wanted
= 0;
1526 if (op
== CEPH_MDS_OP_GETATTR
|| op
== CEPH_MDS_OP_LOOKUP
)
1527 wanted
= request
->head
.args
.getattr
.mask
;
1528 else if (op
== CEPH_MDS_OP_OPEN
|| op
== CEPH_MDS_OP_CREATE
)
1529 wanted
= request
->head
.args
.open
.mask
;
1531 if ((wanted
& CEPH_CAP_XATTR_SHARED
) &&
1532 !(ist
.xattr_version
> 0 && ist
.xattrbl
.length() > 0))
1533 ceph_abort_msg("MDS reply does not contain xattrs");
1536 in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1541 if (reply
->head
.is_dentry
) {
1542 diri
= add_update_inode(&dirst
, request
->sent_stamp
, session
,
1544 mds_rank_t from_mds
= mds_rank_t(reply
->get_source().num());
1545 update_dir_dist(diri
, &dst
, from_mds
); // dir stat info is attached to ..
1548 Dir
*dir
= diri
->open_dir();
1549 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
,
1550 (op
== CEPH_MDS_OP_RENAME
) ? request
->old_dentry() : NULL
);
1553 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1554 dn
= diri
->dir
->dentries
[dname
];
1556 clear_dir_complete_and_ordered(diri
, false);
1557 unlink(dn
, true, true); // keep dir, dentry
1560 if (dlease
.duration_ms
> 0) {
1562 Dir
*dir
= diri
->open_dir();
1563 dn
= link(dir
, dname
, NULL
, NULL
);
1565 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
1568 } else if (op
== CEPH_MDS_OP_LOOKUPSNAP
||
1569 op
== CEPH_MDS_OP_MKSNAP
) {
1570 ldout(cct
, 10) << " faking snap lookup weirdness" << dendl
;
1571 // fake it for snap lookup
1572 vinodeno_t vino
= ist
.vino
;
1573 vino
.snapid
= CEPH_SNAPDIR
;
1574 ceph_assert(inode_map
.count(vino
));
1575 diri
= inode_map
[vino
];
1577 string dname
= request
->path
.last_dentry();
1580 dlease
.duration_ms
= 0;
1583 Dir
*dir
= diri
->open_dir();
1584 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
);
1586 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1587 Dentry
*dn
= diri
->dir
->dentries
[dname
];
1589 unlink(dn
, true, true); // keep dir, dentry
1595 if (op
== CEPH_MDS_OP_READDIR
||
1596 op
== CEPH_MDS_OP_LSSNAP
) {
1597 insert_readdir_results(request
, session
, in
);
1598 } else if (op
== CEPH_MDS_OP_LOOKUPNAME
) {
1599 // hack: return parent inode instead
1603 if (request
->dentry() == NULL
&& in
!= request
->inode()) {
1604 // pin the target inode if its parent dentry is not pinned
1605 request
->set_other_inode(in
);
1610 put_snap_realm(realm
);
1612 request
->target
= in
;
1618 mds_rank_t
Client::choose_target_mds(MetaRequest
*req
, Inode
** phash_diri
)
1620 mds_rank_t mds
= MDS_RANK_NONE
;
1622 bool is_hash
= false;
1628 if (req
->resend_mds
>= 0) {
1629 mds
= req
->resend_mds
;
1630 req
->resend_mds
= -1;
1631 ldout(cct
, 10) << __func__
<< " resend_mds specified as mds." << mds
<< dendl
;
1635 if (cct
->_conf
->client_use_random_mds
)
1641 ldout(cct
, 20) << __func__
<< " starting with req->inode " << *in
<< dendl
;
1642 if (req
->path
.depth()) {
1643 hash
= in
->hash_dentry_name(req
->path
[0]);
1644 ldout(cct
, 20) << __func__
<< " inode dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1645 << " on " << req
->path
[0]
1646 << " => " << hash
<< dendl
;
1651 in
= de
->inode
.get();
1652 ldout(cct
, 20) << __func__
<< " starting with req->dentry inode " << *in
<< dendl
;
1654 in
= de
->dir
->parent_inode
;
1655 hash
= in
->hash_dentry_name(de
->name
);
1656 ldout(cct
, 20) << __func__
<< " dentry dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1657 << " on " << de
->name
1658 << " => " << hash
<< dendl
;
1663 if (in
->snapid
!= CEPH_NOSNAP
) {
1664 ldout(cct
, 10) << __func__
<< " " << *in
<< " is snapped, using nonsnap parent" << dendl
;
1665 while (in
->snapid
!= CEPH_NOSNAP
) {
1666 if (in
->snapid
== CEPH_SNAPDIR
)
1667 in
= in
->snapdir_parent
.get();
1668 else if (!in
->dentries
.empty())
1669 /* In most cases there will only be one dentry, so getting it
1670 * will be the correct action. If there are multiple hard links,
1671 * I think the MDS should be able to redirect as needed*/
1672 in
= in
->get_first_parent()->dir
->parent_inode
;
1674 ldout(cct
, 10) << "got unlinked inode, can't look at parent" << dendl
;
1681 ldout(cct
, 20) << __func__
<< " " << *in
<< " is_hash=" << is_hash
1682 << " hash=" << hash
<< dendl
;
1684 if (req
->get_op() == CEPH_MDS_OP_GETATTR
)
1685 issued
= req
->inode()->caps_issued();
1687 if (is_hash
&& S_ISDIR(in
->mode
) && (!in
->fragmap
.empty() || !in
->frag_repmap
.empty())) {
1688 frag_t fg
= in
->dirfragtree
[hash
];
1689 if (!req
->auth_is_best(issued
)) {
1690 auto repmapit
= in
->frag_repmap
.find(fg
);
1691 if (repmapit
!= in
->frag_repmap
.end()) {
1692 auto& repmap
= repmapit
->second
;
1693 auto r
= ceph::util::generate_random_number
<uint64_t>(0, repmap
.size()-1);
1696 } else if (in
->fragmap
.count(fg
)) {
1697 mds
= in
->fragmap
[fg
];
1700 } else if (in
->auth_cap
) {
1701 req
->send_to_auth
= true;
1702 mds
= in
->auth_cap
->session
->mds_num
;
1705 ldout(cct
, 10) << __func__
<< " from dirfragtree hash" << dendl
;
1710 if (in
->auth_cap
&& req
->auth_is_best(issued
)) {
1711 mds
= in
->auth_cap
->session
->mds_num
;
1712 } else if (!in
->caps
.empty()) {
1713 mds
= in
->caps
.begin()->second
.session
->mds_num
;
1717 ldout(cct
, 10) << __func__
<< " from caps on inode " << *in
<< dendl
;
1724 mds
= _get_random_up_mds();
1725 ldout(cct
, 10) << "did not get mds through better means, so chose random mds " << mds
<< dendl
;
1729 ldout(cct
, 20) << "mds is " << mds
<< dendl
;
1733 void Client::connect_mds_targets(mds_rank_t mds
)
1735 ldout(cct
, 10) << __func__
<< " for mds." << mds
<< dendl
;
1736 ceph_assert(mds_sessions
.count(mds
));
1737 const MDSMap::mds_info_t
& info
= mdsmap
->get_mds_info(mds
);
1738 for (const auto &rank
: info
.export_targets
) {
1739 if (mds_sessions
.count(rank
) == 0 &&
1740 mdsmap
->is_clientreplay_or_active_or_stopping(rank
)) {
1741 ldout(cct
, 10) << "check_mds_sessions opening mds." << mds
1742 << " export target mds." << rank
<< dendl
;
1743 _open_mds_session(rank
);
1748 void Client::dump_mds_sessions(Formatter
*f
, bool cap_dump
)
1750 f
->dump_int("id", get_nodeid().v
);
1751 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr_legacy());
1752 f
->dump_object("inst", inst
);
1753 f
->dump_stream("inst_str") << inst
;
1754 f
->dump_stream("addr_str") << inst
.addr
;
1755 f
->open_array_section("sessions");
1756 for (const auto &p
: mds_sessions
) {
1757 f
->open_object_section("session");
1758 p
.second
->dump(f
, cap_dump
);
1762 f
->dump_int("mdsmap_epoch", mdsmap
->get_epoch());
1765 void Client::dump_mds_requests(Formatter
*f
)
1767 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
1768 p
!= mds_requests
.end();
1770 f
->open_object_section("request");
1776 int Client::verify_reply_trace(int r
, MetaSession
*session
,
1777 MetaRequest
*request
, const MConstRef
<MClientReply
>& reply
,
1778 InodeRef
*ptarget
, bool *pcreated
,
1779 const UserPerm
& perms
)
1781 // check whether this request actually did the create, and set created flag
1782 bufferlist extra_bl
;
1783 inodeno_t created_ino
;
1784 bool got_created_ino
= false;
1785 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
;
1787 extra_bl
= reply
->get_extra_bl();
1788 if (extra_bl
.length() >= 8) {
1789 if (session
->mds_features
.test(CEPHFS_FEATURE_DELEG_INO
)) {
1790 struct openc_response_t ocres
;
1792 decode(ocres
, extra_bl
);
1793 created_ino
= ocres
.created_ino
;
1795 * The userland cephfs client doesn't have a way to do an async create
1796 * (yet), so just discard delegated_inos for now. Eventually we should
1797 * store them and use them in create calls, even if they are synchronous,
1798 * if only for testing purposes.
1800 ldout(cct
, 10) << "delegated_inos: " << ocres
.delegated_inos
<< dendl
;
1802 // u64 containing number of created ino
1803 decode(created_ino
, extra_bl
);
1805 ldout(cct
, 10) << "make_request created ino " << created_ino
<< dendl
;
1806 got_created_ino
= true;
1810 *pcreated
= got_created_ino
;
1812 if (request
->target
) {
1813 *ptarget
= request
->target
;
1814 ldout(cct
, 20) << "make_request target is " << *ptarget
->get() << dendl
;
1816 if (got_created_ino
&& (p
= inode_map
.find(vinodeno_t(created_ino
, CEPH_NOSNAP
))) != inode_map
.end()) {
1817 (*ptarget
) = p
->second
;
1818 ldout(cct
, 20) << "make_request created, target is " << *ptarget
->get() << dendl
;
1820 // we got a traceless reply, and need to look up what we just
1821 // created. for now, do this by name. someday, do this by the
1822 // ino... which we know! FIXME.
1824 Dentry
*d
= request
->dentry();
1827 ldout(cct
, 10) << "make_request got traceless reply, looking up #"
1828 << d
->dir
->parent_inode
->ino
<< "/" << d
->name
1829 << " got_ino " << got_created_ino
1830 << " ino " << created_ino
1832 r
= _do_lookup(d
->dir
->parent_inode
, d
->name
, request
->regetattr_mask
,
1835 // if the dentry is not linked, just do our best. see #5021.
1836 ceph_abort_msg("how did this happen? i want logs!");
1839 Inode
*in
= request
->inode();
1840 ldout(cct
, 10) << "make_request got traceless reply, forcing getattr on #"
1841 << in
->ino
<< dendl
;
1842 r
= _getattr(in
, request
->regetattr_mask
, perms
, true);
1846 // verify ino returned in reply and trace_dist are the same
1847 if (got_created_ino
&&
1848 created_ino
.val
!= target
->ino
.val
) {
1849 ldout(cct
, 5) << "create got ino " << created_ino
<< " but then failed on lookup; EINTR?" << dendl
;
1853 ptarget
->swap(target
);
1865 * Blocking helper to make an MDS request.
1867 * If the ptarget flag is set, behavior changes slightly: the caller
1868 * expects to get a pointer to the inode we are creating or operating
1869 * on. As a result, we will follow up any traceless mutation reply
1870 * with a getattr or lookup to transparently handle a traceless reply
1871 * from the MDS (as when the MDS restarts and the client has to replay
1874 * @param request the MetaRequest to execute
1875 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1876 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1877 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1878 * @param use_mds [optional] prefer a specific mds (-1 for default)
1879 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1881 int Client::make_request(MetaRequest
*request
,
1882 const UserPerm
& perms
,
1883 InodeRef
*ptarget
, bool *pcreated
,
1889 // assign a unique tid
1890 ceph_tid_t tid
= ++last_tid
;
1891 request
->set_tid(tid
);
1894 request
->op_stamp
= ceph_clock_now();
1895 request
->created
= ceph::coarse_mono_clock::now();
1898 mds_requests
[tid
] = request
->get();
1899 if (oldest_tid
== 0 && request
->get_op() != CEPH_MDS_OP_SETFILELOCK
)
1902 request
->set_caller_perms(perms
);
1904 if (cct
->_conf
->client_inject_fixed_oldest_tid
) {
1905 ldout(cct
, 20) << __func__
<< " injecting fixed oldest_client_tid(1)" << dendl
;
1906 request
->set_oldest_client_tid(1);
1908 request
->set_oldest_client_tid(oldest_tid
);
1913 request
->resend_mds
= use_mds
;
1915 MetaSessionRef session
= NULL
;
1917 if (request
->aborted())
1921 request
->abort(-CEPHFS_EBLOCKLISTED
);
1926 ceph::condition_variable caller_cond
;
1927 request
->caller_cond
= &caller_cond
;
1930 Inode
*hash_diri
= NULL
;
1931 mds_rank_t mds
= choose_target_mds(request
, &hash_diri
);
1932 int mds_state
= (mds
== MDS_RANK_NONE
) ? MDSMap::STATE_NULL
: mdsmap
->get_state(mds
);
1933 if (mds_state
!= MDSMap::STATE_ACTIVE
&& mds_state
!= MDSMap::STATE_STOPPING
) {
1934 if (mds_state
== MDSMap::STATE_NULL
&& mds
>= mdsmap
->get_max_mds()) {
1936 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, remove it from fragmap" << dendl
;
1937 _fragmap_remove_stopped_mds(hash_diri
, mds
);
1939 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, trying a random mds" << dendl
;
1940 request
->resend_mds
= _get_random_up_mds();
1943 ldout(cct
, 10) << " target mds." << mds
<< " not active, waiting for new mdsmap" << dendl
;
1944 wait_on_list(waiting_for_mdsmap
);
1950 if (!have_open_session(mds
)) {
1951 session
= _get_or_open_mds_session(mds
);
1952 if (session
->state
== MetaSession::STATE_REJECTED
) {
1953 request
->abort(-CEPHFS_EPERM
);
1957 if (session
->state
== MetaSession::STATE_OPENING
) {
1958 ldout(cct
, 10) << "waiting for session to mds." << mds
<< " to open" << dendl
;
1959 wait_on_context_list(session
->waiting_for_open
);
1963 if (!have_open_session(mds
))
1966 session
= mds_sessions
.at(mds
);
1970 send_request(request
, session
.get());
1973 ldout(cct
, 20) << "awaiting reply|forward|kick on " << &caller_cond
<< dendl
;
1974 request
->kick
= false;
1975 std::unique_lock l
{client_lock
, std::adopt_lock
};
1976 caller_cond
.wait(l
, [request
] {
1977 return (request
->reply
|| // reply
1978 request
->resend_mds
>= 0 || // forward
1982 request
->caller_cond
= nullptr;
1984 // did we get a reply?
1989 if (!request
->reply
) {
1990 ceph_assert(request
->aborted());
1991 ceph_assert(!request
->got_unsafe
);
1992 r
= request
->get_abort_code();
1993 request
->item
.remove_myself();
1994 unregister_request(request
);
1995 put_request(request
);
2000 auto reply
= std::move(request
->reply
);
2001 r
= reply
->get_result();
2003 request
->success
= true;
2005 // kick dispatcher (we've got it!)
2006 ceph_assert(request
->dispatch_cond
);
2007 request
->dispatch_cond
->notify_all();
2008 ldout(cct
, 20) << "sendrecv kickback on tid " << tid
<< " " << request
->dispatch_cond
<< dendl
;
2009 request
->dispatch_cond
= 0;
2011 if (r
>= 0 && ptarget
)
2012 r
= verify_reply_trace(r
, session
.get(), request
, reply
, ptarget
, pcreated
, perms
);
2015 *pdirbl
= reply
->get_extra_bl();
2018 utime_t lat
= ceph_clock_now();
2019 lat
-= request
->sent_stamp
;
2020 ldout(cct
, 20) << "lat " << lat
<< dendl
;
2022 ++nr_metadata_request
;
2023 update_io_stat_metadata(lat
);
2025 put_request(request
);
2029 void Client::unregister_request(MetaRequest
*req
)
2031 mds_requests
.erase(req
->tid
);
2032 if (req
->tid
== oldest_tid
) {
2033 map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.upper_bound(oldest_tid
);
2035 if (p
== mds_requests
.end()) {
2039 if (p
->second
->get_op() != CEPH_MDS_OP_SETFILELOCK
) {
2040 oldest_tid
= p
->first
;
2049 void Client::put_request(MetaRequest
*request
)
2051 if (request
->_put()) {
2053 if (request
->success
)
2054 op
= request
->get_op();
2056 request
->take_other_inode(&other_in
);
2060 (op
== CEPH_MDS_OP_RMDIR
||
2061 op
== CEPH_MDS_OP_RENAME
||
2062 op
== CEPH_MDS_OP_RMSNAP
)) {
2063 _try_to_trim_inode(other_in
.get(), false);
2068 int Client::encode_inode_release(Inode
*in
, MetaRequest
*req
,
2069 mds_rank_t mds
, int drop
,
2070 int unless
, int force
)
2072 ldout(cct
, 20) << __func__
<< " enter(in:" << *in
<< ", req:" << req
2073 << " mds:" << mds
<< ", drop:" << ccap_string(drop
) << ", unless:" << ccap_string(unless
)
2074 << ", force:" << force
<< ")" << dendl
;
2076 auto it
= in
->caps
.find(mds
);
2077 if (it
!= in
->caps
.end()) {
2078 Cap
&cap
= it
->second
;
2079 drop
&= ~(in
->dirty_caps
| get_caps_used(in
));
2080 if ((drop
& cap
.issued
) &&
2081 !(unless
& cap
.issued
)) {
2082 ldout(cct
, 25) << "dropping caps " << ccap_string(drop
) << dendl
;
2083 cap
.issued
&= ~drop
;
2084 cap
.implemented
&= ~drop
;
2090 cap
.wanted
= in
->caps_wanted();
2091 if (&cap
== in
->auth_cap
&&
2092 !(cap
.wanted
& CEPH_CAP_ANY_FILE_WR
)) {
2093 in
->requested_max_size
= 0;
2094 ldout(cct
, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl
;
2096 ceph_mds_request_release rel
;
2098 rel
.cap_id
= cap
.cap_id
;
2100 rel
.issue_seq
= cap
.issue_seq
;
2101 rel
.mseq
= cap
.mseq
;
2102 rel
.caps
= cap
.implemented
;
2103 rel
.wanted
= cap
.wanted
;
2106 req
->cap_releases
.push_back(MClientRequest::Release(rel
,""));
2109 ldout(cct
, 25) << __func__
<< " exit(in:" << *in
<< ") released:"
2110 << released
<< dendl
;
2114 void Client::encode_dentry_release(Dentry
*dn
, MetaRequest
*req
,
2115 mds_rank_t mds
, int drop
, int unless
)
2117 ldout(cct
, 20) << __func__
<< " enter(dn:"
2118 << dn
<< ")" << dendl
;
2121 released
= encode_inode_release(dn
->dir
->parent_inode
, req
,
2122 mds
, drop
, unless
, 1);
2123 if (released
&& dn
->lease_mds
== mds
) {
2124 ldout(cct
, 25) << "preemptively releasing dn to mds" << dendl
;
2125 auto& rel
= req
->cap_releases
.back();
2126 rel
.item
.dname_len
= dn
->name
.length();
2127 rel
.item
.dname_seq
= dn
->lease_seq
;
2128 rel
.dname
= dn
->name
;
2131 ldout(cct
, 25) << __func__
<< " exit(dn:"
2132 << dn
<< ")" << dendl
;
2137 * This requires the MClientRequest *request member to be set.
2138 * It will error out horribly without one.
2139 * Additionally, if you set any *drop member, you'd better have
2140 * set the corresponding dentry!
2142 void Client::encode_cap_releases(MetaRequest
*req
, mds_rank_t mds
)
2144 ldout(cct
, 20) << __func__
<< " enter (req: "
2145 << req
<< ", mds: " << mds
<< ")" << dendl
;
2146 if (req
->inode_drop
&& req
->inode())
2147 encode_inode_release(req
->inode(), req
,
2148 mds
, req
->inode_drop
,
2151 if (req
->old_inode_drop
&& req
->old_inode())
2152 encode_inode_release(req
->old_inode(), req
,
2153 mds
, req
->old_inode_drop
,
2154 req
->old_inode_unless
);
2155 if (req
->other_inode_drop
&& req
->other_inode())
2156 encode_inode_release(req
->other_inode(), req
,
2157 mds
, req
->other_inode_drop
,
2158 req
->other_inode_unless
);
2160 if (req
->dentry_drop
&& req
->dentry())
2161 encode_dentry_release(req
->dentry(), req
,
2162 mds
, req
->dentry_drop
,
2163 req
->dentry_unless
);
2165 if (req
->old_dentry_drop
&& req
->old_dentry())
2166 encode_dentry_release(req
->old_dentry(), req
,
2167 mds
, req
->old_dentry_drop
,
2168 req
->old_dentry_unless
);
2169 ldout(cct
, 25) << __func__
<< " exit (req: "
2170 << req
<< ", mds " << mds
<<dendl
;
2173 bool Client::have_open_session(mds_rank_t mds
)
2175 const auto &it
= mds_sessions
.find(mds
);
2176 return it
!= mds_sessions
.end() &&
2177 (it
->second
->state
== MetaSession::STATE_OPEN
||
2178 it
->second
->state
== MetaSession::STATE_STALE
);
2181 MetaSessionRef
Client::_get_mds_session(mds_rank_t mds
, Connection
*con
)
2183 const auto &it
= mds_sessions
.find(mds
);
2184 if (it
== mds_sessions
.end() || it
->second
->con
!= con
) {
2191 MetaSessionRef
Client::_get_or_open_mds_session(mds_rank_t mds
)
2193 auto it
= mds_sessions
.find(mds
);
2194 return it
== mds_sessions
.end() ? _open_mds_session(mds
) : it
->second
;
2198 * Populate a map of strings with client-identifying metadata,
2199 * such as the hostname. Call this once at initialization.
2201 void Client::populate_metadata(const std::string
&mount_root
)
2205 // TODO: move this to compat.h
2207 DWORD hostname_sz
= 64;
2208 GetComputerNameA(hostname
, &hostname_sz
);
2209 metadata
["hostname"] = hostname
;
2214 metadata
["hostname"] = u
.nodename
;
2215 ldout(cct
, 20) << __func__
<< " read hostname '" << u
.nodename
<< "'" << dendl
;
2217 ldout(cct
, 1) << __func__
<< " failed to read hostname (" << cpp_strerror(r
) << ")" << dendl
;
2221 metadata
["pid"] = stringify(getpid());
2223 // Ceph entity id (the '0' in "client.0")
2224 metadata
["entity_id"] = cct
->_conf
->name
.get_id();
2226 // Our mount position
2227 if (!mount_root
.empty()) {
2228 metadata
["root"] = mount_root
;
2232 metadata
["ceph_version"] = pretty_version_to_str();
2233 metadata
["ceph_sha1"] = git_version_to_str();
2235 // Apply any metadata from the user's configured overrides
2236 std::vector
<std::string
> tokens
;
2237 get_str_vec(cct
->_conf
->client_metadata
, ",", tokens
);
2238 for (const auto &i
: tokens
) {
2239 auto eqpos
= i
.find("=");
2240 // Throw out anything that isn't of the form "<str>=<str>"
2241 if (eqpos
== 0 || eqpos
== std::string::npos
|| eqpos
== i
.size()) {
2242 lderr(cct
) << "Invalid metadata keyval pair: '" << i
<< "'" << dendl
;
2245 metadata
[i
.substr(0, eqpos
)] = i
.substr(eqpos
+ 1);
2250 * Optionally add or override client metadata fields.
2252 void Client::update_metadata(std::string
const &k
, std::string
const &v
)
2254 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
2255 ceph_assert(iref_reader
.is_state_satisfied());
2257 std::scoped_lock
l(client_lock
);
2259 auto it
= metadata
.find(k
);
2260 if (it
!= metadata
.end()) {
2261 ldout(cct
, 1) << __func__
<< " warning, overriding metadata field '" << k
2262 << "' from '" << it
->second
<< "' to '" << v
<< "'" << dendl
;
2268 MetaSessionRef
Client::_open_mds_session(mds_rank_t mds
)
2270 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
2271 auto addrs
= mdsmap
->get_addrs(mds
);
2272 auto em
= mds_sessions
.emplace(std::piecewise_construct
,
2273 std::forward_as_tuple(mds
),
2274 std::forward_as_tuple(new MetaSession(mds
, messenger
->connect_to_mds(addrs
), addrs
)));
2275 ceph_assert(em
.second
); /* not already present */
2276 auto session
= em
.first
->second
;
2278 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REQUEST_OPEN
);
2279 m
->metadata
= metadata
;
2280 m
->supported_features
= feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED
);
2281 m
->metric_spec
= feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL
);
2282 session
->con
->send_message2(std::move(m
));
2286 void Client::_close_mds_session(MetaSession
*s
)
2288 ldout(cct
, 2) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2289 s
->state
= MetaSession::STATE_CLOSING
;
2290 s
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
2293 void Client::_closed_mds_session(MetaSession
*s
, int err
, bool rejected
)
2295 ldout(cct
, 5) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2296 if (rejected
&& s
->state
!= MetaSession::STATE_CLOSING
)
2297 s
->state
= MetaSession::STATE_REJECTED
;
2299 s
->state
= MetaSession::STATE_CLOSED
;
2300 s
->con
->mark_down();
2301 signal_context_list(s
->waiting_for_open
);
2302 mount_cond
.notify_all();
2303 remove_session_caps(s
, err
);
2304 kick_requests_closed(s
);
2305 mds_ranks_closing
.erase(s
->mds_num
);
2306 if (s
->state
== MetaSession::STATE_CLOSED
)
2307 mds_sessions
.erase(s
->mds_num
);
2310 void Client::handle_client_session(const MConstRef
<MClientSession
>& m
)
2312 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2313 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << from
<< dendl
;
2315 std::scoped_lock
cl(client_lock
);
2316 auto session
= _get_mds_session(from
, m
->get_connection().get());
2318 ldout(cct
, 10) << " discarding session message from sessionless mds " << m
->get_source_inst() << dendl
;
2322 switch (m
->get_op()) {
2323 case CEPH_SESSION_OPEN
:
2325 feature_bitset_t
missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED
);
2326 missing_features
-= m
->supported_features
;
2327 if (!missing_features
.empty()) {
2328 lderr(cct
) << "mds." << from
<< " lacks required features '"
2329 << missing_features
<< "', closing session " << dendl
;
2330 _close_mds_session(session
.get());
2331 _closed_mds_session(session
.get(), -CEPHFS_EPERM
, true);
2334 session
->mds_features
= std::move(m
->supported_features
);
2335 session
->mds_metric_flags
= std::move(m
->metric_spec
.metric_flags
);
2337 renew_caps(session
.get());
2338 session
->state
= MetaSession::STATE_OPEN
;
2339 if (is_unmounting())
2340 mount_cond
.notify_all();
2342 connect_mds_targets(from
);
2343 signal_context_list(session
->waiting_for_open
);
2347 case CEPH_SESSION_CLOSE
:
2348 _closed_mds_session(session
.get());
2351 case CEPH_SESSION_RENEWCAPS
:
2352 if (session
->cap_renew_seq
== m
->get_seq()) {
2353 bool was_stale
= ceph_clock_now() >= session
->cap_ttl
;
2355 session
->last_cap_renew_request
+ mdsmap
->get_session_timeout();
2357 wake_up_session_caps(session
.get(), false);
2361 case CEPH_SESSION_STALE
:
2362 // invalidate session caps/leases
2364 session
->cap_ttl
= ceph_clock_now();
2365 session
->cap_ttl
-= 1;
2366 renew_caps(session
.get());
2369 case CEPH_SESSION_RECALL_STATE
:
2371 * Call the renew caps and flush cap releases just before
2372 * triming the caps in case the tick() won't get a chance
2373 * to run them, which could cause the client to be blocklisted
2374 * and MDS daemons trying to recall the caps again and
2377 * In most cases it will do nothing, and the new cap releases
2378 * added by trim_caps() followed will be deferred flushing
2381 renew_and_flush_cap_releases();
2382 trim_caps(session
.get(), m
->get_max_caps());
2385 case CEPH_SESSION_FLUSHMSG
:
2386 /* flush cap release */
2387 if (auto& m
= session
->release
; m
) {
2388 session
->con
->send_message2(std::move(m
));
2390 session
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_FLUSHMSG_ACK
, m
->get_seq()));
2393 case CEPH_SESSION_FORCE_RO
:
2394 force_session_readonly(session
.get());
2397 case CEPH_SESSION_REJECT
:
2399 std::string_view error_str
;
2400 auto it
= m
->metadata
.find("error_string");
2401 if (it
!= m
->metadata
.end())
2402 error_str
= it
->second
;
2404 error_str
= "unknown error";
2405 lderr(cct
) << "mds." << from
<< " rejected us (" << error_str
<< ")" << dendl
;
2407 _closed_mds_session(session
.get(), -CEPHFS_EPERM
, true);
2416 bool Client::_any_stale_sessions() const
2418 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
2420 for (const auto &p
: mds_sessions
) {
2421 if (p
.second
->state
== MetaSession::STATE_STALE
) {
2429 void Client::_kick_stale_sessions()
2431 ldout(cct
, 1) << __func__
<< dendl
;
2433 for (auto it
= mds_sessions
.begin(); it
!= mds_sessions
.end(); ) {
2434 auto s
= it
->second
;
2435 if (s
->state
== MetaSession::STATE_REJECTED
) {
2436 mds_sessions
.erase(it
->first
);
2439 if (s
->state
== MetaSession::STATE_STALE
)
2440 _closed_mds_session(s
.get());
2444 void Client::send_request(MetaRequest
*request
, MetaSession
*session
,
2445 bool drop_cap_releases
)
2448 mds_rank_t mds
= session
->mds_num
;
2449 ldout(cct
, 10) << __func__
<< " rebuilding request " << request
->get_tid()
2450 << " for mds." << mds
<< dendl
;
2451 auto r
= build_client_request(request
);
2452 if (request
->dentry()) {
2453 r
->set_dentry_wanted();
2455 if (request
->got_unsafe
) {
2456 r
->set_replayed_op();
2457 if (request
->target
)
2458 r
->head
.ino
= request
->target
->ino
;
2460 encode_cap_releases(request
, mds
);
2461 if (drop_cap_releases
) // we haven't send cap reconnect yet, drop cap releases
2462 request
->cap_releases
.clear();
2464 r
->releases
.swap(request
->cap_releases
);
2466 r
->set_mdsmap_epoch(mdsmap
->get_epoch());
2467 if (r
->head
.op
== CEPH_MDS_OP_SETXATTR
) {
2468 objecter
->with_osdmap([r
](const OSDMap
& o
) {
2469 r
->set_osdmap_epoch(o
.get_epoch());
2473 if (request
->mds
== -1) {
2474 request
->sent_stamp
= ceph_clock_now();
2475 ldout(cct
, 20) << __func__
<< " set sent_stamp to " << request
->sent_stamp
<< dendl
;
2479 Inode
*in
= request
->inode();
2481 auto it
= in
->caps
.find(mds
);
2482 if (it
!= in
->caps
.end()) {
2483 request
->sent_on_mseq
= it
->second
.mseq
;
2487 session
->requests
.push_back(&request
->item
);
2489 ldout(cct
, 10) << __func__
<< " " << *r
<< " to mds." << mds
<< dendl
;
2490 session
->con
->send_message2(std::move(r
));
2493 ref_t
<MClientRequest
> Client::build_client_request(MetaRequest
*request
)
2495 auto req
= make_message
<MClientRequest
>(request
->get_op());
2496 req
->set_tid(request
->tid
);
2497 req
->set_stamp(request
->op_stamp
);
2498 memcpy(&req
->head
, &request
->head
, sizeof(ceph_mds_request_head
));
2500 // if the filepath's haven't been set, set them!
2501 if (request
->path
.empty()) {
2502 Inode
*in
= request
->inode();
2503 Dentry
*de
= request
->dentry();
2505 in
->make_nosnap_relative_path(request
->path
);
2508 de
->inode
->make_nosnap_relative_path(request
->path
);
2510 de
->dir
->parent_inode
->make_nosnap_relative_path(request
->path
);
2511 request
->path
.push_dentry(de
->name
);
2513 else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2514 << " No path, inode, or appropriately-endowed dentry given!"
2516 } else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2517 << " No path, inode, or dentry given!"
2520 req
->set_filepath(request
->get_filepath());
2521 req
->set_filepath2(request
->get_filepath2());
2522 req
->set_alternate_name(request
->alternate_name
);
2523 req
->set_data(request
->data
);
2524 req
->set_retry_attempt(request
->retry_attempt
++);
2525 req
->head
.num_fwd
= request
->num_fwd
;
2527 int gid_count
= request
->perms
.get_gids(&_gids
);
2528 req
->set_gid_list(gid_count
, _gids
);
2534 void Client::handle_client_request_forward(const MConstRef
<MClientRequestForward
>& fwd
)
2536 mds_rank_t mds
= mds_rank_t(fwd
->get_source().num());
2538 std::scoped_lock
cl(client_lock
);
2539 auto session
= _get_mds_session(mds
, fwd
->get_connection().get());
2543 ceph_tid_t tid
= fwd
->get_tid();
2545 if (mds_requests
.count(tid
) == 0) {
2546 ldout(cct
, 10) << __func__
<< " no pending request on tid " << tid
<< dendl
;
2550 MetaRequest
*request
= mds_requests
[tid
];
2551 ceph_assert(request
);
2554 * The type of 'num_fwd' in ceph 'MClientRequestForward'
2555 * is 'int32_t', while in 'ceph_mds_request_head' the
2556 * type is '__u8'. So in case the request bounces between
2557 * MDSes exceeding 256 times, the client will get stuck.
2559 * In this case it's ususally a bug in MDS and continue
2560 * bouncing the request makes no sense.
2562 * In future this could be fixed in ceph code, so avoid
2563 * using the hardcode here.
2565 int max_fwd
= sizeof(((struct ceph_mds_request_head
*)0)->num_fwd
);
2566 max_fwd
= 1 << (max_fwd
* CHAR_BIT
) - 1;
2567 auto num_fwd
= fwd
->get_num_fwd();
2568 if (num_fwd
<= request
->num_fwd
|| num_fwd
>= max_fwd
) {
2569 if (request
->num_fwd
>= max_fwd
|| num_fwd
>= max_fwd
) {
2570 request
->abort(-EMULTIHOP
);
2571 request
->caller_cond
->notify_all();
2572 ldout(cct
, 1) << __func__
<< " tid " << tid
<< " seq overflow"
2573 << ", abort it" << dendl
;
2575 ldout(cct
, 10) << __func__
<< " tid " << tid
2576 << " old fwd seq " << fwd
->get_num_fwd()
2577 << " <= req fwd " << request
->num_fwd
2578 << ", ignore it" << dendl
;
2583 // reset retry counter
2584 request
->retry_attempt
= 0;
2586 // request not forwarded, or dest mds has no session.
2588 ldout(cct
, 10) << __func__
<< " tid " << tid
2589 << " fwd " << fwd
->get_num_fwd()
2590 << " to mds." << fwd
->get_dest_mds()
2591 << ", resending to " << fwd
->get_dest_mds()
2595 request
->item
.remove_myself();
2596 request
->num_fwd
= num_fwd
;
2597 request
->resend_mds
= fwd
->get_dest_mds();
2598 request
->caller_cond
->notify_all();
2601 bool Client::is_dir_operation(MetaRequest
*req
)
2603 int op
= req
->get_op();
2604 if (op
== CEPH_MDS_OP_MKNOD
|| op
== CEPH_MDS_OP_LINK
||
2605 op
== CEPH_MDS_OP_UNLINK
|| op
== CEPH_MDS_OP_RENAME
||
2606 op
== CEPH_MDS_OP_MKDIR
|| op
== CEPH_MDS_OP_RMDIR
||
2607 op
== CEPH_MDS_OP_SYMLINK
|| op
== CEPH_MDS_OP_CREATE
)
2612 void Client::handle_client_reply(const MConstRef
<MClientReply
>& reply
)
2614 mds_rank_t mds_num
= mds_rank_t(reply
->get_source().num());
2616 std::scoped_lock
cl(client_lock
);
2617 auto session
= _get_mds_session(mds_num
, reply
->get_connection().get());
2622 ceph_tid_t tid
= reply
->get_tid();
2623 bool is_safe
= reply
->is_safe();
2625 if (mds_requests
.count(tid
) == 0) {
2626 lderr(cct
) << __func__
<< " no pending request on tid " << tid
2627 << " safe is:" << is_safe
<< dendl
;
2630 MetaRequest
*request
= mds_requests
.at(tid
);
2632 ldout(cct
, 20) << __func__
<< " got a reply. Safe:" << is_safe
2633 << " tid " << tid
<< dendl
;
2635 if (request
->got_unsafe
&& !is_safe
) {
2636 //duplicate response
2637 ldout(cct
, 0) << "got a duplicate reply on tid " << tid
<< " from mds "
2638 << mds_num
<< " safe:" << is_safe
<< dendl
;
2642 ceph_assert(!request
->reply
);
2643 request
->reply
= reply
;
2644 insert_trace(request
, session
.get());
2646 // Handle unsafe reply
2648 request
->got_unsafe
= true;
2649 session
->unsafe_requests
.push_back(&request
->unsafe_item
);
2650 if (is_dir_operation(request
)) {
2651 Inode
*dir
= request
->inode();
2653 dir
->unsafe_ops
.push_back(&request
->unsafe_dir_item
);
2655 if (request
->target
) {
2656 InodeRef
&in
= request
->target
;
2657 in
->unsafe_ops
.push_back(&request
->unsafe_target_item
);
2661 // Only signal the caller once (on the first reply):
2662 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2663 if (!is_safe
|| !request
->got_unsafe
) {
2664 ceph::condition_variable cond
;
2665 request
->dispatch_cond
= &cond
;
2668 ldout(cct
, 20) << __func__
<< " signalling caller " << (void*)request
->caller_cond
<< dendl
;
2669 request
->caller_cond
->notify_all();
2671 // wake for kick back
2672 std::unique_lock l
{client_lock
, std::adopt_lock
};
2673 cond
.wait(l
, [tid
, request
, &cond
, this] {
2674 if (request
->dispatch_cond
) {
2675 ldout(cct
, 20) << "handle_client_reply awaiting kickback on tid "
2676 << tid
<< " " << &cond
<< dendl
;
2678 return !request
->dispatch_cond
;
2684 // the filesystem change is committed to disk
2685 // we're done, clean up
2686 if (request
->got_unsafe
) {
2687 request
->unsafe_item
.remove_myself();
2688 request
->unsafe_dir_item
.remove_myself();
2689 request
->unsafe_target_item
.remove_myself();
2690 signal_cond_list(request
->waitfor_safe
);
2692 request
->item
.remove_myself();
2693 unregister_request(request
);
2695 if (is_unmounting())
2696 mount_cond
.notify_all();
2699 void Client::_handle_full_flag(int64_t pool
)
2701 ldout(cct
, 1) << __func__
<< ": FULL: cancelling outstanding operations "
2702 << "on " << pool
<< dendl
;
2703 // Cancel all outstanding ops in this pool with -CEPHFS_ENOSPC: it is necessary
2704 // to do this rather than blocking, because otherwise when we fill up we
2705 // potentially lock caps forever on files with dirty pages, and we need
2706 // to be able to release those caps to the MDS so that it can delete files
2707 // and free up space.
2708 epoch_t cancelled_epoch
= objecter
->op_cancel_writes(-CEPHFS_ENOSPC
, pool
);
2710 // For all inodes with layouts in this pool and a pending flush write op
2711 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2712 // from ObjectCacher so that it doesn't re-issue the write in response to
2713 // the ENOSPC error.
2714 // Fortunately since we're cancelling everything in a given pool, we don't
2715 // need to know which ops belong to which ObjectSet, we can just blow all
2716 // the un-flushed cached data away and mark any dirty inodes' async_err
2717 // field with -CEPHFS_ENOSPC as long as we're sure all the ops we cancelled were
2718 // affecting this pool, and all the objectsets we're purging were also
2720 for (unordered_map
<vinodeno_t
,Inode
*>::iterator i
= inode_map
.begin();
2721 i
!= inode_map
.end(); ++i
)
2723 Inode
*inode
= i
->second
;
2724 if (inode
->oset
.dirty_or_tx
2725 && (pool
== -1 || inode
->layout
.pool_id
== pool
)) {
2726 ldout(cct
, 4) << __func__
<< ": FULL: inode 0x" << std::hex
<< i
->first
<< std::dec
2727 << " has dirty objects, purging and setting ENOSPC" << dendl
;
2728 objectcacher
->purge_set(&inode
->oset
);
2729 inode
->set_async_err(-CEPHFS_ENOSPC
);
2733 if (cancelled_epoch
!= (epoch_t
)-1) {
2734 set_cap_epoch_barrier(cancelled_epoch
);
2738 void Client::handle_osd_map(const MConstRef
<MOSDMap
>& m
)
2740 std::scoped_lock
cl(client_lock
);
2742 const auto myaddrs
= messenger
->get_myaddrs();
2743 bool new_blocklist
= objecter
->with_osdmap(
2744 [&](const OSDMap
& o
) {
2745 return o
.is_blocklisted(myaddrs
);
2748 if (new_blocklist
&& !blocklisted
) {
2749 auto epoch
= objecter
->with_osdmap([](const OSDMap
&o
){
2750 return o
.get_epoch();
2752 lderr(cct
) << "I was blocklisted at osd epoch " << epoch
<< dendl
;
2755 _abort_mds_sessions(-CEPHFS_EBLOCKLISTED
);
2757 // Since we know all our OSD ops will fail, cancel them all preemtively,
2758 // so that on an unhealthy cluster we can umount promptly even if e.g.
2759 // some PGs were inaccessible.
2760 objecter
->op_cancel_writes(-CEPHFS_EBLOCKLISTED
);
2765 // Handle case where we were blocklisted but no longer are
2766 blocklisted
= objecter
->with_osdmap([myaddrs
](const OSDMap
&o
){
2767 return o
.is_blocklisted(myaddrs
);});
2770 // Always subscribe to next osdmap for blocklisted client
2771 // until this client is not blocklisted.
2773 objecter
->maybe_request_map();
2776 if (objecter
->osdmap_full_flag()) {
2777 _handle_full_flag(-1);
2779 // Accumulate local list of full pools so that I can drop
2780 // the objecter lock before re-entering objecter in
2782 std::vector
<int64_t> full_pools
;
2784 objecter
->with_osdmap([&full_pools
](const OSDMap
&o
) {
2785 for (const auto& kv
: o
.get_pools()) {
2786 if (kv
.second
.has_flag(pg_pool_t::FLAG_FULL
)) {
2787 full_pools
.push_back(kv
.first
);
2792 for (auto p
: full_pools
)
2793 _handle_full_flag(p
);
2795 // Subscribe to subsequent maps to watch for the full flag going
2796 // away. For the global full flag objecter does this for us, but
2797 // it pays no attention to the per-pool full flag so in this branch
2798 // we do it ourselves.
2799 if (!full_pools
.empty()) {
2800 objecter
->maybe_request_map();
2806 // ------------------------
2807 // incoming messages
2810 bool Client::ms_dispatch2(const MessageRef
&m
)
2812 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
2813 if (!iref_reader
.is_state_satisfied()) {
2814 ldout(cct
, 10) << "inactive, discarding " << *m
<< dendl
;
2818 switch (m
->get_type()) {
2819 // mounting and mds sessions
2820 case CEPH_MSG_MDS_MAP
:
2821 handle_mds_map(ref_cast
<MMDSMap
>(m
));
2823 case CEPH_MSG_FS_MAP
:
2824 handle_fs_map(ref_cast
<MFSMap
>(m
));
2826 case CEPH_MSG_FS_MAP_USER
:
2827 handle_fs_map_user(ref_cast
<MFSMapUser
>(m
));
2829 case CEPH_MSG_CLIENT_SESSION
:
2830 handle_client_session(ref_cast
<MClientSession
>(m
));
2833 case CEPH_MSG_OSD_MAP
:
2834 handle_osd_map(ref_cast
<MOSDMap
>(m
));
2838 case CEPH_MSG_CLIENT_REQUEST_FORWARD
:
2839 handle_client_request_forward(ref_cast
<MClientRequestForward
>(m
));
2841 case CEPH_MSG_CLIENT_REPLY
:
2842 handle_client_reply(ref_cast
<MClientReply
>(m
));
2846 case CEPH_MSG_CLIENT_RECLAIM_REPLY
:
2847 handle_client_reclaim_reply(ref_cast
<MClientReclaimReply
>(m
));
2850 case CEPH_MSG_CLIENT_SNAP
:
2851 handle_snap(ref_cast
<MClientSnap
>(m
));
2853 case CEPH_MSG_CLIENT_CAPS
:
2854 handle_caps(ref_cast
<MClientCaps
>(m
));
2856 case CEPH_MSG_CLIENT_LEASE
:
2857 handle_lease(ref_cast
<MClientLease
>(m
));
2859 case MSG_COMMAND_REPLY
:
2860 if (m
->get_source().type() == CEPH_ENTITY_TYPE_MDS
) {
2861 handle_command_reply(ref_cast
<MCommandReply
>(m
));
2866 case CEPH_MSG_CLIENT_QUOTA
:
2867 handle_quota(ref_cast
<MClientQuota
>(m
));
2875 std::scoped_lock
cl(client_lock
);
2876 if (is_unmounting()) {
2877 ldout(cct
, 10) << "unmounting: trim pass, size was " << lru
.lru_get_size()
2878 << "+" << inode_map
.size() << dendl
;
2879 uint64_t size
= lru
.lru_get_size() + inode_map
.size();
2881 if (size
> lru
.lru_get_size() + inode_map
.size()) {
2882 ldout(cct
, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl
;
2883 mount_cond
.notify_all();
2885 ldout(cct
, 10) << "unmounting: trim pass, size still " << lru
.lru_get_size()
2886 << "+" << inode_map
.size() << dendl
;
2893 void Client::handle_fs_map(const MConstRef
<MFSMap
>& m
)
2895 std::scoped_lock
cl(client_lock
);
2896 fsmap
.reset(new FSMap(m
->get_fsmap()));
2898 signal_cond_list(waiting_for_fsmap
);
2900 monclient
->sub_got("fsmap", fsmap
->get_epoch());
2903 void Client::handle_fs_map_user(const MConstRef
<MFSMapUser
>& m
)
2905 std::scoped_lock
cl(client_lock
);
2906 fsmap_user
.reset(new FSMapUser
);
2907 *fsmap_user
= m
->get_fsmap();
2909 monclient
->sub_got("fsmap.user", fsmap_user
->get_epoch());
2910 signal_cond_list(waiting_for_fsmap
);
2913 // Cancel all the commands for missing or laggy GIDs
2914 void Client::cancel_commands(const MDSMap
& newmap
)
2916 std::vector
<ceph_tid_t
> cancel_ops
;
2918 std::scoped_lock
cmd_lock(command_lock
);
2919 auto &commands
= command_table
.get_commands();
2920 for (const auto &[tid
, op
] : commands
) {
2921 const mds_gid_t op_mds_gid
= op
.mds_gid
;
2922 if (newmap
.is_dne_gid(op_mds_gid
) || newmap
.is_laggy_gid(op_mds_gid
)) {
2923 ldout(cct
, 1) << __func__
<< ": cancelling command op " << tid
<< dendl
;
2924 cancel_ops
.push_back(tid
);
2926 std::ostringstream ss
;
2927 ss
<< "MDS " << op_mds_gid
<< " went away";
2928 *(op
.outs
) = ss
.str();
2931 * No need to make the con->mark_down under
2932 * client_lock here, because the con will
2935 op
.con
->mark_down();
2937 op
.on_finish
->complete(-CEPHFS_ETIMEDOUT
);
2941 for (const auto &tid
: cancel_ops
)
2942 command_table
.erase(tid
);
2945 void Client::handle_mds_map(const MConstRef
<MMDSMap
>& m
)
2947 std::unique_lock
cl(client_lock
);
2948 if (m
->get_epoch() <= mdsmap
->get_epoch()) {
2949 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch()
2950 << " is identical to or older than our "
2951 << mdsmap
->get_epoch() << dendl
;
2956 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch() << dendl
;
2957 std::unique_ptr
<MDSMap
> _mdsmap(new MDSMap
);
2958 _mdsmap
->decode(m
->get_encoded());
2959 cancel_commands(*_mdsmap
.get());
2962 _mdsmap
.swap(mdsmap
);
2965 for (auto p
= mds_sessions
.begin(); p
!= mds_sessions
.end(); ) {
2966 mds_rank_t mds
= p
->first
;
2967 MetaSessionRef session
= p
->second
;
2970 int oldstate
= _mdsmap
->get_state(mds
);
2971 int newstate
= mdsmap
->get_state(mds
);
2972 if (!mdsmap
->is_up(mds
)) {
2973 session
->con
->mark_down();
2974 } else if (mdsmap
->get_addrs(mds
) != session
->addrs
) {
2975 auto old_inc
= _mdsmap
->get_incarnation(mds
);
2976 auto new_inc
= mdsmap
->get_incarnation(mds
);
2977 if (old_inc
!= new_inc
) {
2978 ldout(cct
, 1) << "mds incarnation changed from "
2979 << old_inc
<< " to " << new_inc
<< dendl
;
2980 oldstate
= MDSMap::STATE_NULL
;
2982 session
->con
->mark_down();
2983 session
->addrs
= mdsmap
->get_addrs(mds
);
2984 // When new MDS starts to take over, notify kernel to trim unused entries
2985 // in its dcache/icache. Hopefully, the kernel will release some unused
2986 // inodes before the new MDS enters reconnect state.
2987 trim_cache_for_reconnect(session
.get());
2988 } else if (oldstate
== newstate
)
2989 continue; // no change
2991 session
->mds_state
= newstate
;
2992 if (newstate
== MDSMap::STATE_RECONNECT
) {
2993 session
->con
= messenger
->connect_to_mds(session
->addrs
);
2994 send_reconnect(session
.get());
2995 } else if (newstate
> MDSMap::STATE_RECONNECT
) {
2996 if (oldstate
< MDSMap::STATE_RECONNECT
) {
2997 ldout(cct
, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl
;
2998 _closed_mds_session(session
.get());
3001 if (newstate
>= MDSMap::STATE_ACTIVE
) {
3002 if (oldstate
< MDSMap::STATE_ACTIVE
) {
3003 // kick new requests
3004 kick_requests(session
.get());
3005 kick_flushing_caps(session
.get());
3006 signal_context_list(session
->waiting_for_open
);
3007 wake_up_session_caps(session
.get(), true);
3009 connect_mds_targets(mds
);
3011 } else if (newstate
== MDSMap::STATE_NULL
&&
3012 mds
>= mdsmap
->get_max_mds()) {
3013 _closed_mds_session(session
.get());
3017 // kick any waiting threads
3018 signal_cond_list(waiting_for_mdsmap
);
3020 monclient
->sub_got("mdsmap", mdsmap
->get_epoch());
3023 void Client::send_reconnect(MetaSession
*session
)
3025 mds_rank_t mds
= session
->mds_num
;
3026 ldout(cct
, 10) << __func__
<< " to mds." << mds
<< dendl
;
3028 // trim unused caps to reduce MDS's cache rejoin time
3029 trim_cache_for_reconnect(session
);
3031 session
->readonly
= false;
3033 session
->release
.reset();
3035 // reset my cap seq number
3037 //connect to the mds' offload targets
3038 connect_mds_targets(mds
);
3039 //make sure unsafe requests get saved
3040 resend_unsafe_requests(session
);
3042 early_kick_flushing_caps(session
);
3044 auto m
= make_message
<MClientReconnect
>();
3045 bool allow_multi
= session
->mds_features
.test(CEPHFS_FEATURE_MULTI_RECONNECT
);
3047 // i have an open session.
3048 ceph::unordered_set
<inodeno_t
> did_snaprealm
;
3049 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
= inode_map
.begin();
3050 p
!= inode_map
.end();
3052 Inode
*in
= p
->second
;
3053 auto it
= in
->caps
.find(mds
);
3054 if (it
!= in
->caps
.end()) {
3056 m
->get_approx_size() >=
3057 static_cast<size_t>((std::numeric_limits
<int>::max() >> 1))) {
3059 session
->con
->send_message2(std::move(m
));
3061 m
= make_message
<MClientReconnect
>();
3064 Cap
&cap
= it
->second
;
3065 ldout(cct
, 10) << " caps on " << p
->first
3066 << " " << ccap_string(cap
.issued
)
3067 << " wants " << ccap_string(in
->caps_wanted())
3070 in
->make_short_path(path
);
3071 ldout(cct
, 10) << " path " << path
<< dendl
;
3074 _encode_filelocks(in
, flockbl
);
3076 cap
.seq
= 0; // reset seq.
3077 cap
.issue_seq
= 0; // reset seq.
3078 cap
.mseq
= 0; // reset seq.
3079 // cap gen should catch up with session cap_gen
3080 if (cap
.gen
< session
->cap_gen
) {
3081 cap
.gen
= session
->cap_gen
;
3082 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
3084 cap
.issued
= cap
.implemented
;
3086 snapid_t snap_follows
= 0;
3087 if (!in
->cap_snaps
.empty())
3088 snap_follows
= in
->cap_snaps
.begin()->first
;
3090 m
->add_cap(p
->first
.ino
,
3092 path
.get_ino(), path
.get_path(), // ino
3093 in
->caps_wanted(), // wanted
3094 cap
.issued
, // issued
3099 if (did_snaprealm
.count(in
->snaprealm
->ino
) == 0) {
3100 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
3101 m
->add_snaprealm(in
->snaprealm
->ino
, in
->snaprealm
->seq
, in
->snaprealm
->parent
);
3102 did_snaprealm
.insert(in
->snaprealm
->ino
);
3108 m
->set_encoding_version(0); // use connection features to choose encoding
3109 session
->con
->send_message2(std::move(m
));
3111 mount_cond
.notify_all();
3113 if (session
->reclaim_state
== MetaSession::RECLAIMING
)
3114 signal_cond_list(waiting_for_reclaim
);
3118 void Client::kick_requests(MetaSession
*session
)
3120 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
3121 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
3122 p
!= mds_requests
.end();
3124 MetaRequest
*req
= p
->second
;
3125 if (req
->got_unsafe
)
3127 if (req
->aborted()) {
3128 if (req
->caller_cond
) {
3130 req
->caller_cond
->notify_all();
3134 if (req
->retry_attempt
> 0)
3135 continue; // new requests only
3136 if (req
->mds
== session
->mds_num
) {
3137 send_request(p
->second
, session
);
3142 void Client::resend_unsafe_requests(MetaSession
*session
)
3144 for (xlist
<MetaRequest
*>::iterator iter
= session
->unsafe_requests
.begin();
3147 send_request(*iter
, session
);
3149 // also re-send old requests when MDS enters reconnect stage. So that MDS can
3150 // process completed requests in clientreplay stage.
3151 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
3152 p
!= mds_requests
.end();
3154 MetaRequest
*req
= p
->second
;
3155 if (req
->got_unsafe
)
3159 if (req
->retry_attempt
== 0)
3160 continue; // old requests only
3161 if (req
->mds
== session
->mds_num
)
3162 send_request(req
, session
, true);
3166 void Client::wait_unsafe_requests()
3168 list
<MetaRequest
*> last_unsafe_reqs
;
3169 for (const auto &p
: mds_sessions
) {
3170 const auto s
= p
.second
;
3171 if (!s
->unsafe_requests
.empty()) {
3172 MetaRequest
*req
= s
->unsafe_requests
.back();
3174 last_unsafe_reqs
.push_back(req
);
3178 for (list
<MetaRequest
*>::iterator p
= last_unsafe_reqs
.begin();
3179 p
!= last_unsafe_reqs
.end();
3181 MetaRequest
*req
= *p
;
3182 if (req
->unsafe_item
.is_on_list())
3183 wait_on_list(req
->waitfor_safe
);
3188 void Client::kick_requests_closed(MetaSession
*session
)
3190 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
3191 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
3192 p
!= mds_requests
.end(); ) {
3193 MetaRequest
*req
= p
->second
;
3195 if (req
->mds
== session
->mds_num
) {
3196 if (req
->caller_cond
) {
3198 req
->caller_cond
->notify_all();
3200 req
->item
.remove_myself();
3201 if (req
->got_unsafe
) {
3202 lderr(cct
) << __func__
<< " removing unsafe request " << req
->get_tid() << dendl
;
3203 req
->unsafe_item
.remove_myself();
3204 if (is_dir_operation(req
)) {
3205 Inode
*dir
= req
->inode();
3207 dir
->set_async_err(-CEPHFS_EIO
);
3208 lderr(cct
) << "kick_requests_closed drop req of inode(dir) : "
3209 << dir
->ino
<< " " << req
->get_tid() << dendl
;
3210 req
->unsafe_dir_item
.remove_myself();
3213 InodeRef
&in
= req
->target
;
3214 in
->set_async_err(-CEPHFS_EIO
);
3215 lderr(cct
) << "kick_requests_closed drop req of inode : "
3216 << in
->ino
<< " " << req
->get_tid() << dendl
;
3217 req
->unsafe_target_item
.remove_myself();
3219 signal_cond_list(req
->waitfor_safe
);
3220 unregister_request(req
);
3224 ceph_assert(session
->requests
.empty());
3225 ceph_assert(session
->unsafe_requests
.empty());
3235 void Client::got_mds_push(MetaSession
*s
)
3238 ldout(cct
, 10) << " mds." << s
->mds_num
<< " seq now " << s
->seq
<< dendl
;
3239 if (s
->state
== MetaSession::STATE_CLOSING
) {
3240 s
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
3244 void Client::handle_lease(const MConstRef
<MClientLease
>& m
)
3246 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
3248 ceph_assert(m
->get_action() == CEPH_MDS_LEASE_REVOKE
);
3249 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
3251 std::scoped_lock
cl(client_lock
);
3252 auto session
= _get_mds_session(mds
, m
->get_connection().get());
3257 got_mds_push(session
.get());
3259 ceph_seq_t seq
= m
->get_seq();
3262 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
3263 if (inode_map
.count(vino
) == 0) {
3264 ldout(cct
, 10) << " don't have vino " << vino
<< dendl
;
3267 in
= inode_map
[vino
];
3269 if (m
->get_mask() & CEPH_LEASE_VALID
) {
3270 if (!in
->dir
|| in
->dir
->dentries
.count(m
->dname
) == 0) {
3271 ldout(cct
, 10) << " don't have dir|dentry " << m
->get_ino() << "/" << m
->dname
<<dendl
;
3274 Dentry
*dn
= in
->dir
->dentries
[m
->dname
];
3275 ldout(cct
, 10) << " revoked DN lease on " << dn
<< dendl
;
3281 auto reply
= make_message
<MClientLease
>(CEPH_MDS_LEASE_RELEASE
, seq
,
3282 m
->get_mask(), m
->get_ino(),
3283 m
->get_first(), m
->get_last(), m
->dname
);
3284 m
->get_connection()->send_message2(std::move(reply
));
3288 void Client::_put_inode(Inode
*in
, int n
)
3290 ldout(cct
, 10) << __func__
<< " on " << *in
<< " n = " << n
<< dendl
;
3292 int left
= in
->get_nref();
3293 ceph_assert(left
>= n
+ 1);
3296 if (left
== 1) { // the last one will be held by the inode_map
3298 remove_all_caps(in
);
3300 ldout(cct
, 10) << __func__
<< " deleting " << *in
<< dendl
;
3301 bool unclean
= objectcacher
->release_set(&in
->oset
);
3302 ceph_assert(!unclean
);
3303 inode_map
.erase(in
->vino());
3304 if (use_faked_inos())
3305 _release_faked_ino(in
);
3307 if (root
== nullptr) {
3309 while (!root_parents
.empty())
3310 root_parents
.erase(root_parents
.begin());
3317 void Client::delay_put_inodes(bool wakeup
)
3319 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
3321 std::map
<Inode
*,int> release
;
3323 std::scoped_lock
dl(delay_i_lock
);
3324 release
.swap(delay_i_release
);
3327 if (release
.empty())
3330 for (auto &[in
, cnt
] : release
)
3331 _put_inode(in
, cnt
);
3334 mount_cond
.notify_all();
3337 void Client::put_inode(Inode
*in
, int n
)
3339 ldout(cct
, 20) << __func__
<< " on " << *in
<< " n = " << n
<< dendl
;
3341 std::scoped_lock
dl(delay_i_lock
);
3342 delay_i_release
[in
] += n
;
3345 void Client::close_dir(Dir
*dir
)
3347 Inode
*in
= dir
->parent_inode
;
3348 ldout(cct
, 15) << __func__
<< " dir " << dir
<< " on " << in
<< dendl
;
3349 ceph_assert(dir
->is_empty());
3350 ceph_assert(in
->dir
== dir
);
3351 ceph_assert(in
->dentries
.size() < 2); // dirs can't be hard-linked
3352 if (!in
->dentries
.empty())
3353 in
->get_first_parent()->put(); // unpin dentry
3357 put_inode(in
); // unpin inode
3361 * Don't call this with in==NULL, use get_or_create for that
3362 * leave dn set to default NULL unless you're trying to add
3363 * a new inode to a pre-created Dentry
3365 Dentry
* Client::link(Dir
*dir
, const string
& name
, Inode
*in
, Dentry
*dn
)
3368 // create a new Dentry
3369 dn
= new Dentry(dir
, name
);
3371 lru
.lru_insert_mid(dn
); // mid or top?
3373 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
3374 << " dn " << dn
<< " (new dn)" << dendl
;
3376 ceph_assert(!dn
->inode
);
3377 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
3378 << " dn " << dn
<< " (old dn)" << dendl
;
3381 if (in
) { // link to inode
3383 // only one parent for directories!
3384 if (in
->is_dir() && !in
->dentries
.empty()) {
3385 tmp_ref
= in
; // prevent unlink below from freeing the inode.
3386 Dentry
*olddn
= in
->get_first_parent();
3387 ceph_assert(olddn
->dir
!= dir
|| olddn
->name
!= name
);
3388 Inode
*old_diri
= olddn
->dir
->parent_inode
;
3389 clear_dir_complete_and_ordered(old_diri
, true);
3390 unlink(olddn
, true, true); // keep dir, dentry
3395 ldout(cct
, 20) << "link inode " << in
<< " parents now " << in
->dentries
<< dendl
;
3401 void Client::unlink(Dentry
*dn
, bool keepdir
, bool keepdentry
)
3403 InodeRef
in(dn
->inode
);
3404 ldout(cct
, 15) << "unlink dir " << dn
->dir
->parent_inode
<< " '" << dn
->name
<< "' dn " << dn
3405 << " inode " << dn
->inode
<< dendl
;
3407 // unlink from inode
3411 ldout(cct
, 20) << "unlink inode " << in
<< " parents now " << in
->dentries
<< dendl
;
3417 ldout(cct
, 15) << "unlink removing '" << dn
->name
<< "' dn " << dn
<< dendl
;
3427 if (dir
->is_empty() && !keepdir
)
3433 * For asynchronous flushes, check for errors from the IO and
3434 * update the inode if necessary
3436 class C_Client_FlushComplete
: public Context
{
3441 C_Client_FlushComplete(Client
*c
, Inode
*in
) : client(c
), inode(in
) { }
3442 void finish(int r
) override
{
3443 ceph_assert(ceph_mutex_is_locked_by_me(client
->client_lock
));
3445 client_t
const whoami
= client
->whoami
; // For the benefit of ldout prefix
3446 ldout(client
->cct
, 1) << "I/O error from flush on inode " << inode
3447 << " 0x" << std::hex
<< inode
->ino
<< std::dec
3448 << ": " << r
<< "(" << cpp_strerror(r
) << ")" << dendl
;
3449 inode
->set_async_err(r
);
3459 void Client::get_cap_ref(Inode
*in
, int cap
)
3461 if ((cap
& CEPH_CAP_FILE_BUFFER
) &&
3462 in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] == 0) {
3463 ldout(cct
, 5) << __func__
<< " got first FILE_BUFFER ref on " << *in
<< dendl
;
3466 if ((cap
& CEPH_CAP_FILE_CACHE
) &&
3467 in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
3468 ldout(cct
, 5) << __func__
<< " got first FILE_CACHE ref on " << *in
<< dendl
;
3471 in
->get_cap_ref(cap
);
3474 void Client::put_cap_ref(Inode
*in
, int cap
)
3476 int last
= in
->put_cap_ref(cap
);
3479 int drop
= last
& ~in
->caps_issued();
3480 if (in
->snapid
== CEPH_NOSNAP
) {
3481 if ((last
& (CEPH_CAP_FILE_WR
| CEPH_CAP_FILE_BUFFER
)) &&
3482 !in
->cap_snaps
.empty() &&
3483 in
->cap_snaps
.rbegin()->second
.writing
) {
3484 ldout(cct
, 10) << __func__
<< " finishing pending cap_snap on " << *in
<< dendl
;
3485 in
->cap_snaps
.rbegin()->second
.writing
= 0;
3486 finish_cap_snap(in
, in
->cap_snaps
.rbegin()->second
, get_caps_used(in
));
3487 signal_cond_list(in
->waitfor_caps
); // wake up blocked sync writers
3489 if (last
& CEPH_CAP_FILE_BUFFER
) {
3490 for (auto &p
: in
->cap_snaps
)
3491 p
.second
.dirty_data
= 0;
3492 signal_cond_list(in
->waitfor_commit
);
3493 ldout(cct
, 5) << __func__
<< " dropped last FILE_BUFFER ref on " << *in
<< dendl
;
3497 if (last
& CEPH_CAP_FILE_CACHE
) {
3498 ldout(cct
, 5) << __func__
<< " dropped last FILE_CACHE ref on " << *in
<< dendl
;
3504 put_inode(in
, put_nref
);
3508 // get caps for a given file handle -- the inode should have @need caps
3509 // issued by the mds and @want caps not revoked (or not under revocation).
3510 // this routine blocks till the cap requirement is satisfied. also account
3511 // (track) for capability hit when required (when cap requirement succeedes).
3512 int Client::get_caps(Fh
*fh
, int need
, int want
, int *phave
, loff_t endoff
)
3514 Inode
*in
= fh
->inode
.get();
3516 int r
= check_pool_perm(in
, need
);
3521 int file_wanted
= in
->caps_file_wanted();
3522 if ((file_wanted
& need
) != need
) {
3523 ldout(cct
, 10) << "get_caps " << *in
<< " need " << ccap_string(need
)
3524 << " file_wanted " << ccap_string(file_wanted
) << ", EBADF "
3526 return -CEPHFS_EBADF
;
3529 if ((fh
->mode
& CEPH_FILE_MODE_WR
) && fh
->gen
!= fd_gen
)
3530 return -CEPHFS_EBADF
;
3532 if ((in
->flags
& I_ERROR_FILELOCK
) && fh
->has_any_filelocks())
3536 int have
= in
->caps_issued(&implemented
);
3538 bool waitfor_caps
= false;
3539 bool waitfor_commit
= false;
3541 if (have
& need
& CEPH_CAP_FILE_WR
) {
3543 if ((endoff
>= (loff_t
)in
->max_size
||
3544 endoff
> (loff_t
)(in
->size
<< 1)) &&
3545 endoff
> (loff_t
)in
->wanted_max_size
) {
3546 ldout(cct
, 10) << "wanted_max_size " << in
->wanted_max_size
<< " -> " << endoff
<< dendl
;
3547 in
->wanted_max_size
= endoff
;
3549 if (in
->wanted_max_size
> in
->max_size
&&
3550 in
->wanted_max_size
> in
->requested_max_size
)
3554 if (endoff
>= 0 && endoff
> (loff_t
)in
->max_size
) {
3555 ldout(cct
, 10) << "waiting on max_size, endoff " << endoff
<< " max_size " << in
->max_size
<< " on " << *in
<< dendl
;
3556 waitfor_caps
= true;
3558 if (!in
->cap_snaps
.empty()) {
3559 if (in
->cap_snaps
.rbegin()->second
.writing
) {
3560 ldout(cct
, 10) << "waiting on cap_snap write to complete" << dendl
;
3561 waitfor_caps
= true;
3563 for (auto &p
: in
->cap_snaps
) {
3564 if (p
.second
.dirty_data
) {
3565 waitfor_commit
= true;
3569 if (waitfor_commit
) {
3570 _flush(in
, new C_Client_FlushComplete(this, in
));
3571 ldout(cct
, 10) << "waiting for WRBUFFER to get dropped" << dendl
;
3576 if (!waitfor_caps
&& !waitfor_commit
) {
3577 if ((have
& need
) == need
) {
3578 int revoking
= implemented
& ~have
;
3579 ldout(cct
, 10) << "get_caps " << *in
<< " have " << ccap_string(have
)
3580 << " need " << ccap_string(need
) << " want " << ccap_string(want
)
3581 << " revoking " << ccap_string(revoking
)
3583 if ((revoking
& want
) == 0) {
3584 *phave
= need
| (have
& want
);
3585 in
->get_cap_ref(need
);
3590 ldout(cct
, 10) << "waiting for caps " << *in
<< " need " << ccap_string(need
) << " want " << ccap_string(want
) << dendl
;
3591 waitfor_caps
= true;
3594 if ((need
& CEPH_CAP_FILE_WR
) && in
->auth_cap
&&
3595 in
->auth_cap
->session
->readonly
)
3596 return -CEPHFS_EROFS
;
3598 if (in
->flags
& I_CAP_DROPPED
) {
3599 int mds_wanted
= in
->caps_mds_wanted();
3600 if ((mds_wanted
& need
) != need
) {
3601 int ret
= _renew_caps(in
);
3606 if (!(file_wanted
& ~mds_wanted
))
3607 in
->flags
&= ~I_CAP_DROPPED
;
3611 wait_on_list(in
->waitfor_caps
);
3612 else if (waitfor_commit
)
3613 wait_on_list(in
->waitfor_commit
);
3617 int Client::get_caps_used(Inode
*in
)
3619 unsigned used
= in
->caps_used();
3620 if (!(used
& CEPH_CAP_FILE_CACHE
) &&
3621 !objectcacher
->set_is_empty(&in
->oset
))
3622 used
|= CEPH_CAP_FILE_CACHE
;
3626 void Client::cap_delay_requeue(Inode
*in
)
3628 ldout(cct
, 10) << __func__
<< " on " << *in
<< dendl
;
3630 in
->hold_caps_until
= ceph::coarse_mono_clock::now() + caps_release_delay
;
3631 delayed_list
.push_back(&in
->delay_cap_item
);
3634 void Client::send_cap(Inode
*in
, MetaSession
*session
, Cap
*cap
,
3635 int flags
, int used
, int want
, int retain
,
3636 int flush
, ceph_tid_t flush_tid
)
3638 int held
= cap
->issued
| cap
->implemented
;
3639 int revoking
= cap
->implemented
& ~cap
->issued
;
3640 retain
&= ~revoking
;
3641 int dropping
= cap
->issued
& ~retain
;
3642 int op
= CEPH_CAP_OP_UPDATE
;
3644 ldout(cct
, 10) << __func__
<< " " << *in
3645 << " mds." << session
->mds_num
<< " seq " << cap
->seq
3646 << " used " << ccap_string(used
)
3647 << " want " << ccap_string(want
)
3648 << " flush " << ccap_string(flush
)
3649 << " retain " << ccap_string(retain
)
3650 << " held "<< ccap_string(held
)
3651 << " revoking " << ccap_string(revoking
)
3652 << " dropping " << ccap_string(dropping
)
3655 if (cct
->_conf
->client_inject_release_failure
&& revoking
) {
3656 const int would_have_issued
= cap
->issued
& retain
;
3657 const int would_have_implemented
= cap
->implemented
& (cap
->issued
| used
);
3659 // - tell the server we think issued is whatever they issued plus whatever we implemented
3660 // - leave what we have implemented in place
3661 ldout(cct
, 20) << __func__
<< " injecting failure to release caps" << dendl
;
3662 cap
->issued
= cap
->issued
| cap
->implemented
;
3664 // Make an exception for revoking xattr caps: we are injecting
3665 // failure to release other caps, but allow xattr because client
3666 // will block on xattr ops if it can't release these to MDS (#9800)
3667 const int xattr_mask
= CEPH_CAP_XATTR_SHARED
| CEPH_CAP_XATTR_EXCL
;
3668 cap
->issued
^= xattr_mask
& revoking
;
3669 cap
->implemented
^= xattr_mask
& revoking
;
3671 ldout(cct
, 20) << __func__
<< " issued " << ccap_string(cap
->issued
) << " vs " << ccap_string(would_have_issued
) << dendl
;
3672 ldout(cct
, 20) << __func__
<< " implemented " << ccap_string(cap
->implemented
) << " vs " << ccap_string(would_have_implemented
) << dendl
;
3675 cap
->issued
&= retain
;
3676 cap
->implemented
&= cap
->issued
| used
;
3679 snapid_t follows
= 0;
3682 follows
= in
->snaprealm
->get_snap_context().seq
;
3684 auto m
= make_message
<MClientCaps
>(op
,
3687 cap
->cap_id
, cap
->seq
,
3693 m
->caller_uid
= in
->cap_dirtier_uid
;
3694 m
->caller_gid
= in
->cap_dirtier_gid
;
3696 m
->head
.issue_seq
= cap
->issue_seq
;
3697 m
->set_tid(flush_tid
);
3699 m
->head
.uid
= in
->uid
;
3700 m
->head
.gid
= in
->gid
;
3701 m
->head
.mode
= in
->mode
;
3703 m
->head
.nlink
= in
->nlink
;
3705 if (flush
& CEPH_CAP_XATTR_EXCL
) {
3706 encode(in
->xattrs
, m
->xattrbl
);
3707 m
->head
.xattr_version
= in
->xattr_version
;
3711 m
->max_size
= in
->max_size
;
3712 m
->truncate_seq
= in
->truncate_seq
;
3713 m
->truncate_size
= in
->truncate_size
;
3714 m
->mtime
= in
->mtime
;
3715 m
->atime
= in
->atime
;
3716 m
->ctime
= in
->ctime
;
3717 m
->btime
= in
->btime
;
3718 m
->time_warp_seq
= in
->time_warp_seq
;
3719 m
->change_attr
= in
->change_attr
;
3721 if (!(flags
& MClientCaps::FLAG_PENDING_CAPSNAP
) &&
3722 !in
->cap_snaps
.empty() &&
3723 in
->cap_snaps
.rbegin()->second
.flush_tid
== 0)
3724 flags
|= MClientCaps::FLAG_PENDING_CAPSNAP
;
3727 if (flush
& CEPH_CAP_FILE_WR
) {
3728 m
->inline_version
= in
->inline_version
;
3729 m
->inline_data
= in
->inline_data
;
3732 in
->reported_size
= in
->size
;
3733 m
->set_snap_follows(follows
);
3735 if (cap
== in
->auth_cap
) {
3736 if (want
& CEPH_CAP_ANY_FILE_WR
) {
3737 m
->set_max_size(in
->wanted_max_size
);
3738 in
->requested_max_size
= in
->wanted_max_size
;
3739 ldout(cct
, 15) << "auth cap, requesting max_size " << in
->requested_max_size
<< dendl
;
3741 in
->requested_max_size
= 0;
3742 ldout(cct
, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl
;
3746 if (!session
->flushing_caps_tids
.empty())
3747 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
3749 session
->con
->send_message2(std::move(m
));
3752 static bool is_max_size_approaching(Inode
*in
)
3754 /* mds will adjust max size according to the reported size */
3755 if (in
->flushing_caps
& CEPH_CAP_FILE_WR
)
3757 if (in
->size
>= in
->max_size
)
3759 /* half of previous max_size increment has been used */
3760 if (in
->max_size
> in
->reported_size
&&
3761 (in
->size
<< 1) >= in
->max_size
+ in
->reported_size
)
3766 static int adjust_caps_used_for_lazyio(int used
, int issued
, int implemented
)
3768 if (!(used
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
)))
3770 if (!(implemented
& CEPH_CAP_FILE_LAZYIO
))
3773 if (issued
& CEPH_CAP_FILE_LAZYIO
) {
3774 if (!(issued
& CEPH_CAP_FILE_CACHE
)) {
3775 used
&= ~CEPH_CAP_FILE_CACHE
;
3776 used
|= CEPH_CAP_FILE_LAZYIO
;
3778 if (!(issued
& CEPH_CAP_FILE_BUFFER
)) {
3779 used
&= ~CEPH_CAP_FILE_BUFFER
;
3780 used
|= CEPH_CAP_FILE_LAZYIO
;
3783 if (!(implemented
& CEPH_CAP_FILE_CACHE
)) {
3784 used
&= ~CEPH_CAP_FILE_CACHE
;
3785 used
|= CEPH_CAP_FILE_LAZYIO
;
3787 if (!(implemented
& CEPH_CAP_FILE_BUFFER
)) {
3788 used
&= ~CEPH_CAP_FILE_BUFFER
;
3789 used
|= CEPH_CAP_FILE_LAZYIO
;
3798 * Examine currently used and wanted versus held caps. Release, flush or ack
3799 * revoked caps to the MDS as appropriate.
3801 * @param in the inode to check
3802 * @param flags flags to apply to cap check
3804 void Client::check_caps(Inode
*in
, unsigned flags
)
3806 unsigned wanted
= in
->caps_wanted();
3807 unsigned used
= get_caps_used(in
);
3811 int issued
= in
->caps_issued(&implemented
);
3812 int revoking
= implemented
& ~issued
;
3814 int orig_used
= used
;
3815 used
= adjust_caps_used_for_lazyio(used
, issued
, implemented
);
3817 int retain
= wanted
| used
| CEPH_CAP_PIN
;
3818 if (!is_unmounting() && in
->nlink
> 0) {
3820 retain
|= CEPH_CAP_ANY
;
3821 } else if (in
->is_dir() &&
3822 (issued
& CEPH_CAP_FILE_SHARED
) &&
3823 (in
->flags
& I_COMPLETE
)) {
3824 // we do this here because we don't want to drop to Fs (and then
3825 // drop the Fs if we do a create!) if that alone makes us send lookups
3826 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3827 wanted
= CEPH_CAP_ANY_SHARED
| CEPH_CAP_FILE_EXCL
;
3830 retain
|= CEPH_CAP_ANY_SHARED
;
3831 // keep RD only if we didn't have the file open RW,
3832 // because then the mds would revoke it anyway to
3833 // journal max_size=0.
3834 if (in
->max_size
== 0)
3835 retain
|= CEPH_CAP_ANY_RD
;
3839 ldout(cct
, 10) << __func__
<< " on " << *in
3840 << " wanted " << ccap_string(wanted
)
3841 << " used " << ccap_string(used
)
3842 << " issued " << ccap_string(issued
)
3843 << " revoking " << ccap_string(revoking
)
3844 << " flags=" << flags
3847 if (in
->snapid
!= CEPH_NOSNAP
)
3848 return; //snap caps last forever, can't write
3850 if (in
->caps
.empty())
3851 return; // guard if at end of func
3853 if (!(orig_used
& CEPH_CAP_FILE_BUFFER
) &&
3854 (revoking
& used
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
))) {
3856 used
&= ~(CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
);
3859 for (auto &[mds
, cap
] : in
->caps
) {
3860 auto session
= mds_sessions
.at(mds
);
3863 if (in
->auth_cap
&& &cap
!= in
->auth_cap
)
3864 cap_used
&= ~in
->auth_cap
->issued
;
3866 revoking
= cap
.implemented
& ~cap
.issued
;
3868 ldout(cct
, 10) << " cap mds." << mds
3869 << " issued " << ccap_string(cap
.issued
)
3870 << " implemented " << ccap_string(cap
.implemented
)
3871 << " revoking " << ccap_string(revoking
) << dendl
;
3873 if (in
->wanted_max_size
> in
->max_size
&&
3874 in
->wanted_max_size
> in
->requested_max_size
&&
3875 &cap
== in
->auth_cap
)
3878 /* approaching file_max? */
3879 if ((cap
.issued
& CEPH_CAP_FILE_WR
) &&
3880 &cap
== in
->auth_cap
&&
3881 is_max_size_approaching(in
)) {
3882 ldout(cct
, 10) << "size " << in
->size
<< " approaching max_size " << in
->max_size
3883 << ", reported " << in
->reported_size
<< dendl
;
3887 /* completed revocation? */
3888 if (revoking
&& (revoking
& cap_used
) == 0) {
3889 ldout(cct
, 10) << "completed revocation of " << ccap_string(cap
.implemented
& ~cap
.issued
) << dendl
;
3893 /* want more caps from mds? */
3894 if (wanted
& ~(cap
.wanted
| cap
.issued
))
3897 if (!revoking
&& is_unmounting() && (cap_used
== 0))
3900 if ((cap
.issued
& ~retain
) == 0 && // and we don't have anything we wouldn't like
3901 !in
->dirty_caps
) // and we have no dirty caps
3904 if (!(flags
& CHECK_CAPS_NODELAY
)) {
3905 ldout(cct
, 10) << "delaying cap release" << dendl
;
3906 cap_delay_requeue(in
);
3911 if (&cap
== in
->auth_cap
) {
3912 if (in
->flags
& I_KICK_FLUSH
) {
3913 ldout(cct
, 20) << " reflushing caps (check_caps) on " << *in
3914 << " to mds." << mds
<< dendl
;
3915 kick_flushing_caps(in
, session
.get());
3917 if (!in
->cap_snaps
.empty() &&
3918 in
->cap_snaps
.rbegin()->second
.flush_tid
== 0)
3924 ceph_tid_t flush_tid
;
3925 if (in
->auth_cap
== &cap
&& in
->dirty_caps
) {
3926 flushing
= mark_caps_flushing(in
, &flush_tid
);
3927 if (flags
& CHECK_CAPS_SYNCHRONOUS
)
3928 msg_flags
|= MClientCaps::FLAG_SYNC
;
3934 in
->delay_cap_item
.remove_myself();
3935 send_cap(in
, session
.get(), &cap
, msg_flags
, cap_used
, wanted
, retain
,
3936 flushing
, flush_tid
);
3941 void Client::queue_cap_snap(Inode
*in
, SnapContext
& old_snapc
)
3943 int used
= get_caps_used(in
);
3944 int dirty
= in
->caps_dirty();
3945 ldout(cct
, 10) << __func__
<< " " << *in
<< " snapc " << old_snapc
<< " used " << ccap_string(used
) << dendl
;
3947 if (in
->cap_snaps
.size() &&
3948 in
->cap_snaps
.rbegin()->second
.writing
) {
3949 ldout(cct
, 10) << __func__
<< " already have pending cap_snap on " << *in
<< dendl
;
3951 } else if (in
->caps_dirty() ||
3952 (used
& CEPH_CAP_FILE_WR
) ||
3953 (dirty
& CEPH_CAP_ANY_WR
)) {
3954 const auto &capsnapem
= in
->cap_snaps
.emplace(std::piecewise_construct
, std::make_tuple(old_snapc
.seq
), std::make_tuple(in
));
3955 ceph_assert(capsnapem
.second
); /* element inserted */
3956 CapSnap
&capsnap
= capsnapem
.first
->second
;
3957 capsnap
.context
= old_snapc
;
3958 capsnap
.issued
= in
->caps_issued();
3959 capsnap
.dirty
= in
->caps_dirty();
3961 capsnap
.dirty_data
= (used
& CEPH_CAP_FILE_BUFFER
);
3963 capsnap
.uid
= in
->uid
;
3964 capsnap
.gid
= in
->gid
;
3965 capsnap
.mode
= in
->mode
;
3966 capsnap
.btime
= in
->btime
;
3967 capsnap
.xattrs
= in
->xattrs
;
3968 capsnap
.xattr_version
= in
->xattr_version
;
3969 capsnap
.cap_dirtier_uid
= in
->cap_dirtier_uid
;
3970 capsnap
.cap_dirtier_gid
= in
->cap_dirtier_gid
;
3972 if (used
& CEPH_CAP_FILE_WR
) {
3973 ldout(cct
, 10) << __func__
<< " WR used on " << *in
<< dendl
;
3974 capsnap
.writing
= 1;
3976 finish_cap_snap(in
, capsnap
, used
);
3979 ldout(cct
, 10) << __func__
<< " not dirty|writing on " << *in
<< dendl
;
3983 void Client::finish_cap_snap(Inode
*in
, CapSnap
&capsnap
, int used
)
3985 ldout(cct
, 10) << __func__
<< " " << *in
<< " capsnap " << (void *)&capsnap
<< " used " << ccap_string(used
) << dendl
;
3986 capsnap
.size
= in
->size
;
3987 capsnap
.mtime
= in
->mtime
;
3988 capsnap
.atime
= in
->atime
;
3989 capsnap
.ctime
= in
->ctime
;
3990 capsnap
.time_warp_seq
= in
->time_warp_seq
;
3991 capsnap
.change_attr
= in
->change_attr
;
3992 capsnap
.dirty
|= in
->caps_dirty();
3994 /* Only reset it if it wasn't set before */
3995 if (capsnap
.cap_dirtier_uid
== -1) {
3996 capsnap
.cap_dirtier_uid
= in
->cap_dirtier_uid
;
3997 capsnap
.cap_dirtier_gid
= in
->cap_dirtier_gid
;
4000 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
4001 capsnap
.inline_data
= in
->inline_data
;
4002 capsnap
.inline_version
= in
->inline_version
;
4005 if (used
& CEPH_CAP_FILE_BUFFER
) {
4006 capsnap
.writing
= 1;
4007 ldout(cct
, 10) << __func__
<< " " << *in
<< " cap_snap " << &capsnap
<< " used " << used
4008 << " WRBUFFER, delaying" << dendl
;
4010 capsnap
.dirty_data
= 0;
4015 void Client::send_flush_snap(Inode
*in
, MetaSession
*session
,
4016 snapid_t follows
, CapSnap
& capsnap
)
4018 auto m
= make_message
<MClientCaps
>(CEPH_CAP_OP_FLUSHSNAP
,
4019 in
->ino
, in
->snaprealm
->ino
, 0,
4020 in
->auth_cap
->mseq
, cap_epoch_barrier
);
4021 m
->caller_uid
= capsnap
.cap_dirtier_uid
;
4022 m
->caller_gid
= capsnap
.cap_dirtier_gid
;
4024 m
->set_client_tid(capsnap
.flush_tid
);
4025 m
->head
.snap_follows
= follows
;
4027 m
->head
.caps
= capsnap
.issued
;
4028 m
->head
.dirty
= capsnap
.dirty
;
4030 m
->head
.uid
= capsnap
.uid
;
4031 m
->head
.gid
= capsnap
.gid
;
4032 m
->head
.mode
= capsnap
.mode
;
4033 m
->btime
= capsnap
.btime
;
4035 m
->size
= capsnap
.size
;
4037 m
->head
.xattr_version
= capsnap
.xattr_version
;
4038 encode(capsnap
.xattrs
, m
->xattrbl
);
4040 m
->ctime
= capsnap
.ctime
;
4041 m
->btime
= capsnap
.btime
;
4042 m
->mtime
= capsnap
.mtime
;
4043 m
->atime
= capsnap
.atime
;
4044 m
->time_warp_seq
= capsnap
.time_warp_seq
;
4045 m
->change_attr
= capsnap
.change_attr
;
4047 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
4048 m
->inline_version
= in
->inline_version
;
4049 m
->inline_data
= in
->inline_data
;
4052 ceph_assert(!session
->flushing_caps_tids
.empty());
4053 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
4055 session
->con
->send_message2(std::move(m
));
4058 void Client::flush_snaps(Inode
*in
)
4060 ldout(cct
, 10) << "flush_snaps on " << *in
<< dendl
;
4061 ceph_assert(in
->cap_snaps
.size());
4064 ceph_assert(in
->auth_cap
);
4065 MetaSession
*session
= in
->auth_cap
->session
;
4067 for (auto &p
: in
->cap_snaps
) {
4068 CapSnap
&capsnap
= p
.second
;
4069 // only do new flush
4070 if (capsnap
.flush_tid
> 0)
4073 ldout(cct
, 10) << "flush_snaps mds." << session
->mds_num
4074 << " follows " << p
.first
4075 << " size " << capsnap
.size
4076 << " mtime " << capsnap
.mtime
4077 << " dirty_data=" << capsnap
.dirty_data
4078 << " writing=" << capsnap
.writing
4079 << " on " << *in
<< dendl
;
4080 if (capsnap
.dirty_data
|| capsnap
.writing
)
4083 capsnap
.flush_tid
= ++last_flush_tid
;
4084 session
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
4085 in
->flushing_cap_tids
[capsnap
.flush_tid
] = 0;
4086 if (!in
->flushing_cap_item
.is_on_list())
4087 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4089 send_flush_snap(in
, session
, p
.first
, capsnap
);
4093 void Client::wait_on_list(list
<ceph::condition_variable
*>& ls
)
4095 ceph::condition_variable cond
;
4096 ls
.push_back(&cond
);
4097 std::unique_lock l
{client_lock
, std::adopt_lock
};
4103 void Client::signal_cond_list(list
<ceph::condition_variable
*>& ls
)
4105 for (auto cond
: ls
) {
4110 void Client::wait_on_context_list(list
<Context
*>& ls
)
4112 ceph::condition_variable cond
;
4115 ls
.push_back(new C_Cond(cond
, &done
, &r
));
4116 std::unique_lock l
{client_lock
, std::adopt_lock
};
4117 cond
.wait(l
, [&done
] { return done
;});
4121 void Client::signal_context_list(list
<Context
*>& ls
)
4123 while (!ls
.empty()) {
4124 ls
.front()->complete(0);
4129 void Client::wake_up_session_caps(MetaSession
*s
, bool reconnect
)
4131 for (const auto &cap
: s
->caps
) {
4132 auto &in
= cap
->inode
;
4134 in
.requested_max_size
= 0;
4135 in
.wanted_max_size
= 0;
4137 if (cap
->gen
< s
->cap_gen
) {
4138 // mds did not re-issue stale cap.
4139 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
4140 // make sure mds knows what we want.
4141 if (in
.caps_file_wanted() & ~cap
->wanted
)
4142 in
.flags
|= I_CAP_DROPPED
;
4145 signal_cond_list(in
.waitfor_caps
);
4150 // flush dirty data (from objectcache)
4152 class C_Client_CacheInvalidate
: public Context
{
4156 int64_t offset
, length
;
4158 C_Client_CacheInvalidate(Client
*c
, Inode
*in
, int64_t off
, int64_t len
) :
4159 client(c
), offset(off
), length(len
) {
4160 if (client
->use_faked_inos())
4161 ino
= vinodeno_t(in
->faked_ino
, CEPH_NOSNAP
);
4165 void finish(int r
) override
{
4166 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
4167 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
4168 client
->_async_invalidate(ino
, offset
, length
);
4172 void Client::_async_invalidate(vinodeno_t ino
, int64_t off
, int64_t len
)
4174 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
4175 if (!mref_reader
.is_state_satisfied())
4178 ldout(cct
, 10) << __func__
<< " " << ino
<< " " << off
<< "~" << len
<< dendl
;
4179 ino_invalidate_cb(callback_handle
, ino
, off
, len
);
4182 void Client::_schedule_invalidate_callback(Inode
*in
, int64_t off
, int64_t len
) {
4184 if (ino_invalidate_cb
)
4185 // we queue the invalidate, which calls the callback and decrements the ref
4186 async_ino_invalidator
.queue(new C_Client_CacheInvalidate(this, in
, off
, len
));
4189 void Client::_invalidate_inode_cache(Inode
*in
)
4191 ldout(cct
, 10) << __func__
<< " " << *in
<< dendl
;
4193 // invalidate our userspace inode cache
4194 if (cct
->_conf
->client_oc
) {
4195 objectcacher
->release_set(&in
->oset
);
4196 if (!objectcacher
->set_is_empty(&in
->oset
))
4197 lderr(cct
) << "failed to invalidate cache for " << *in
<< dendl
;
4200 _schedule_invalidate_callback(in
, 0, 0);
4203 void Client::_invalidate_inode_cache(Inode
*in
, int64_t off
, int64_t len
)
4205 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
4207 // invalidate our userspace inode cache
4208 if (cct
->_conf
->client_oc
) {
4209 vector
<ObjectExtent
> ls
;
4210 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, len
, in
->truncate_size
, ls
);
4211 objectcacher
->discard_writeback(&in
->oset
, ls
, nullptr);
4214 _schedule_invalidate_callback(in
, off
, len
);
4217 bool Client::_release(Inode
*in
)
4219 ldout(cct
, 20) << "_release " << *in
<< dendl
;
4220 if (in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
4221 _invalidate_inode_cache(in
);
4227 bool Client::_flush(Inode
*in
, Context
*onfinish
)
4229 ldout(cct
, 10) << "_flush " << *in
<< dendl
;
4231 if (!in
->oset
.dirty_or_tx
) {
4232 ldout(cct
, 10) << " nothing to flush" << dendl
;
4233 onfinish
->complete(0);
4237 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
4238 ldout(cct
, 8) << __func__
<< ": FULL, purging for ENOSPC" << dendl
;
4239 objectcacher
->purge_set(&in
->oset
);
4241 onfinish
->complete(-CEPHFS_ENOSPC
);
4246 return objectcacher
->flush_set(&in
->oset
, onfinish
);
4249 void Client::_flush_range(Inode
*in
, int64_t offset
, uint64_t size
)
4251 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
4252 if (!in
->oset
.dirty_or_tx
) {
4253 ldout(cct
, 10) << " nothing to flush" << dendl
;
4257 C_SaferCond
onflush("Client::_flush_range flock");
4258 bool ret
= objectcacher
->file_flush(&in
->oset
, &in
->layout
, in
->snaprealm
->get_snap_context(),
4259 offset
, size
, &onflush
);
4262 client_lock
.unlock();
4268 void Client::flush_set_callback(ObjectCacher::ObjectSet
*oset
)
4270 // std::scoped_lock l(client_lock);
4271 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
)); // will be called via dispatch() -> objecter -> ...
4272 Inode
*in
= static_cast<Inode
*>(oset
->parent
);
4277 void Client::_flushed(Inode
*in
)
4279 ldout(cct
, 10) << "_flushed " << *in
<< dendl
;
4281 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
4286 // checks common to add_update_cap, handle_cap_grant
4287 void Client::check_cap_issue(Inode
*in
, unsigned issued
)
4289 unsigned had
= in
->caps_issued();
4291 if ((issued
& CEPH_CAP_FILE_CACHE
) &&
4292 !(had
& CEPH_CAP_FILE_CACHE
))
4295 if ((issued
& CEPH_CAP_FILE_SHARED
) !=
4296 (had
& CEPH_CAP_FILE_SHARED
)) {
4297 if (issued
& CEPH_CAP_FILE_SHARED
)
4300 clear_dir_complete_and_ordered(in
, true);
4304 void Client::add_update_cap(Inode
*in
, MetaSession
*mds_session
, uint64_t cap_id
,
4305 unsigned issued
, unsigned wanted
, unsigned seq
, unsigned mseq
,
4306 inodeno_t realm
, int flags
, const UserPerm
& cap_perms
)
4308 if (!in
->is_any_caps()) {
4309 ceph_assert(in
->snaprealm
== 0);
4310 in
->snaprealm
= get_snap_realm(realm
);
4311 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4312 ldout(cct
, 15) << __func__
<< " first one, opened snaprealm " << in
->snaprealm
<< dendl
;
4314 ceph_assert(in
->snaprealm
);
4315 if ((flags
& CEPH_CAP_FLAG_AUTH
) &&
4316 realm
!= inodeno_t(-1) && in
->snaprealm
->ino
!= realm
) {
4317 in
->snaprealm_item
.remove_myself();
4318 auto oldrealm
= in
->snaprealm
;
4319 in
->snaprealm
= get_snap_realm(realm
);
4320 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4321 put_snap_realm(oldrealm
);
4325 mds_rank_t mds
= mds_session
->mds_num
;
4326 const auto &capem
= in
->caps
.emplace(std::piecewise_construct
, std::forward_as_tuple(mds
), std::forward_as_tuple(*in
, mds_session
));
4327 Cap
&cap
= capem
.first
->second
;
4328 if (!capem
.second
) {
4329 if (cap
.gen
< mds_session
->cap_gen
)
4330 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
4333 * auth mds of the inode changed. we received the cap export
4334 * message, but still haven't received the cap import message.
4335 * handle_cap_export() updated the new auth MDS' cap.
4337 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4338 * a message that was send before the cap import message. So
4339 * don't remove caps.
4341 if (ceph_seq_cmp(seq
, cap
.seq
) <= 0) {
4342 if (&cap
!= in
->auth_cap
)
4343 ldout(cct
, 0) << "WARNING: " << "inode " << *in
<< " caps on mds." << mds
<< " != auth_cap." << dendl
;
4345 ceph_assert(cap
.cap_id
== cap_id
);
4348 issued
|= cap
.issued
;
4349 flags
|= CEPH_CAP_FLAG_AUTH
;
4355 check_cap_issue(in
, issued
);
4357 if (flags
& CEPH_CAP_FLAG_AUTH
) {
4358 if (in
->auth_cap
!= &cap
&&
4359 (!in
->auth_cap
|| ceph_seq_cmp(in
->auth_cap
->mseq
, mseq
) < 0)) {
4360 if (in
->auth_cap
&& in
->flushing_cap_item
.is_on_list()) {
4361 ldout(cct
, 10) << __func__
<< " changing auth cap: "
4362 << "add myself to new auth MDS' flushing caps list" << dendl
;
4363 adjust_session_flushing_caps(in
, in
->auth_cap
->session
, mds_session
);
4365 in
->auth_cap
= &cap
;
4369 unsigned old_caps
= cap
.issued
;
4370 cap
.cap_id
= cap_id
;
4371 cap
.issued
= issued
;
4372 cap
.implemented
|= issued
;
4373 if (ceph_seq_cmp(mseq
, cap
.mseq
) > 0)
4374 cap
.wanted
= wanted
;
4376 cap
.wanted
|= wanted
;
4378 cap
.issue_seq
= seq
;
4380 cap
.gen
= mds_session
->cap_gen
;
4381 cap
.latest_perms
= cap_perms
;
4382 ldout(cct
, 10) << __func__
<< " issued " << ccap_string(old_caps
) << " -> " << ccap_string(cap
.issued
)
4383 << " from mds." << mds
4387 if ((issued
& ~old_caps
) && in
->auth_cap
== &cap
) {
4388 // non-auth MDS is revoking the newly grant caps ?
4389 for (auto &p
: in
->caps
) {
4390 if (&p
.second
== &cap
)
4392 if (p
.second
.implemented
& ~p
.second
.issued
& issued
) {
4393 check_caps(in
, CHECK_CAPS_NODELAY
);
4399 if (issued
& ~old_caps
)
4400 signal_cond_list(in
->waitfor_caps
);
4403 void Client::remove_cap(Cap
*cap
, bool queue_release
)
4405 auto &in
= cap
->inode
;
4406 MetaSession
*session
= cap
->session
;
4407 mds_rank_t mds
= cap
->session
->mds_num
;
4409 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " on " << in
<< dendl
;
4411 if (queue_release
) {
4412 session
->enqueue_cap_release(
4423 if (in
.auth_cap
== cap
) {
4424 if (in
.flushing_cap_item
.is_on_list()) {
4425 ldout(cct
, 10) << " removing myself from flushing_cap list" << dendl
;
4426 in
.flushing_cap_item
.remove_myself();
4430 size_t n
= in
.caps
.erase(mds
);
4431 ceph_assert(n
== 1);
4434 if (!in
.is_any_caps()) {
4435 ldout(cct
, 15) << __func__
<< " last one, closing snaprealm " << in
.snaprealm
<< dendl
;
4436 in
.snaprealm_item
.remove_myself();
4437 put_snap_realm(in
.snaprealm
);
4442 void Client::remove_all_caps(Inode
*in
)
4444 while (!in
->caps
.empty())
4445 remove_cap(&in
->caps
.begin()->second
, true);
4448 void Client::remove_session_caps(MetaSession
*s
, int err
)
4450 ldout(cct
, 10) << __func__
<< " mds." << s
->mds_num
<< dendl
;
4452 while (s
->caps
.size()) {
4453 Cap
*cap
= *s
->caps
.begin();
4454 InodeRef
in(&cap
->inode
);
4455 bool dirty_caps
= false;
4456 if (in
->auth_cap
== cap
) {
4457 dirty_caps
= in
->dirty_caps
| in
->flushing_caps
;
4458 in
->wanted_max_size
= 0;
4459 in
->requested_max_size
= 0;
4460 if (in
->has_any_filelocks())
4461 in
->flags
|= I_ERROR_FILELOCK
;
4463 auto caps
= cap
->implemented
;
4464 if (cap
->wanted
| cap
->issued
)
4465 in
->flags
|= I_CAP_DROPPED
;
4466 remove_cap(cap
, false);
4467 in
->cap_snaps
.clear();
4469 lderr(cct
) << __func__
<< " still has dirty|flushing caps on " << *in
<< dendl
;
4470 if (in
->flushing_caps
) {
4471 num_flushing_caps
--;
4472 in
->flushing_cap_tids
.clear();
4474 in
->flushing_caps
= 0;
4475 in
->mark_caps_clean();
4476 put_inode(in
.get());
4478 caps
&= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
;
4479 if (caps
&& !in
->caps_issued_mask(caps
, true)) {
4480 if (err
== -CEPHFS_EBLOCKLISTED
) {
4481 if (in
->oset
.dirty_or_tx
) {
4482 lderr(cct
) << __func__
<< " still has dirty data on " << *in
<< dendl
;
4483 in
->set_async_err(err
);
4485 objectcacher
->purge_set(&in
->oset
);
4487 objectcacher
->release_set(&in
->oset
);
4489 _schedule_invalidate_callback(in
.get(), 0, 0);
4492 signal_cond_list(in
->waitfor_caps
);
4494 s
->flushing_caps_tids
.clear();
4495 sync_cond
.notify_all();
4498 std::pair
<int, bool> Client::_do_remount(bool retry_on_error
)
4500 uint64_t max_retries
= cct
->_conf
.get_val
<uint64_t>("mds_max_retries_on_remount_failure");
4501 bool abort_on_failure
= false;
4504 int r
= remount_cb(callback_handle
);
4506 retries_on_invalidate
= 0;
4509 client_t whoami
= get_nodeid();
4512 "failed to remount (to trim kernel dentries): "
4513 "errno = " << e
<< " (" << strerror(e
) << ")" << dendl
;
4516 "failed to remount (to trim kernel dentries): "
4517 "return code = " << r
<< dendl
;
4520 (cct
->_conf
.get_val
<bool>("client_die_on_failed_remount") ||
4521 cct
->_conf
.get_val
<bool>("client_die_on_failed_dentry_invalidate")) &&
4522 !(retry_on_error
&& (++retries_on_invalidate
< max_retries
));
4523 if (should_abort
&& !is_unmounting()) {
4524 lderr(cct
) << "failed to remount for kernel dentry trimming; quitting!" << dendl
;
4525 abort_on_failure
= true;
4528 return std::make_pair(r
, abort_on_failure
);
4531 class C_Client_Remount
: public Context
{
4535 explicit C_Client_Remount(Client
*c
) : client(c
) {}
4536 void finish(int r
) override
{
4537 ceph_assert(r
== 0);
4538 client
->_do_remount(true);
4542 void Client::_invalidate_kernel_dcache()
4544 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
4545 if (!mref_reader
.is_state_satisfied())
4548 if (can_invalidate_dentries
) {
4549 if (dentry_invalidate_cb
&& root
->dir
) {
4550 for (ceph::unordered_map
<string
, Dentry
*>::iterator p
= root
->dir
->dentries
.begin();
4551 p
!= root
->dir
->dentries
.end();
4553 if (p
->second
->inode
)
4554 _schedule_invalidate_dentry_callback(p
->second
, false);
4557 } else if (remount_cb
) {
4559 // when remounting a file system, linux kernel trims all unused dentries in the fs
4560 remount_finisher
.queue(new C_Client_Remount(this));
4564 void Client::_trim_negative_child_dentries(InodeRef
& in
)
4570 if (dir
&& dir
->dentries
.size() == dir
->num_null_dentries
) {
4571 for (auto p
= dir
->dentries
.begin(); p
!= dir
->dentries
.end(); ) {
4572 Dentry
*dn
= p
->second
;
4574 ceph_assert(!dn
->inode
);
4575 if (dn
->lru_is_expireable())
4576 unlink(dn
, true, false); // keep dir, drop dentry
4578 if (dir
->dentries
.empty()) {
4583 if (in
->flags
& I_SNAPDIR_OPEN
) {
4584 InodeRef snapdir
= open_snapdir(in
.get());
4585 _trim_negative_child_dentries(snapdir
);
4589 class C_Client_CacheRelease
: public Context
{
4594 C_Client_CacheRelease(Client
*c
, Inode
*in
) :
4596 if (client
->use_faked_inos())
4597 ino
= vinodeno_t(in
->faked_ino
, CEPH_NOSNAP
);
4601 void finish(int r
) override
{
4602 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
4603 client
->_async_inode_release(ino
);
4607 void Client::_async_inode_release(vinodeno_t ino
)
4609 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
4610 if (!mref_reader
.is_state_satisfied())
4613 ldout(cct
, 10) << __func__
<< " " << ino
<< dendl
;
4614 ino_release_cb(callback_handle
, ino
);
4617 void Client::_schedule_ino_release_callback(Inode
*in
) {
4620 // we queue the invalidate, which calls the callback and decrements the ref
4621 async_ino_releasor
.queue(new C_Client_CacheRelease(this, in
));
4624 void Client::trim_caps(MetaSession
*s
, uint64_t max
)
4626 mds_rank_t mds
= s
->mds_num
;
4627 size_t caps_size
= s
->caps
.size();
4628 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " max " << max
4629 << " caps " << caps_size
<< dendl
;
4631 uint64_t trimmed
= 0;
4632 auto p
= s
->caps
.begin();
4633 std::set
<Dentry
*> to_trim
; /* this avoids caps other than the one we're
4634 * looking at from getting deleted during traversal. */
4635 while ((caps_size
- trimmed
) > max
&& !p
.end()) {
4637 InodeRef
in(&cap
->inode
);
4639 // Increment p early because it will be invalidated if cap
4640 // is deleted inside remove_cap
4643 if (in
->caps
.size() > 1 && cap
!= in
->auth_cap
) {
4644 int mine
= cap
->issued
| cap
->implemented
;
4645 int oissued
= in
->auth_cap
? in
->auth_cap
->issued
: 0;
4646 // disposable non-auth cap
4647 if (!(get_caps_used(in
.get()) & ~oissued
& mine
)) {
4648 ldout(cct
, 20) << " removing unused, unneeded non-auth cap on " << *in
<< dendl
;
4649 cap
= (remove_cap(cap
, true), nullptr);
4653 ldout(cct
, 20) << " trying to trim dentries for " << *in
<< dendl
;
4654 _trim_negative_child_dentries(in
);
4656 auto q
= in
->dentries
.begin();
4657 while (q
!= in
->dentries
.end()) {
4660 if (dn
->lru_is_expireable()) {
4661 if (can_invalidate_dentries
&&
4662 dn
->dir
->parent_inode
->ino
== CEPH_INO_ROOT
) {
4663 // Only issue one of these per DN for inodes in root: handle
4664 // others more efficiently by calling for root-child DNs at
4665 // the end of this function.
4666 _schedule_invalidate_dentry_callback(dn
, true);
4668 ldout(cct
, 20) << " queueing dentry for trimming: " << dn
->name
<< dendl
;
4671 ldout(cct
, 20) << " not expirable: " << dn
->name
<< dendl
;
4675 if (in
->ll_ref
== 1 && in
->ino
!= CEPH_INO_ROOT
) {
4676 _schedule_ino_release_callback(in
.get());
4678 if (all
&& in
->ino
!= CEPH_INO_ROOT
) {
4679 ldout(cct
, 20) << __func__
<< " counting as trimmed: " << *in
<< dendl
;
4684 ldout(cct
, 20) << " trimming queued dentries: " << dendl
;
4685 for (const auto &dn
: to_trim
) {
4690 caps_size
= s
->caps
.size();
4691 if (caps_size
> (size_t)max
)
4692 _invalidate_kernel_dcache();
4695 void Client::force_session_readonly(MetaSession
*s
)
4698 for (xlist
<Cap
*>::iterator p
= s
->caps
.begin(); !p
.end(); ++p
) {
4699 auto &in
= (*p
)->inode
;
4700 if (in
.caps_wanted() & CEPH_CAP_FILE_WR
)
4701 signal_cond_list(in
.waitfor_caps
);
4705 int Client::mark_caps_flushing(Inode
*in
, ceph_tid_t
* ptid
)
4707 MetaSession
*session
= in
->auth_cap
->session
;
4709 int flushing
= in
->dirty_caps
;
4710 ceph_assert(flushing
);
4712 ceph_tid_t flush_tid
= ++last_flush_tid
;
4713 in
->flushing_cap_tids
[flush_tid
] = flushing
;
4715 if (!in
->flushing_caps
) {
4716 ldout(cct
, 10) << __func__
<< " " << ccap_string(flushing
) << " " << *in
<< dendl
;
4717 num_flushing_caps
++;
4719 ldout(cct
, 10) << __func__
<< " (more) " << ccap_string(flushing
) << " " << *in
<< dendl
;
4722 in
->flushing_caps
|= flushing
;
4723 in
->mark_caps_clean();
4725 if (!in
->flushing_cap_item
.is_on_list())
4726 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4727 session
->flushing_caps_tids
.insert(flush_tid
);
4733 void Client::adjust_session_flushing_caps(Inode
*in
, MetaSession
*old_s
, MetaSession
*new_s
)
4735 for (auto &p
: in
->cap_snaps
) {
4736 CapSnap
&capsnap
= p
.second
;
4737 if (capsnap
.flush_tid
> 0) {
4738 old_s
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
4739 new_s
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
4742 for (map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4743 it
!= in
->flushing_cap_tids
.end();
4745 old_s
->flushing_caps_tids
.erase(it
->first
);
4746 new_s
->flushing_caps_tids
.insert(it
->first
);
4748 new_s
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4752 * Flush all the dirty caps back to the MDS. Because the callers
4753 * generally wait on the result of this function (syncfs and umount
4754 * cases), we set CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4756 void Client::flush_caps_sync()
4758 ldout(cct
, 10) << __func__
<< dendl
;
4759 for (auto &q
: mds_sessions
) {
4761 xlist
<Inode
*>::iterator p
= s
->dirty_list
.begin();
4763 unsigned flags
= CHECK_CAPS_NODELAY
;
4768 flags
|= CHECK_CAPS_SYNCHRONOUS
;
4769 check_caps(in
, flags
);
4774 void Client::wait_sync_caps(Inode
*in
, ceph_tid_t want
)
4776 while (in
->flushing_caps
) {
4777 map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4778 ceph_assert(it
!= in
->flushing_cap_tids
.end());
4779 if (it
->first
> want
)
4781 ldout(cct
, 10) << __func__
<< " on " << *in
<< " flushing "
4782 << ccap_string(it
->second
) << " want " << want
4783 << " last " << it
->first
<< dendl
;
4784 wait_on_list(in
->waitfor_caps
);
4788 void Client::wait_sync_caps(ceph_tid_t want
)
4791 ldout(cct
, 10) << __func__
<< " want " << want
<< " (last is " << last_flush_tid
<< ", "
4792 << num_flushing_caps
<< " total flushing)" << dendl
;
4793 for (auto &p
: mds_sessions
) {
4795 if (s
->flushing_caps_tids
.empty())
4797 ceph_tid_t oldest_tid
= *s
->flushing_caps_tids
.begin();
4798 if (oldest_tid
<= want
) {
4799 ldout(cct
, 10) << " waiting on mds." << p
.first
<< " tid " << oldest_tid
4800 << " (want " << want
<< ")" << dendl
;
4801 std::unique_lock l
{client_lock
, std::adopt_lock
};
4809 void Client::kick_flushing_caps(Inode
*in
, MetaSession
*session
)
4811 in
->flags
&= ~I_KICK_FLUSH
;
4813 Cap
*cap
= in
->auth_cap
;
4814 ceph_assert(cap
->session
== session
);
4816 ceph_tid_t last_snap_flush
= 0;
4817 for (auto p
= in
->flushing_cap_tids
.rbegin();
4818 p
!= in
->flushing_cap_tids
.rend();
4821 last_snap_flush
= p
->first
;
4826 int wanted
= in
->caps_wanted();
4827 int used
= get_caps_used(in
) | in
->caps_dirty();
4828 auto it
= in
->cap_snaps
.begin();
4829 for (auto& p
: in
->flushing_cap_tids
) {
4831 int msg_flags
= p
.first
< last_snap_flush
? MClientCaps::FLAG_PENDING_CAPSNAP
: 0;
4832 send_cap(in
, session
, cap
, msg_flags
, used
, wanted
, (cap
->issued
| cap
->implemented
),
4835 ceph_assert(it
!= in
->cap_snaps
.end());
4836 ceph_assert(it
->second
.flush_tid
== p
.first
);
4837 send_flush_snap(in
, session
, it
->first
, it
->second
);
4843 void Client::kick_flushing_caps(MetaSession
*session
)
4845 mds_rank_t mds
= session
->mds_num
;
4846 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
4848 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4850 if (in
->flags
& I_KICK_FLUSH
) {
4851 ldout(cct
, 20) << " reflushing caps on " << *in
<< " to mds." << mds
<< dendl
;
4852 kick_flushing_caps(in
, session
);
4857 void Client::early_kick_flushing_caps(MetaSession
*session
)
4859 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4861 Cap
*cap
= in
->auth_cap
;
4864 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4865 // stage. This guarantees that MDS processes the cap flush message before issuing
4866 // the flushing caps to other client.
4867 if ((in
->flushing_caps
& in
->auth_cap
->issued
) == in
->flushing_caps
) {
4868 in
->flags
|= I_KICK_FLUSH
;
4872 ldout(cct
, 20) << " reflushing caps (early_kick) on " << *in
4873 << " to mds." << session
->mds_num
<< dendl
;
4874 // send_reconnect() also will reset these sequence numbers. make sure
4875 // sequence numbers in cap flush message match later reconnect message.
4879 cap
->issued
= cap
->implemented
;
4881 kick_flushing_caps(in
, session
);
4885 void Client::invalidate_snaprealm_and_children(SnapRealm
*realm
)
4890 while (!q
.empty()) {
4894 ldout(cct
, 10) << __func__
<< " " << *realm
<< dendl
;
4895 realm
->invalidate_cache();
4897 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
4898 p
!= realm
->pchildren
.end();
4904 SnapRealm
*Client::get_snap_realm(inodeno_t r
)
4906 SnapRealm
*realm
= snap_realms
[r
];
4908 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< ", nref was "
4909 << (realm
? realm
->nref
: 0) << dendl
;
4911 snap_realms
[r
] = realm
= new SnapRealm(r
);
4913 // Do not release the global snaprealm until unmounting.
4914 if (r
== CEPH_INO_GLOBAL_SNAPREALM
)
4919 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< ", nref now is "
4920 << realm
->nref
<< dendl
;
4924 SnapRealm
*Client::get_snap_realm_maybe(inodeno_t r
)
4926 if (snap_realms
.count(r
) == 0) {
4927 ldout(cct
, 20) << __func__
<< " " << r
<< " fail" << dendl
;
4930 SnapRealm
*realm
= snap_realms
[r
];
4931 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
4936 void Client::put_snap_realm(SnapRealm
*realm
)
4938 ldout(cct
, 20) << __func__
<< " " << realm
->ino
<< " " << realm
4939 << " " << realm
->nref
<< " -> " << (realm
->nref
- 1) << dendl
;
4940 if (--realm
->nref
== 0) {
4941 snap_realms
.erase(realm
->ino
);
4942 if (realm
->pparent
) {
4943 realm
->pparent
->pchildren
.erase(realm
);
4944 put_snap_realm(realm
->pparent
);
4950 bool Client::adjust_realm_parent(SnapRealm
*realm
, inodeno_t parent
)
4952 if (realm
->parent
!= parent
) {
4953 ldout(cct
, 10) << __func__
<< " " << *realm
4954 << " " << realm
->parent
<< " -> " << parent
<< dendl
;
4955 realm
->parent
= parent
;
4956 if (realm
->pparent
) {
4957 realm
->pparent
->pchildren
.erase(realm
);
4958 put_snap_realm(realm
->pparent
);
4960 realm
->pparent
= get_snap_realm(parent
);
4961 realm
->pparent
->pchildren
.insert(realm
);
4967 static bool has_new_snaps(const SnapContext
& old_snapc
,
4968 const SnapContext
& new_snapc
)
4970 return !new_snapc
.snaps
.empty() && new_snapc
.snaps
[0] > old_snapc
.seq
;
4974 void Client::update_snap_trace(const bufferlist
& bl
, SnapRealm
**realm_ret
, bool flush
)
4976 SnapRealm
*first_realm
= NULL
;
4977 ldout(cct
, 10) << __func__
<< " len " << bl
.length() << dendl
;
4979 map
<SnapRealm
*, SnapContext
> dirty_realms
;
4981 auto p
= bl
.cbegin();
4985 SnapRealm
*realm
= get_snap_realm(info
.ino());
4987 bool invalidate
= false;
4989 if (info
.seq() > realm
->seq
) {
4990 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq() << " > " << realm
->seq
4994 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4995 // flush me + children
4998 while (!q
.empty()) {
4999 SnapRealm
*realm
= q
.front();
5002 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
5003 p
!= realm
->pchildren
.end();
5007 if (dirty_realms
.count(realm
) == 0) {
5009 dirty_realms
[realm
] = realm
->get_snap_context();
5015 realm
->seq
= info
.seq();
5016 realm
->created
= info
.created();
5017 realm
->parent_since
= info
.parent_since();
5018 realm
->prior_parent_snaps
= info
.prior_parent_snaps
;
5019 realm
->my_snaps
= info
.my_snaps
;
5023 // _always_ verify parent
5024 if (adjust_realm_parent(realm
, info
.parent()))
5028 invalidate_snaprealm_and_children(realm
);
5029 ldout(cct
, 15) << __func__
<< " " << *realm
<< " self|parent updated" << dendl
;
5030 ldout(cct
, 15) << " snapc " << realm
->get_snap_context() << dendl
;
5032 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq()
5033 << " <= " << realm
->seq
<< " and same parent, SKIPPING" << dendl
;
5037 first_realm
= realm
;
5039 put_snap_realm(realm
);
5042 for (auto &[realm
, snapc
] : dirty_realms
) {
5043 // if there are new snaps ?
5044 if (has_new_snaps(snapc
, realm
->get_snap_context())) {
5045 ldout(cct
, 10) << " flushing caps on " << *realm
<< dendl
;
5046 for (auto&& in
: realm
->inodes_with_caps
) {
5047 queue_cap_snap(in
, snapc
);
5050 ldout(cct
, 10) << " no new snap on " << *realm
<< dendl
;
5052 put_snap_realm(realm
);
5056 *realm_ret
= first_realm
;
5058 put_snap_realm(first_realm
);
5061 void Client::handle_snap(const MConstRef
<MClientSnap
>& m
)
5063 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
5064 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
5066 std::scoped_lock
cl(client_lock
);
5067 auto session
= _get_mds_session(mds
, m
->get_connection().get());
5072 got_mds_push(session
.get());
5074 map
<Inode
*, SnapContext
> to_move
;
5075 SnapRealm
*realm
= 0;
5077 if (m
->head
.op
== CEPH_SNAP_OP_SPLIT
) {
5078 ceph_assert(m
->head
.split
);
5080 auto p
= m
->bl
.cbegin();
5082 ceph_assert(info
.ino() == m
->head
.split
);
5084 // flush, then move, ino's.
5085 realm
= get_snap_realm(info
.ino());
5086 ldout(cct
, 10) << " splitting off " << *realm
<< dendl
;
5087 for (auto& ino
: m
->split_inos
) {
5088 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
5089 if (inode_map
.count(vino
)) {
5090 Inode
*in
= inode_map
[vino
];
5091 if (!in
->snaprealm
|| in
->snaprealm
== realm
)
5093 if (in
->snaprealm
->created
> info
.created()) {
5094 ldout(cct
, 10) << " NOT moving " << *in
<< " from _newer_ realm "
5095 << *in
->snaprealm
<< dendl
;
5098 ldout(cct
, 10) << " moving " << *in
<< " from " << *in
->snaprealm
<< dendl
;
5101 in
->snaprealm_item
.remove_myself();
5102 to_move
[in
] = in
->snaprealm
->get_snap_context();
5103 put_snap_realm(in
->snaprealm
);
5107 // move child snaprealms, too
5108 for (auto& child_realm
: m
->split_realms
) {
5109 ldout(cct
, 10) << "adjusting snaprealm " << child_realm
<< " parent" << dendl
;
5110 SnapRealm
*child
= get_snap_realm_maybe(child_realm
);
5113 adjust_realm_parent(child
, realm
->ino
);
5114 put_snap_realm(child
);
5118 update_snap_trace(m
->bl
, NULL
, m
->head
.op
!= CEPH_SNAP_OP_DESTROY
);
5121 for (auto p
= to_move
.begin(); p
!= to_move
.end(); ++p
) {
5122 Inode
*in
= p
->first
;
5123 in
->snaprealm
= realm
;
5124 realm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
5126 // queue for snap writeback
5127 if (has_new_snaps(p
->second
, realm
->get_snap_context()))
5128 queue_cap_snap(in
, p
->second
);
5130 put_snap_realm(realm
);
5134 void Client::handle_quota(const MConstRef
<MClientQuota
>& m
)
5136 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
5138 std::scoped_lock
cl(client_lock
);
5139 auto session
= _get_mds_session(mds
, m
->get_connection().get());
5144 got_mds_push(session
.get());
5146 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << mds
<< dendl
;
5148 vinodeno_t
vino(m
->ino
, CEPH_NOSNAP
);
5149 if (inode_map
.count(vino
)) {
5151 in
= inode_map
[vino
];
5154 in
->quota
= m
->quota
;
5155 in
->rstat
= m
->rstat
;
5160 void Client::handle_caps(const MConstRef
<MClientCaps
>& m
)
5162 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
5164 std::scoped_lock
cl(client_lock
);
5165 auto session
= _get_mds_session(mds
, m
->get_connection().get());
5170 if (m
->osd_epoch_barrier
&& !objecter
->have_map(m
->osd_epoch_barrier
)) {
5171 // Pause RADOS operations until we see the required epoch
5172 objecter
->set_epoch_barrier(m
->osd_epoch_barrier
);
5175 if (m
->osd_epoch_barrier
> cap_epoch_barrier
) {
5176 // Record the barrier so that we will transmit it to MDS when releasing
5177 set_cap_epoch_barrier(m
->osd_epoch_barrier
);
5180 got_mds_push(session
.get());
5183 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
5184 if (auto it
= inode_map
.find(vino
); it
!= inode_map
.end()) {
5187 if (m
->get_op() == CEPH_CAP_OP_IMPORT
) {
5188 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< " on IMPORT, immediately releasing" << dendl
;
5189 session
->enqueue_cap_release(
5196 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< ", dropping" << dendl
;
5199 // in case the mds is waiting on e.g. a revocation
5200 flush_cap_releases();
5204 switch (m
->get_op()) {
5205 case CEPH_CAP_OP_EXPORT
: return handle_cap_export(session
.get(), in
, m
);
5206 case CEPH_CAP_OP_FLUSHSNAP_ACK
: return handle_cap_flushsnap_ack(session
.get(), in
, m
);
5207 case CEPH_CAP_OP_IMPORT
: /* no return */ handle_cap_import(session
.get(), in
, m
);
5210 if (auto it
= in
->caps
.find(mds
); it
!= in
->caps
.end()) {
5211 Cap
&cap
= in
->caps
.at(mds
);
5213 switch (m
->get_op()) {
5214 case CEPH_CAP_OP_TRUNC
: return handle_cap_trunc(session
.get(), in
, m
);
5215 case CEPH_CAP_OP_IMPORT
:
5216 case CEPH_CAP_OP_REVOKE
:
5217 case CEPH_CAP_OP_GRANT
: return handle_cap_grant(session
.get(), in
, &cap
, m
);
5218 case CEPH_CAP_OP_FLUSH_ACK
: return handle_cap_flush_ack(session
.get(), in
, &cap
, m
);
5221 ldout(cct
, 5) << __func__
<< " don't have " << *in
<< " cap on mds." << mds
<< dendl
;
5226 void Client::handle_cap_import(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5228 mds_rank_t mds
= session
->mds_num
;
5230 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
5231 << " IMPORT from mds." << mds
<< dendl
;
5233 const mds_rank_t peer_mds
= mds_rank_t(m
->peer
.mds
);
5236 if (auto it
= in
->caps
.find(peer_mds
); m
->peer
.cap_id
&& it
!= in
->caps
.end()) {
5238 cap_perms
= cap
->latest_perms
;
5242 SnapRealm
*realm
= NULL
;
5243 update_snap_trace(m
->snapbl
, &realm
);
5245 int issued
= m
->get_caps();
5246 int wanted
= m
->get_wanted();
5247 add_update_cap(in
, session
, m
->get_cap_id(),
5248 issued
, wanted
, m
->get_seq(), m
->get_mseq(),
5249 m
->get_realm(), CEPH_CAP_FLAG_AUTH
, cap_perms
);
5251 if (cap
&& cap
->cap_id
== m
->peer
.cap_id
) {
5252 remove_cap(cap
, (m
->peer
.flags
& CEPH_CAP_FLAG_RELEASE
));
5256 put_snap_realm(realm
);
5258 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
5259 if (!(wanted
& CEPH_CAP_ANY_FILE_WR
) ||
5260 in
->requested_max_size
> m
->get_max_size()) {
5261 in
->requested_max_size
= 0;
5262 ldout(cct
, 15) << "reset requested_max_size after cap import" << dendl
;
5264 // reflush any/all caps (if we are now the auth_cap)
5265 kick_flushing_caps(in
, session
);
5269 void Client::handle_cap_export(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5271 mds_rank_t mds
= session
->mds_num
;
5273 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
5274 << " EXPORT from mds." << mds
<< dendl
;
5276 auto it
= in
->caps
.find(mds
);
5277 if (it
!= in
->caps
.end()) {
5278 Cap
&cap
= it
->second
;
5279 if (cap
.cap_id
== m
->get_cap_id()) {
5280 if (m
->peer
.cap_id
) {
5281 const auto peer_mds
= mds_rank_t(m
->peer
.mds
);
5282 auto tsession
= _get_or_open_mds_session(peer_mds
);
5283 auto it
= in
->caps
.find(peer_mds
);
5284 if (it
!= in
->caps
.end()) {
5285 Cap
&tcap
= it
->second
;
5286 if (tcap
.cap_id
== m
->peer
.cap_id
&&
5287 ceph_seq_cmp(tcap
.seq
, m
->peer
.seq
) < 0) {
5288 tcap
.cap_id
= m
->peer
.cap_id
;
5289 tcap
.seq
= m
->peer
.seq
- 1;
5290 tcap
.issue_seq
= tcap
.seq
;
5291 tcap
.issued
|= cap
.issued
;
5292 tcap
.implemented
|= cap
.issued
;
5293 if (&cap
== in
->auth_cap
)
5294 in
->auth_cap
= &tcap
;
5295 if (in
->auth_cap
== &tcap
&& in
->flushing_cap_item
.is_on_list())
5296 adjust_session_flushing_caps(in
, session
, tsession
.get());
5299 add_update_cap(in
, tsession
.get(), m
->peer
.cap_id
, cap
.issued
, 0,
5300 m
->peer
.seq
- 1, m
->peer
.mseq
, (uint64_t)-1,
5301 &cap
== in
->auth_cap
? CEPH_CAP_FLAG_AUTH
: 0,
5305 if (cap
.wanted
| cap
.issued
)
5306 in
->flags
|= I_CAP_DROPPED
;
5309 remove_cap(&cap
, false);
5314 void Client::handle_cap_trunc(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5316 mds_rank_t mds
= session
->mds_num
;
5317 ceph_assert(in
->caps
.count(mds
));
5319 ldout(cct
, 10) << __func__
<< " on ino " << *in
5320 << " size " << in
->size
<< " -> " << m
->get_size()
5324 in
->caps_issued(&issued
);
5325 issued
|= in
->caps_dirty();
5326 update_inode_file_size(in
, issued
, m
->get_size(),
5327 m
->get_truncate_seq(), m
->get_truncate_size());
5330 void Client::handle_cap_flush_ack(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
5332 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
5333 int dirty
= m
->get_dirty();
5337 auto it
= in
->flushing_cap_tids
.begin();
5338 if (it
->first
< flush_ack_tid
) {
5339 ldout(cct
, 0) << __func__
<< " mds." << session
->mds_num
5340 << " got unexpected flush ack tid " << flush_ack_tid
5341 << " expected is " << it
->first
<< dendl
;
5343 for (; it
!= in
->flushing_cap_tids
.end(); ) {
5349 if (it
->first
== flush_ack_tid
)
5350 cleaned
= it
->second
;
5351 if (it
->first
<= flush_ack_tid
) {
5352 session
->flushing_caps_tids
.erase(it
->first
);
5353 in
->flushing_cap_tids
.erase(it
++);
5357 cleaned
&= ~it
->second
;
5363 ldout(cct
, 5) << __func__
<< " mds." << session
->mds_num
5364 << " cleaned " << ccap_string(cleaned
) << " on " << *in
5365 << " with " << ccap_string(dirty
) << dendl
;
5368 signal_cond_list(in
->waitfor_caps
);
5369 if (session
->flushing_caps_tids
.empty() ||
5370 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5371 sync_cond
.notify_all();
5375 in
->cap_dirtier_uid
= -1;
5376 in
->cap_dirtier_gid
= -1;
5380 ldout(cct
, 10) << " tid " << m
->get_client_tid() << " != any cap bit tids" << dendl
;
5382 if (in
->flushing_caps
) {
5383 ldout(cct
, 5) << " flushing_caps " << ccap_string(in
->flushing_caps
)
5384 << " -> " << ccap_string(in
->flushing_caps
& ~cleaned
) << dendl
;
5385 in
->flushing_caps
&= ~cleaned
;
5386 if (in
->flushing_caps
== 0) {
5387 ldout(cct
, 10) << " " << *in
<< " !flushing" << dendl
;
5388 num_flushing_caps
--;
5389 if (in
->flushing_cap_tids
.empty())
5390 in
->flushing_cap_item
.remove_myself();
5392 if (!in
->caps_dirty())
5399 void Client::handle_cap_flushsnap_ack(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5401 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
5402 mds_rank_t mds
= session
->mds_num
;
5403 ceph_assert(in
->caps
.count(mds
));
5404 snapid_t follows
= m
->get_snap_follows();
5406 if (auto it
= in
->cap_snaps
.find(follows
); it
!= in
->cap_snaps
.end()) {
5407 auto& capsnap
= it
->second
;
5408 if (flush_ack_tid
!= capsnap
.flush_tid
) {
5409 ldout(cct
, 10) << " tid " << flush_ack_tid
<< " != " << capsnap
.flush_tid
<< dendl
;
5411 InodeRef
tmp_ref(in
);
5412 ldout(cct
, 5) << __func__
<< " mds." << mds
<< " flushed snap follows " << follows
5413 << " on " << *in
<< dendl
;
5414 session
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
5415 in
->flushing_cap_tids
.erase(capsnap
.flush_tid
);
5416 if (in
->flushing_caps
== 0 && in
->flushing_cap_tids
.empty())
5417 in
->flushing_cap_item
.remove_myself();
5418 in
->cap_snaps
.erase(it
);
5420 signal_cond_list(in
->waitfor_caps
);
5421 if (session
->flushing_caps_tids
.empty() ||
5422 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5423 sync_cond
.notify_all();
5426 ldout(cct
, 5) << __func__
<< " DUP(?) mds." << mds
<< " flushed snap follows " << follows
5427 << " on " << *in
<< dendl
;
5428 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
5432 class C_Client_DentryInvalidate
: public Context
{
5439 C_Client_DentryInvalidate(Client
*c
, Dentry
*dn
, bool del
) :
5440 client(c
), name(dn
->name
) {
5441 if (client
->use_faked_inos()) {
5442 dirino
.ino
= dn
->dir
->parent_inode
->faked_ino
;
5444 ino
.ino
= dn
->inode
->faked_ino
;
5446 dirino
= dn
->dir
->parent_inode
->vino();
5448 ino
= dn
->inode
->vino();
5451 ino
.ino
= inodeno_t();
5453 void finish(int r
) override
{
5454 // _async_dentry_invalidate is responsible for its own locking
5455 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
5456 client
->_async_dentry_invalidate(dirino
, ino
, name
);
5460 void Client::_async_dentry_invalidate(vinodeno_t dirino
, vinodeno_t ino
, string
& name
)
5462 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
5463 if (!mref_reader
.is_state_satisfied())
5466 ldout(cct
, 10) << __func__
<< " '" << name
<< "' ino " << ino
5467 << " in dir " << dirino
<< dendl
;
5468 dentry_invalidate_cb(callback_handle
, dirino
, ino
, name
.c_str(), name
.length());
5471 void Client::_schedule_invalidate_dentry_callback(Dentry
*dn
, bool del
)
5473 if (dentry_invalidate_cb
&& dn
->inode
->ll_ref
> 0)
5474 async_dentry_invalidator
.queue(new C_Client_DentryInvalidate(this, dn
, del
));
5477 void Client::_try_to_trim_inode(Inode
*in
, bool sched_inval
)
5479 int ref
= in
->get_nref();
5480 ldout(cct
, 5) << __func__
<< " in " << *in
<<dendl
;
5482 if (in
->dir
&& !in
->dir
->dentries
.empty()) {
5483 for (auto p
= in
->dir
->dentries
.begin();
5484 p
!= in
->dir
->dentries
.end(); ) {
5485 Dentry
*dn
= p
->second
;
5487 /* rmsnap removes whole subtree, need trim inodes recursively.
5488 * we don't need to invalidate dentries recursively. because
5489 * invalidating a directory dentry effectively invalidate
5491 if (in
->snapid
!= CEPH_NOSNAP
&& dn
->inode
&& dn
->inode
->is_dir())
5492 _try_to_trim_inode(dn
->inode
.get(), false);
5494 if (dn
->lru_is_expireable())
5495 unlink(dn
, true, false); // keep dir, drop dentry
5497 if (in
->dir
->dentries
.empty()) {
5503 if (ref
> 1 && (in
->flags
& I_SNAPDIR_OPEN
)) {
5504 InodeRef snapdir
= open_snapdir(in
);
5505 _try_to_trim_inode(snapdir
.get(), false);
5510 auto q
= in
->dentries
.begin();
5511 while (q
!= in
->dentries
.end()) {
5514 if( in
->ll_ref
> 0 && sched_inval
) {
5515 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5516 // so in->dentries doesn't always reflect the state of kernel's dcache.
5517 _schedule_invalidate_dentry_callback(dn
, true);
5519 unlink(dn
, true, true);
5524 void Client::handle_cap_grant(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
5526 mds_rank_t mds
= session
->mds_num
;
5527 int used
= get_caps_used(in
);
5528 int wanted
= in
->caps_wanted();
5531 const unsigned new_caps
= m
->get_caps();
5532 const bool was_stale
= session
->cap_gen
> cap
->gen
;
5533 ldout(cct
, 5) << __func__
<< " on in " << m
->get_ino()
5534 << " mds." << mds
<< " seq " << m
->get_seq()
5535 << " caps now " << ccap_string(new_caps
)
5536 << " was " << ccap_string(cap
->issued
)
5537 << (was_stale
? " (stale)" : "") << dendl
;
5540 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
5541 cap
->seq
= m
->get_seq();
5542 cap
->gen
= session
->cap_gen
;
5544 check_cap_issue(in
, new_caps
);
5548 in
->caps_issued(&issued
);
5549 issued
|= in
->caps_dirty();
5551 if ((new_caps
& CEPH_CAP_AUTH_SHARED
) &&
5552 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
5553 in
->mode
= m
->head
.mode
;
5554 in
->uid
= m
->head
.uid
;
5555 in
->gid
= m
->head
.gid
;
5556 in
->btime
= m
->btime
;
5558 bool deleted_inode
= false;
5559 if ((new_caps
& CEPH_CAP_LINK_SHARED
) &&
5560 !(issued
& CEPH_CAP_LINK_EXCL
)) {
5561 in
->nlink
= m
->head
.nlink
;
5563 deleted_inode
= true;
5565 if (!(issued
& CEPH_CAP_XATTR_EXCL
) &&
5566 m
->xattrbl
.length() &&
5567 m
->head
.xattr_version
> in
->xattr_version
) {
5568 auto p
= m
->xattrbl
.cbegin();
5569 decode(in
->xattrs
, p
);
5570 in
->xattr_version
= m
->head
.xattr_version
;
5573 if ((new_caps
& CEPH_CAP_FILE_SHARED
) && m
->dirstat_is_valid()) {
5574 in
->dirstat
.nfiles
= m
->get_nfiles();
5575 in
->dirstat
.nsubdirs
= m
->get_nsubdirs();
5578 if (new_caps
& CEPH_CAP_ANY_RD
) {
5579 update_inode_file_time(in
, issued
, m
->get_time_warp_seq(),
5580 m
->get_ctime(), m
->get_mtime(), m
->get_atime());
5583 if (new_caps
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
)) {
5584 in
->layout
= m
->get_layout();
5585 update_inode_file_size(in
, issued
, m
->get_size(),
5586 m
->get_truncate_seq(), m
->get_truncate_size());
5589 if (m
->inline_version
> in
->inline_version
) {
5590 in
->inline_data
= m
->inline_data
;
5591 in
->inline_version
= m
->inline_version
;
5594 /* always take a newer change attr */
5595 if (m
->get_change_attr() > in
->change_attr
)
5596 in
->change_attr
= m
->get_change_attr();
5599 if (cap
== in
->auth_cap
&&
5600 (new_caps
& CEPH_CAP_ANY_FILE_WR
) &&
5601 (m
->get_max_size() != in
->max_size
)) {
5602 ldout(cct
, 10) << "max_size " << in
->max_size
<< " -> " << m
->get_max_size() << dendl
;
5603 in
->max_size
= m
->get_max_size();
5604 if (in
->max_size
> in
->wanted_max_size
) {
5605 in
->wanted_max_size
= 0;
5606 in
->requested_max_size
= 0;
5611 if ((was_stale
|| m
->get_op() == CEPH_CAP_OP_IMPORT
) &&
5612 (wanted
& ~(cap
->wanted
| new_caps
))) {
5613 // If mds is importing cap, prior cap messages that update 'wanted'
5614 // may get dropped by mds (migrate seq mismatch).
5616 // We don't send cap message to update 'wanted' if what we want are
5617 // already issued. If mds revokes caps, cap message that releases caps
5618 // also tells mds what we want. But if caps got revoked by mds forcedly
5619 // (session stale). We may haven't told mds what we want.
5625 auto revoked
= cap
->issued
& ~new_caps
;
5627 ldout(cct
, 10) << " revocation of " << ccap_string(revoked
) << dendl
;
5628 cap
->issued
= new_caps
;
5629 cap
->implemented
|= new_caps
;
5631 // recall delegations if we're losing caps necessary for them
5632 if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_RD
))
5633 in
->recall_deleg(false);
5634 else if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_WR
))
5635 in
->recall_deleg(true);
5637 used
= adjust_caps_used_for_lazyio(used
, cap
->issued
, cap
->implemented
);
5638 if ((used
& revoked
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
)) &&
5639 !_flush(in
, new C_Client_FlushComplete(this, in
))) {
5640 // waitin' for flush
5641 } else if (used
& revoked
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
)) {
5644 flags
= CHECK_CAPS_NODELAY
;
5647 cap
->wanted
= 0; // don't let check_caps skip sending a response to MDS
5649 flags
= CHECK_CAPS_NODELAY
;
5651 } else if (cap
->issued
== new_caps
) {
5652 ldout(cct
, 10) << " caps unchanged at " << ccap_string(cap
->issued
) << dendl
;
5654 ldout(cct
, 10) << " grant, new caps are " << ccap_string(new_caps
& ~cap
->issued
) << dendl
;
5655 cap
->issued
= new_caps
;
5656 cap
->implemented
|= new_caps
;
5658 if (cap
== in
->auth_cap
) {
5659 // non-auth MDS is revoking the newly grant caps ?
5660 for (const auto &p
: in
->caps
) {
5661 if (&p
.second
== cap
)
5663 if (p
.second
.implemented
& ~p
.second
.issued
& new_caps
) {
5672 check_caps(in
, flags
);
5676 signal_cond_list(in
->waitfor_caps
);
5678 // may drop inode's last ref
5680 _try_to_trim_inode(in
, true);
5683 int Client::inode_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
5685 if (perms
.uid() == 0) {
5686 // For directories, DACs are overridable.
5687 // For files, Read/write DACs are always overridable but executable DACs are
5688 // overridable when there is at least one exec bit set
5689 if(!S_ISDIR(in
->mode
) && (want
& MAY_EXEC
) && !(in
->mode
& S_IXUGO
))
5690 return -CEPHFS_EACCES
;
5694 if (perms
.uid() != in
->uid
&& (in
->mode
& S_IRWXG
)) {
5695 int ret
= _posix_acl_permission(in
, perms
, want
);
5696 if (ret
!= -CEPHFS_EAGAIN
)
5700 // check permissions before doing anything else
5701 if (!in
->check_mode(perms
, want
))
5702 return -CEPHFS_EACCES
;
5706 int Client::xattr_permission(Inode
*in
, const char *name
, unsigned want
,
5707 const UserPerm
& perms
)
5709 int r
= _getattr_for_perm(in
, perms
);
5714 if (strncmp(name
, "system.", 7) == 0) {
5715 if ((want
& MAY_WRITE
) && (perms
.uid() != 0 && perms
.uid() != in
->uid
))
5718 r
= inode_permission(in
, perms
, want
);
5721 ldout(cct
, 5) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5725 std::ostream
& operator<<(std::ostream
&out
, const UserPerm
& perm
) {
5726 out
<< "UserPerm(uid: " << perm
.uid() << ", gid: " << perm
.gid() << ")";
5730 int Client::may_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
5731 const UserPerm
& perms
)
5733 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5734 int r
= _getattr_for_perm(in
, perms
);
5738 if (mask
& CEPH_SETATTR_SIZE
) {
5739 r
= inode_permission(in
, perms
, MAY_WRITE
);
5745 if (mask
& CEPH_SETATTR_UID
) {
5746 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
|| stx
->stx_uid
!= in
->uid
))
5749 if (mask
& CEPH_SETATTR_GID
) {
5750 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
||
5751 (!perms
.gid_in_groups(stx
->stx_gid
) && stx
->stx_gid
!= in
->gid
)))
5755 if (mask
& CEPH_SETATTR_MODE
) {
5756 if (perms
.uid() != 0 && perms
.uid() != in
->uid
)
5759 gid_t i_gid
= (mask
& CEPH_SETATTR_GID
) ? stx
->stx_gid
: in
->gid
;
5760 if (perms
.uid() != 0 && !perms
.gid_in_groups(i_gid
))
5761 stx
->stx_mode
&= ~S_ISGID
;
5764 if (mask
& (CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
|
5765 CEPH_SETATTR_MTIME
| CEPH_SETATTR_ATIME
)) {
5766 if (perms
.uid() != 0 && perms
.uid() != in
->uid
) {
5767 int check_mask
= CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
;
5768 if (!(mask
& CEPH_SETATTR_MTIME_NOW
))
5769 check_mask
|= CEPH_SETATTR_MTIME
;
5770 if (!(mask
& CEPH_SETATTR_ATIME_NOW
))
5771 check_mask
|= CEPH_SETATTR_ATIME
;
5772 if (check_mask
& mask
) {
5775 r
= inode_permission(in
, perms
, MAY_WRITE
);
5783 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5787 int Client::may_open(Inode
*in
, int flags
, const UserPerm
& perms
)
5789 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5792 if ((flags
& O_ACCMODE
) == O_WRONLY
)
5794 else if ((flags
& O_ACCMODE
) == O_RDWR
)
5795 want
= MAY_READ
| MAY_WRITE
;
5796 else if ((flags
& O_ACCMODE
) == O_RDONLY
)
5798 if (flags
& O_TRUNC
)
5802 switch (in
->mode
& S_IFMT
) {
5807 if (want
& MAY_WRITE
) {
5814 r
= _getattr_for_perm(in
, perms
);
5818 r
= inode_permission(in
, perms
, want
);
5820 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5824 int Client::may_lookup(Inode
*dir
, const UserPerm
& perms
)
5826 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5827 int r
= _getattr_for_perm(dir
, perms
);
5831 r
= inode_permission(dir
, perms
, MAY_EXEC
);
5833 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5837 int Client::may_create(Inode
*dir
, const UserPerm
& perms
)
5839 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5840 int r
= _getattr_for_perm(dir
, perms
);
5844 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5846 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5850 int Client::may_delete(Inode
*dir
, const char *name
, const UserPerm
& perms
)
5852 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << "; name " << name
<< "; " << perms
<< dendl
;
5853 int r
= _getattr_for_perm(dir
, perms
);
5857 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5861 /* 'name == NULL' means rmsnap w/o permission checks */
5862 if (perms
.uid() != 0 && name
&& (dir
->mode
& S_ISVTX
)) {
5864 r
= _lookup(dir
, name
, CEPH_CAP_AUTH_SHARED
, &otherin
, perms
);
5867 if (dir
->uid
!= perms
.uid() && otherin
->uid
!= perms
.uid())
5871 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5875 int Client::may_delete(const char *relpath
, const UserPerm
& perms
) {
5876 ldout(cct
, 20) << __func__
<< " " << relpath
<< "; " << perms
<< dendl
;
5878 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
5879 if (!mref_reader
.is_state_satisfied())
5882 filepath
path(relpath
);
5883 string name
= path
.last_dentry();
5887 std::scoped_lock
lock(client_lock
);
5888 int r
= path_walk(path
, &dir
, perms
);
5891 if (cct
->_conf
->client_permissions
) {
5892 int r
= may_delete(dir
.get(), name
.c_str(), perms
);
5900 int Client::may_hardlink(Inode
*in
, const UserPerm
& perms
)
5902 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5903 int r
= _getattr_for_perm(in
, perms
);
5907 if (perms
.uid() == 0 || perms
.uid() == in
->uid
) {
5913 if (!S_ISREG(in
->mode
))
5916 if (in
->mode
& S_ISUID
)
5919 if ((in
->mode
& (S_ISGID
| S_IXGRP
)) == (S_ISGID
| S_IXGRP
))
5922 r
= inode_permission(in
, perms
, MAY_READ
| MAY_WRITE
);
5924 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5928 int Client::_getattr_for_perm(Inode
*in
, const UserPerm
& perms
)
5930 int mask
= CEPH_STAT_CAP_MODE
;
5932 if (acl_type
!= NO_ACL
) {
5933 mask
|= CEPH_STAT_CAP_XATTR
;
5934 force
= in
->xattr_version
== 0;
5936 return _getattr(in
, mask
, perms
, force
);
5939 vinodeno_t
Client::_get_vino(Inode
*in
)
5941 /* The caller must hold the client lock */
5942 return vinodeno_t(in
->ino
, in
->snapid
);
5946 * Resolve an MDS spec to a list of MDS daemon GIDs.
5948 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5949 * It may be '*' in which case it matches all GIDs.
5951 * If no error is returned, the `targets` vector will be populated with at least
5954 int Client::resolve_mds(
5955 const std::string
&mds_spec
,
5956 std::vector
<mds_gid_t
> *targets
)
5959 ceph_assert(targets
!= nullptr);
5962 CachedStackStringStream css
;
5963 int role_r
= fsmap
->parse_role(mds_spec
, &role
, *css
);
5965 // We got a role, resolve it to a GID
5966 auto& info
= fsmap
->get_filesystem(role
.fscid
)->mds_map
.get_info(role
.rank
);
5967 ldout(cct
, 10) << __func__
<< ": resolved " << mds_spec
<< " to role '"
5968 << role
<< "' aka " << info
.human_name() << dendl
;
5969 targets
->push_back(info
.global_id
);
5973 std::string strtol_err
;
5974 long long rank_or_gid
= strict_strtoll(mds_spec
.c_str(), 10, &strtol_err
);
5975 if (strtol_err
.empty()) {
5976 // It is a possible GID
5977 const mds_gid_t mds_gid
= mds_gid_t(rank_or_gid
);
5978 if (fsmap
->gid_exists(mds_gid
)) {
5979 auto& info
= fsmap
->get_info_gid(mds_gid
);
5980 ldout(cct
, 10) << __func__
<< ": validated gid " << mds_gid
<< " aka "
5981 << info
.human_name() << dendl
;
5982 targets
->push_back(mds_gid
);
5985 lderr(cct
) << __func__
<< ": gid " << mds_gid
<< " not in MDS map"
5987 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
5988 return -CEPHFS_ENOENT
;
5990 } else if (mds_spec
== "*") {
5991 // It is a wildcard: use all MDSs
5992 const auto& mds_info
= fsmap
->get_mds_info();
5994 ldout(cct
, 10) << __func__
<< ": resolving `*' to all MDS daemons" << dendl
;
5995 if (mds_info
.empty()) {
5996 lderr(cct
) << __func__
<< ": no MDS daemons found" << dendl
;
5997 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
5998 return -CEPHFS_ENOENT
;
6001 for (const auto& [gid
, info
] : mds_info
) {
6002 ldout(cct
, 10) << __func__
<< ": appending " << info
.human_name() << " to targets" << dendl
;
6003 targets
->push_back(gid
);
6007 // It did not parse as an integer, it is not a wildcard, it must be a name
6008 const mds_gid_t mds_gid
= fsmap
->find_mds_gid_by_name(mds_spec
);
6010 lderr(cct
) << __func__
<< ": no MDS daemons found by name `" << mds_spec
<< "'" << dendl
;
6011 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
6012 return -CEPHFS_ENOENT
;
6014 auto& info
= fsmap
->get_info_gid(mds_gid
);
6015 ldout(cct
, 10) << __func__
<< ": resolved name '" << mds_spec
6016 << "' to " << info
.human_name() << dendl
;
6017 targets
->push_back(mds_gid
);
6025 * Authenticate with mon and establish global ID
6027 int Client::authenticate()
6029 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6031 if (monclient
->is_authenticated()) {
6035 client_lock
.unlock();
6036 int r
= monclient
->authenticate(std::chrono::duration
<double>(mount_timeout
).count());
6042 whoami
= monclient
->get_global_id();
6043 messenger
->set_myname(entity_name_t::CLIENT(whoami
.v
));
6048 int Client::fetch_fsmap(bool user
)
6050 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6052 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
6053 // rather than MDSMap because no one MDSMap contains all the daemons, and
6054 // a `tell` can address any daemon.
6055 version_t fsmap_latest
;
6058 client_lock
.unlock();
6059 std::tie(fsmap_latest
, std::ignore
) =
6060 monclient
->get_version("fsmap", ca::use_blocked
[ec
]);
6062 } while (ec
== bs::errc::resource_unavailable_try_again
);
6065 lderr(cct
) << "Failed to learn FSMap version: " << ec
<< dendl
;
6066 return ceph::from_error_code(ec
);
6069 ldout(cct
, 10) << __func__
<< " learned FSMap version " << fsmap_latest
<< dendl
;
6072 if (!fsmap_user
|| fsmap_user
->get_epoch() < fsmap_latest
) {
6073 monclient
->sub_want("fsmap.user", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
6074 monclient
->renew_subs();
6075 wait_on_list(waiting_for_fsmap
);
6077 ceph_assert(fsmap_user
);
6078 ceph_assert(fsmap_user
->get_epoch() >= fsmap_latest
);
6080 if (!fsmap
|| fsmap
->get_epoch() < fsmap_latest
) {
6081 monclient
->sub_want("fsmap", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
6082 monclient
->renew_subs();
6083 wait_on_list(waiting_for_fsmap
);
6086 ceph_assert(fsmap
->get_epoch() >= fsmap_latest
);
6088 ldout(cct
, 10) << __func__
<< " finished waiting for FSMap version "
6089 << fsmap_latest
<< dendl
;
6095 * @mds_spec one of ID, rank, GID, "*"
6098 int Client::mds_command(
6099 const std::string
&mds_spec
,
6100 const vector
<string
>& cmd
,
6101 const bufferlist
& inbl
,
6106 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
6107 if (!iref_reader
.is_state_satisfied())
6108 return -CEPHFS_ENOTCONN
;
6110 std::unique_lock
cl(client_lock
);
6118 r
= fetch_fsmap(false);
6123 // Look up MDS target(s) of the command
6124 std::vector
<mds_gid_t
> targets
;
6125 r
= resolve_mds(mds_spec
, &targets
);
6130 // If daemons are laggy, we won't send them commands. If all
6131 // are laggy then we fail.
6132 std::vector
<mds_gid_t
> non_laggy
;
6133 for (const auto& gid
: targets
) {
6134 const auto info
= fsmap
->get_info_gid(gid
);
6135 if (!info
.laggy()) {
6136 non_laggy
.push_back(gid
);
6139 if (non_laggy
.size() == 0) {
6140 *outs
= "All targeted MDS daemons are laggy";
6141 return -CEPHFS_ENOENT
;
6144 if (metadata
.empty()) {
6145 // We are called on an unmounted client, so metadata
6146 // won't be initialized yet.
6147 populate_metadata("");
6150 // Send commands to targets
6151 C_GatherBuilder
gather(cct
, onfinish
);
6152 for (const auto& target_gid
: non_laggy
) {
6153 const auto info
= fsmap
->get_info_gid(target_gid
);
6155 // Open a connection to the target MDS
6156 ConnectionRef conn
= messenger
->connect_to_mds(info
.get_addrs());
6160 std::scoped_lock
cmd_lock(command_lock
);
6161 // Generate MDSCommandOp state
6162 auto &op
= command_table
.start_command();
6164 op
.on_finish
= gather
.new_sub();
6169 op
.mds_gid
= target_gid
;
6172 ldout(cct
, 4) << __func__
<< ": new command op to " << target_gid
6173 << " tid=" << op
.tid
<< cmd
<< dendl
;
6175 // Construct and send MCommand
6176 MessageRef m
= op
.get_message(monclient
->get_fsid());
6177 conn
->send_message2(std::move(m
));
6186 void Client::handle_command_reply(const MConstRef
<MCommandReply
>& m
)
6188 ceph_tid_t
const tid
= m
->get_tid();
6190 ldout(cct
, 10) << __func__
<< ": tid=" << m
->get_tid() << dendl
;
6192 std::scoped_lock
cmd_lock(command_lock
);
6193 if (!command_table
.exists(tid
)) {
6194 ldout(cct
, 1) << __func__
<< ": unknown tid " << tid
<< ", dropping" << dendl
;
6198 auto &op
= command_table
.get_command(tid
);
6200 *op
.outbl
= m
->get_data();
6207 op
.on_finish
->complete(m
->r
);
6210 command_table
.erase(tid
);
6213 // -------------------
6216 int Client::subscribe_mdsmap(const std::string
&fs_name
)
6218 int r
= authenticate();
6220 lderr(cct
) << "authentication failed: " << cpp_strerror(r
) << dendl
;
6224 std::string resolved_fs_name
;
6225 if (fs_name
.empty()) {
6226 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_fs");
6227 if (resolved_fs_name
.empty())
6228 // Try the backwards compatibility fs name option
6229 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_mds_namespace");
6231 resolved_fs_name
= fs_name
;
6234 std::string want
= "mdsmap";
6235 if (!resolved_fs_name
.empty()) {
6236 r
= fetch_fsmap(true);
6239 fscid
= fsmap_user
->get_fs_cid(resolved_fs_name
);
6240 if (fscid
== FS_CLUSTER_ID_NONE
) {
6241 return -CEPHFS_ENOENT
;
6244 std::ostringstream oss
;
6245 oss
<< want
<< "." << fscid
;
6248 ldout(cct
, 10) << "Subscribing to map '" << want
<< "'" << dendl
;
6250 monclient
->sub_want(want
, 0, 0);
6251 monclient
->renew_subs();
6256 int Client::mount(const std::string
&mount_root
, const UserPerm
& perms
,
6257 bool require_mds
, const std::string
&fs_name
)
6259 ceph_assert(is_initialized());
6262 * To make sure that the _unmount() must wait until the mount()
6265 RWRef_t
mref_writer(mount_state
, CLIENT_MOUNTING
, false);
6266 if (!mref_writer
.is_first_writer()) // already mounting or mounted
6269 std::unique_lock
cl(client_lock
);
6271 int r
= subscribe_mdsmap(fs_name
);
6273 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
6277 start_tick_thread(); // start tick thread
6281 auto availability
= mdsmap
->is_cluster_available();
6282 if (availability
== MDSMap::STUCK_UNAVAILABLE
) {
6284 ldout(cct
, 10) << "mds cluster unavailable: epoch=" << mdsmap
->get_epoch() << dendl
;
6285 return CEPH_FUSE_NO_MDS_UP
;
6286 } else if (availability
== MDSMap::AVAILABLE
) {
6287 // Continue to mount
6289 } else if (availability
== MDSMap::TRANSIENT_UNAVAILABLE
) {
6290 // Else, wait. MDSMonitor will update the map to bring
6291 // us to a conclusion eventually.
6292 wait_on_list(waiting_for_mdsmap
);
6294 // Unexpected value!
6300 populate_metadata(mount_root
.empty() ? "/" : mount_root
);
6302 filepath
fp(CEPH_INO_ROOT
);
6303 if (!mount_root
.empty()) {
6304 fp
= filepath(mount_root
.c_str());
6307 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
6308 req
->set_filepath(fp
);
6309 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
6310 int res
= make_request(req
, perms
);
6312 if (res
== -CEPHFS_EACCES
&& root
) {
6313 ldout(cct
, 1) << __func__
<< " EACCES on parent of mount point; quotas may not work" << dendl
;
6326 _ll_get(root
.get());
6329 if (!cct
->_conf
->client_trace
.empty()) {
6330 traceout
.open(cct
->_conf
->client_trace
.c_str());
6331 if (traceout
.is_open()) {
6332 ldout(cct
, 1) << "opened trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6334 ldout(cct
, 1) << "FAILED to open trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6339 ldout(cct, 3) << "op: // client trace data structs" << dendl;
6340 ldout(cct, 3) << "op: struct stat st;" << dendl;
6341 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
6342 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
6343 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
6344 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
6345 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
6346 ldout(cct, 3) << "op: int fd;" << dendl;
6349 mref_writer
.update_state(CLIENT_MOUNTED
);
6355 void Client::_close_sessions()
6357 for (auto it
= mds_sessions
.begin(); it
!= mds_sessions
.end(); ) {
6358 if (it
->second
->state
== MetaSession::STATE_REJECTED
)
6359 mds_sessions
.erase(it
++);
6364 while (!mds_sessions
.empty()) {
6365 // send session closes!
6366 for (auto &p
: mds_sessions
) {
6367 if (p
.second
->state
!= MetaSession::STATE_CLOSING
) {
6368 _close_mds_session(p
.second
.get());
6369 mds_ranks_closing
.insert(p
.first
);
6373 // wait for sessions to close
6374 double timo
= cct
->_conf
.get_val
<std::chrono::seconds
>("client_shutdown_timeout").count();
6375 ldout(cct
, 2) << "waiting for " << mds_ranks_closing
.size() << " mds session(s) to close (timeout: "
6376 << timo
<< "s)" << dendl
;
6377 std::unique_lock l
{client_lock
, std::adopt_lock
};
6380 } else if (!mount_cond
.wait_for(l
, ceph::make_timespan(timo
), [this] { return mds_ranks_closing
.empty(); })) {
6381 ldout(cct
, 1) << mds_ranks_closing
.size() << " mds(s) did not respond to session close -- timing out." << dendl
;
6382 while (!mds_ranks_closing
.empty()) {
6383 auto session
= mds_sessions
.at(*mds_ranks_closing
.begin());
6384 // this prunes entry from mds_sessions and mds_ranks_closing
6385 _closed_mds_session(session
.get(), -CEPHFS_ETIMEDOUT
);
6389 mds_ranks_closing
.clear();
6394 void Client::flush_mdlog_sync(Inode
*in
)
6396 if (in
->unsafe_ops
.empty()) {
6400 std::set
<mds_rank_t
> anchor
;
6401 for (auto &&p
: in
->unsafe_ops
) {
6402 anchor
.emplace(p
->mds
);
6405 anchor
.emplace(in
->auth_cap
->session
->mds_num
);
6408 for (auto &rank
: anchor
) {
6409 auto session
= &mds_sessions
.at(rank
);
6410 flush_mdlog(session
->get());
6414 void Client::flush_mdlog_sync()
6416 if (mds_requests
.empty())
6418 for (auto &p
: mds_sessions
) {
6419 flush_mdlog(p
.second
.get());
6423 void Client::flush_mdlog(MetaSession
*session
)
6425 // Only send this to Luminous or newer MDS daemons, older daemons
6426 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6427 const uint64_t features
= session
->con
->get_features();
6428 if (HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
6429 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REQUEST_FLUSH_MDLOG
);
6430 session
->con
->send_message2(std::move(m
));
6435 void Client::_abort_mds_sessions(int err
)
6437 for (auto p
= mds_requests
.begin(); p
!= mds_requests
.end(); ) {
6438 auto req
= p
->second
;
6440 // unsafe requests will be removed during close session below.
6441 if (req
->got_unsafe
)
6445 if (req
->caller_cond
) {
6447 req
->caller_cond
->notify_all();
6451 // Process aborts on any requests that were on this waitlist.
6452 // Any requests that were on a waiting_for_open session waitlist
6453 // will get kicked during close session below.
6454 signal_cond_list(waiting_for_mdsmap
);
6456 // Force-close all sessions
6457 while(!mds_sessions
.empty()) {
6458 auto session
= mds_sessions
.begin()->second
;
6459 _closed_mds_session(session
.get(), err
);
6463 void Client::_unmount(bool abort
)
6466 * We are unmounting the client.
6468 * Just declare the state to STATE_UNMOUNTING to block and fail
6469 * any new comming "reader" and then try to wait all the in-flight
6470 * "readers" to finish.
6472 RWRef_t
mref_writer(mount_state
, CLIENT_UNMOUNTING
, false);
6473 if (!mref_writer
.is_first_writer())
6475 mref_writer
.wait_readers_done();
6477 std::unique_lock lock
{client_lock
};
6479 if (abort
|| blocklisted
) {
6480 ldout(cct
, 2) << "unmounting (" << (abort
? "abort)" : "blocklisted)") << dendl
;
6482 ldout(cct
, 2) << "unmounting" << dendl
;
6488 mount_aborted
= true;
6489 // Abort all mds sessions
6490 _abort_mds_sessions(-CEPHFS_ENOTCONN
);
6492 objecter
->op_cancel_writes(-CEPHFS_ENOTCONN
);
6494 // flush the mdlog for pending requests, if any
6498 mount_cond
.wait(lock
, [this] {
6499 if (!mds_requests
.empty()) {
6500 ldout(cct
, 10) << "waiting on " << mds_requests
.size() << " requests"
6503 return mds_requests
.empty();
6509 // clean up any unclosed files
6510 while (!fd_map
.empty()) {
6511 Fh
*fh
= fd_map
.begin()->second
;
6512 fd_map
.erase(fd_map
.begin());
6513 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *fh
->inode
<< dendl
;
6517 while (!ll_unclosed_fh_set
.empty()) {
6518 set
<Fh
*>::iterator it
= ll_unclosed_fh_set
.begin();
6520 ll_unclosed_fh_set
.erase(fh
);
6521 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *(fh
->inode
) << dendl
;
6525 while (!opened_dirs
.empty()) {
6526 dir_result_t
*dirp
= *opened_dirs
.begin();
6527 ldout(cct
, 0) << " destroyed lost open dir " << dirp
<< " on " << *dirp
->inode
<< dendl
;
6533 if (cct
->_conf
->client_oc
) {
6534 // flush/release all buffered data
6535 std::list
<InodeRef
> anchor
;
6536 for (auto& p
: inode_map
) {
6537 Inode
*in
= p
.second
;
6539 ldout(cct
, 0) << "null inode_map entry ino " << p
.first
<< dendl
;
6543 // prevent inode from getting freed
6544 anchor
.emplace_back(in
);
6546 if (abort
|| blocklisted
) {
6547 objectcacher
->purge_set(&in
->oset
);
6548 } else if (!in
->caps
.empty()) {
6550 _flush(in
, new C_Client_FlushComplete(this, in
));
6555 if (abort
|| blocklisted
) {
6556 for (auto &q
: mds_sessions
) {
6558 for (auto p
= s
->dirty_list
.begin(); !p
.end(); ) {
6561 if (in
->dirty_caps
) {
6562 ldout(cct
, 0) << " drop dirty caps on " << *in
<< dendl
;
6563 in
->mark_caps_clean();
6570 wait_sync_caps(last_flush_tid
);
6578 while (lru
.lru_get_size() > 0 ||
6579 !inode_map
.empty()) {
6580 ldout(cct
, 2) << "cache still has " << lru
.lru_get_size()
6581 << "+" << inode_map
.size() << " items"
6582 << ", waiting (for caps to release?)"
6585 if (auto r
= mount_cond
.wait_for(lock
, ceph::make_timespan(5));
6586 r
== std::cv_status::timeout
) {
6590 ceph_assert(lru
.lru_get_size() == 0);
6591 ceph_assert(inode_map
.empty());
6594 if (!cct
->_conf
->client_trace
.empty()) {
6595 ldout(cct
, 1) << "closing trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6599 // stop the tick thread
6600 tick_thread_stopped
= true;
6601 upkeep_cond
.notify_one();
6605 // release the global snapshot realm
6606 SnapRealm
*global_realm
= snap_realms
[CEPH_INO_GLOBAL_SNAPREALM
];
6608 ceph_assert(global_realm
->nref
== 1);
6609 put_snap_realm(global_realm
);
6612 mref_writer
.update_state(CLIENT_UNMOUNTED
);
6614 ldout(cct
, 2) << "unmounted." << dendl
;
6617 void Client::unmount()
6622 void Client::abort_conn()
6627 void Client::flush_cap_releases()
6629 uint64_t nr_caps
= 0;
6631 // send any cap releases
6632 for (auto &p
: mds_sessions
) {
6633 auto session
= p
.second
;
6634 if (session
->release
&& mdsmap
->is_clientreplay_or_active_or_stopping(
6636 nr_caps
+= session
->release
->caps
.size();
6637 if (cct
->_conf
->client_inject_release_failure
) {
6638 ldout(cct
, 20) << __func__
<< " injecting failure to send cap release message" << dendl
;
6640 session
->con
->send_message2(std::move(session
->release
));
6642 session
->release
.reset();
6647 dec_pinned_icaps(nr_caps
);
6651 void Client::renew_and_flush_cap_releases()
6653 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6655 if (!mount_aborted
&& mdsmap
->get_epoch()) {
6657 auto el
= ceph::coarse_mono_clock::now() - last_cap_renew
;
6658 if (unlikely(utime_t(el
) > mdsmap
->get_session_timeout() / 3.0))
6661 flush_cap_releases();
6667 ldout(cct
, 20) << "tick" << dendl
;
6669 auto now
= ceph::coarse_mono_clock::now();
6672 * If the mount() is not finished
6674 if (is_mounting() && !mds_requests
.empty()) {
6675 MetaRequest
*req
= mds_requests
.begin()->second
;
6677 if (req
->created
+ mount_timeout
< now
) {
6678 req
->abort(-CEPHFS_ETIMEDOUT
);
6679 if (req
->caller_cond
) {
6681 req
->caller_cond
->notify_all();
6683 signal_cond_list(waiting_for_mdsmap
);
6684 for (auto &p
: mds_sessions
) {
6685 signal_context_list(p
.second
->waiting_for_open
);
6690 renew_and_flush_cap_releases();
6693 xlist
<Inode
*>::iterator p
= delayed_list
.begin();
6697 if (!mount_aborted
&& in
->hold_caps_until
> now
)
6699 delayed_list
.pop_front();
6701 check_caps(in
, CHECK_CAPS_NODELAY
);
6705 collect_and_send_metrics();
6707 delay_put_inodes(is_unmounting());
6710 if (blocklisted
&& (is_mounted() || is_unmounting()) &&
6711 last_auto_reconnect
+ std::chrono::seconds(30 * 60) < now
&&
6712 cct
->_conf
.get_val
<bool>("client_reconnect_stale")) {
6713 messenger
->client_reset();
6714 fd_gen
++; // invalidate open files
6715 blocklisted
= false;
6716 _kick_stale_sessions();
6717 last_auto_reconnect
= now
;
6721 void Client::start_tick_thread()
6723 upkeeper
= std::thread([this]() {
6724 using time
= ceph::coarse_mono_time
;
6725 using sec
= std::chrono::seconds
;
6727 auto last_tick
= time::min();
6729 std::unique_lock
cl(client_lock
);
6730 while (!tick_thread_stopped
) {
6731 auto now
= clock::now();
6732 auto since
= now
- last_tick
;
6734 auto t_interval
= clock::duration(cct
->_conf
.get_val
<sec
>("client_tick_interval"));
6735 auto d_interval
= clock::duration(cct
->_conf
.get_val
<sec
>("client_debug_inject_tick_delay"));
6737 auto interval
= std::max(t_interval
, d_interval
);
6738 if (likely(since
>= interval
*.90)) {
6740 last_tick
= clock::now();
6745 ldout(cct
, 20) << "upkeep thread waiting interval " << interval
<< dendl
;
6746 if (!tick_thread_stopped
)
6747 upkeep_cond
.wait_for(cl
, interval
);
6752 void Client::collect_and_send_metrics() {
6753 ldout(cct
, 20) << __func__
<< dendl
;
6755 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6757 // right now, we only track and send global metrics. its sufficient
6758 // to send these metrics to MDS rank0.
6759 collect_and_send_global_metrics();
6762 void Client::collect_and_send_global_metrics() {
6763 ldout(cct
, 20) << __func__
<< dendl
;
6764 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6766 if (!have_open_session((mds_rank_t
)0)) {
6767 ldout(cct
, 5) << __func__
<< ": no session with rank=0 -- not sending metric"
6771 auto session
= _get_or_open_mds_session((mds_rank_t
)0);
6772 if (!session
->mds_features
.test(CEPHFS_FEATURE_METRIC_COLLECT
)) {
6773 ldout(cct
, 5) << __func__
<< ": rank=0 does not support metrics" << dendl
;
6777 ClientMetricMessage metric
;
6778 std::vector
<ClientMetricMessage
> message
;
6781 if (_collect_and_send_global_metrics
||
6782 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_READ_LATENCY
)) {
6783 metric
= ClientMetricMessage(ReadLatencyPayload(logger
->tget(l_c_read
),
6784 logger
->tget(l_c_rd_avg
),
6785 logger
->get(l_c_rd_sqsum
),
6787 message
.push_back(metric
);
6791 if (_collect_and_send_global_metrics
||
6792 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_WRITE_LATENCY
)) {
6793 metric
= ClientMetricMessage(WriteLatencyPayload(logger
->tget(l_c_wrlat
),
6794 logger
->tget(l_c_wr_avg
),
6795 logger
->get(l_c_wr_sqsum
),
6797 message
.push_back(metric
);
6801 if (_collect_and_send_global_metrics
||
6802 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_METADATA_LATENCY
)) {
6803 metric
= ClientMetricMessage(MetadataLatencyPayload(logger
->tget(l_c_lat
),
6804 logger
->tget(l_c_md_avg
),
6805 logger
->get(l_c_md_sqsum
),
6806 nr_metadata_request
));
6807 message
.push_back(metric
);
6810 // cap hit ratio -- nr_caps is unused right now
6811 if (_collect_and_send_global_metrics
||
6812 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_CAP_INFO
)) {
6813 auto [cap_hits
, cap_misses
] = get_cap_hit_rates();
6814 metric
= ClientMetricMessage(CapInfoPayload(cap_hits
, cap_misses
, 0));
6815 message
.push_back(metric
);
6818 // dentry lease hit ratio
6819 if (_collect_and_send_global_metrics
||
6820 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_DENTRY_LEASE
)) {
6821 auto [dlease_hits
, dlease_misses
, nr
] = get_dlease_hit_rates();
6822 metric
= ClientMetricMessage(DentryLeasePayload(dlease_hits
, dlease_misses
, nr
));
6823 message
.push_back(metric
);
6827 if (_collect_and_send_global_metrics
||
6828 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_OPENED_FILES
)) {
6829 auto [opened_files
, total_inodes
] = get_opened_files_rates();
6830 metric
= ClientMetricMessage(OpenedFilesPayload(opened_files
, total_inodes
));
6831 message
.push_back(metric
);
6835 if (_collect_and_send_global_metrics
||
6836 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_PINNED_ICAPS
)) {
6837 auto [pinned_icaps
, total_inodes
] = get_pinned_icaps_rates();
6838 metric
= ClientMetricMessage(PinnedIcapsPayload(pinned_icaps
, total_inodes
));
6839 message
.push_back(metric
);
6843 if (_collect_and_send_global_metrics
||
6844 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_OPENED_INODES
)) {
6845 auto [opened_inodes
, total_inodes
] = get_opened_inodes_rates();
6846 metric
= ClientMetricMessage(OpenedInodesPayload(opened_inodes
, total_inodes
));
6847 message
.push_back(metric
);
6851 if (_collect_and_send_global_metrics
||
6852 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_READ_IO_SIZES
)) {
6853 metric
= ClientMetricMessage(ReadIoSizesPayload(total_read_ops
,
6855 message
.push_back(metric
);
6859 if (_collect_and_send_global_metrics
||
6860 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_WRITE_IO_SIZES
)) {
6861 metric
= ClientMetricMessage(WriteIoSizesPayload(total_write_ops
,
6863 message
.push_back(metric
);
6866 session
->con
->send_message2(make_message
<MClientMetrics
>(std::move(message
)));
6869 void Client::renew_caps()
6871 ldout(cct
, 10) << "renew_caps()" << dendl
;
6872 last_cap_renew
= ceph::coarse_mono_clock::now();
6874 for (auto &p
: mds_sessions
) {
6875 ldout(cct
, 15) << "renew_caps requesting from mds." << p
.first
<< dendl
;
6876 if (mdsmap
->get_state(p
.first
) >= MDSMap::STATE_REJOIN
)
6877 renew_caps(p
.second
.get());
6881 void Client::renew_caps(MetaSession
*session
)
6883 ldout(cct
, 10) << "renew_caps mds." << session
->mds_num
<< dendl
;
6884 session
->last_cap_renew_request
= ceph_clock_now();
6885 uint64_t seq
= ++session
->cap_renew_seq
;
6886 session
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_RENEWCAPS
, seq
));
6890 // ===============================================================
6891 // high level (POSIXy) interface
6893 int Client::_do_lookup(Inode
*dir
, const string
& name
, int mask
,
6894 InodeRef
*target
, const UserPerm
& perms
)
6896 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_LOOKUPSNAP
: CEPH_MDS_OP_LOOKUP
;
6897 MetaRequest
*req
= new MetaRequest(op
);
6899 dir
->make_nosnap_relative_path(path
);
6900 path
.push_dentry(name
);
6901 req
->set_filepath(path
);
6902 req
->set_inode(dir
);
6903 if (cct
->_conf
->client_debug_getattr_caps
&& op
== CEPH_MDS_OP_LOOKUP
)
6904 mask
|= DEBUG_GETATTR_CAPS
;
6905 req
->head
.args
.getattr
.mask
= mask
;
6907 ldout(cct
, 10) << __func__
<< " on " << path
<< dendl
;
6909 int r
= make_request(req
, perms
, target
);
6910 ldout(cct
, 10) << __func__
<< " res is " << r
<< dendl
;
6914 bool Client::_dentry_valid(const Dentry
*dn
)
6916 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6918 // is dn lease valid?
6919 utime_t now
= ceph_clock_now();
6920 if (dn
->lease_mds
>= 0 && dn
->lease_ttl
> now
&&
6921 mds_sessions
.count(dn
->lease_mds
)) {
6922 auto s
= mds_sessions
.at(dn
->lease_mds
);
6923 if (s
->cap_ttl
> now
&& s
->cap_gen
== dn
->lease_gen
) {
6928 ldout(cct
, 20) << " bad lease, cap_ttl " << s
->cap_ttl
<< ", cap_gen " << s
->cap_gen
6929 << " vs lease_gen " << dn
->lease_gen
<< dendl
;
6936 int Client::_lookup(Inode
*dir
, const string
& dname
, int mask
, InodeRef
*target
,
6937 const UserPerm
& perms
, std::string
* alternate_name
)
6941 bool did_lookup_request
= false;
6942 // can only request shared caps
6943 mask
&= CEPH_CAP_ANY_SHARED
| CEPH_STAT_RSTAT
;
6945 if (dname
== "..") {
6946 if (dir
->dentries
.empty()) {
6947 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
6948 filepath
path(dir
->ino
);
6949 req
->set_filepath(path
);
6952 int r
= make_request(req
, perms
, &tmptarget
, NULL
, rand() % mdsmap
->get_num_in_mds());
6955 *target
= std::move(tmptarget
);
6956 ldout(cct
, 8) << __func__
<< " found target " << (*target
)->ino
<< dendl
;
6962 *target
= dir
->get_first_parent()->dir
->parent_inode
; //dirs can't be hard-linked
6971 if (!dir
->is_dir()) {
6972 r
= -CEPHFS_ENOTDIR
;
6976 if (dname
.length() > NAME_MAX
) {
6977 r
= -CEPHFS_ENAMETOOLONG
;
6981 if (dname
== cct
->_conf
->client_snapdir
&&
6982 dir
->snapid
== CEPH_NOSNAP
) {
6983 *target
= open_snapdir(dir
);
6989 dir
->dir
->dentries
.count(dname
)) {
6990 dn
= dir
->dir
->dentries
[dname
];
6992 ldout(cct
, 20) << __func__
<< " have " << *dn
<< " from mds." << dn
->lease_mds
6993 << " ttl " << dn
->lease_ttl
<< " seq " << dn
->lease_seq
<< dendl
;
6995 if (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)) {
6996 if (_dentry_valid(dn
)) {
6997 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6998 // make trim_caps() behave.
6999 dir
->try_touch_cap(dn
->lease_mds
);
7003 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
7004 if (dn
->cap_shared_gen
== dir
->shared_gen
&&
7005 (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)))
7007 if (!dn
->inode
&& (dir
->flags
& I_COMPLETE
)) {
7008 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for "
7009 << *dir
<< " dn '" << dname
<< "'" << dendl
;
7010 return -CEPHFS_ENOENT
;
7014 ldout(cct
, 20) << " no cap on " << dn
->inode
->vino() << dendl
;
7017 // can we conclude ENOENT locally?
7018 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true) &&
7019 (dir
->flags
& I_COMPLETE
)) {
7020 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for " << *dir
<< " dn '" << dname
<< "'" << dendl
;
7021 return -CEPHFS_ENOENT
;
7025 if (did_lookup_request
) {
7029 r
= _do_lookup(dir
, dname
, mask
, target
, perms
);
7030 did_lookup_request
= true;
7032 /* complete lookup to get dentry for alternate_name */
7040 *target
= dn
->inode
;
7042 *alternate_name
= dn
->alternate_name
;
7051 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << r
<< dendl
;
7053 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << **target
<< dendl
;
7057 int Client::get_or_create(Inode
*dir
, const char* name
,
7058 Dentry
**pdn
, bool expect_null
)
7061 ldout(cct
, 20) << __func__
<< " " << *dir
<< " name " << name
<< dendl
;
7063 if (dir
->dir
->dentries
.count(name
)) {
7064 Dentry
*dn
= dir
->dir
->dentries
[name
];
7065 if (_dentry_valid(dn
)) {
7067 return -CEPHFS_EEXIST
;
7071 // otherwise link up a new one
7072 *pdn
= link(dir
->dir
, name
, NULL
, NULL
);
7079 int Client::walk(std::string_view path
, walk_dentry_result
* wdr
, const UserPerm
& perms
, bool followsym
)
7081 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7082 if (!mref_reader
.is_state_satisfied())
7083 return -CEPHFS_ENOTCONN
;
7085 ldout(cct
, 10) << __func__
<< ": " << path
<< dendl
;
7087 std::scoped_lock
lock(client_lock
);
7089 return path_walk(path
, wdr
, perms
, followsym
);
7092 int Client::path_walk(const filepath
& origpath
, InodeRef
*end
,
7093 const UserPerm
& perms
, bool followsym
, int mask
, InodeRef dirinode
)
7095 walk_dentry_result wdr
;
7096 int rc
= path_walk(origpath
, &wdr
, perms
, followsym
, mask
, dirinode
);
7097 *end
= std::move(wdr
.in
);
7101 int Client::path_walk(const filepath
& origpath
, walk_dentry_result
* result
, const UserPerm
& perms
,
7102 bool followsym
, int mask
, InodeRef dirinode
)
7104 filepath path
= origpath
;
7106 std::string alternate_name
;
7107 if (origpath
.absolute())
7116 ldout(cct
, 20) << __func__
<< " cur=" << *cur
<< dendl
;
7117 ldout(cct
, 10) << __func__
<< " " << path
<< dendl
;
7122 while (i
< path
.depth() && cur
) {
7124 const string
&dname
= path
[i
];
7125 ldout(cct
, 10) << " " << i
<< " " << *cur
<< " " << dname
<< dendl
;
7126 ldout(cct
, 20) << " (path is " << path
<< ")" << dendl
;
7128 if (cct
->_conf
->client_permissions
) {
7129 int r
= may_lookup(cur
.get(), perms
);
7132 caps
= CEPH_CAP_AUTH_SHARED
;
7135 /* Get extra requested caps on the last component */
7136 if (i
== (path
.depth() - 1))
7138 int r
= _lookup(cur
.get(), dname
, caps
, &next
, perms
, &alternate_name
);
7141 // only follow trailing symlink if followsym. always follow
7142 // 'directory' symlinks.
7143 if (next
&& next
->is_symlink()) {
7145 ldout(cct
, 20) << " symlink count " << symlinks
<< ", value is '" << next
->symlink
<< "'" << dendl
;
7146 if (symlinks
> MAXSYMLINKS
) {
7147 return -CEPHFS_ELOOP
;
7150 if (i
< path
.depth() - 1) {
7152 // replace consumed components of path with symlink dir target
7153 filepath
resolved(next
->symlink
.c_str());
7154 resolved
.append(path
.postfixpath(i
+ 1));
7157 if (next
->symlink
[0] == '/') {
7161 } else if (followsym
) {
7162 if (next
->symlink
[0] == '/') {
7163 path
= next
->symlink
.c_str();
7168 filepath
more(next
->symlink
.c_str());
7169 // we need to remove the symlink component from off of the path
7170 // before adding the target that the symlink points to. remain
7171 // at the same position in the path.
7182 return -CEPHFS_ENOENT
;
7184 result
->in
= std::move(cur
);
7185 result
->alternate_name
= std::move(alternate_name
);
7193 int Client::link(const char *relexisting
, const char *relpath
, const UserPerm
& perm
, std::string alternate_name
)
7195 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7196 if (!mref_reader
.is_state_satisfied())
7197 return -CEPHFS_ENOTCONN
;
7199 tout(cct
) << "link" << std::endl
;
7200 tout(cct
) << relexisting
<< std::endl
;
7201 tout(cct
) << relpath
<< std::endl
;
7203 filepath
existing(relexisting
);
7207 std::scoped_lock
lock(client_lock
);
7208 int r
= path_walk(existing
, &in
, perm
, true);
7211 if (std::string(relpath
) == "/") {
7215 filepath
path(relpath
);
7216 string name
= path
.last_dentry();
7219 r
= path_walk(path
, &dir
, perm
, true);
7222 if (cct
->_conf
->client_permissions
) {
7223 if (S_ISDIR(in
->mode
)) {
7227 r
= may_hardlink(in
.get(), perm
);
7230 r
= may_create(dir
.get(), perm
);
7234 r
= _link(in
.get(), dir
.get(), name
.c_str(), perm
, std::move(alternate_name
));
7238 int Client::unlink(const char *relpath
, const UserPerm
& perm
)
7240 return unlinkat(CEPHFS_AT_FDCWD
, relpath
, 0, perm
);
7243 int Client::unlinkat(int dirfd
, const char *relpath
, int flags
, const UserPerm
& perm
)
7245 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7246 if (!mref_reader
.is_state_satisfied()) {
7247 return -CEPHFS_ENOTCONN
;
7250 tout(cct
) << __func__
<< std::endl
;
7251 tout(cct
) << dirfd
<< std::endl
;
7252 tout(cct
) << relpath
<< std::endl
;
7253 tout(cct
) << flags
<< std::endl
;
7255 if (std::string(relpath
) == "/") {
7256 return flags
& AT_REMOVEDIR
? -CEPHFS_EBUSY
: -CEPHFS_EISDIR
;
7259 filepath
path(relpath
);
7260 string name
= path
.last_dentry();
7264 std::scoped_lock
lock(client_lock
);
7267 int r
= get_fd_inode(dirfd
, &dirinode
);
7272 r
= path_walk(path
, &dir
, perm
, true, 0, dirinode
);
7276 if (cct
->_conf
->client_permissions
) {
7277 r
= may_delete(dir
.get(), name
.c_str(), perm
);
7282 if (flags
& AT_REMOVEDIR
) {
7283 r
= _rmdir(dir
.get(), name
.c_str(), perm
);
7285 r
= _unlink(dir
.get(), name
.c_str(), perm
);
7290 int Client::rename(const char *relfrom
, const char *relto
, const UserPerm
& perm
, std::string alternate_name
)
7292 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7293 if (!mref_reader
.is_state_satisfied())
7294 return -CEPHFS_ENOTCONN
;
7296 tout(cct
) << __func__
<< std::endl
;
7297 tout(cct
) << relfrom
<< std::endl
;
7298 tout(cct
) << relto
<< std::endl
;
7300 if (std::string(relfrom
) == "/" || std::string(relto
) == "/")
7301 return -CEPHFS_EBUSY
;
7303 filepath
from(relfrom
);
7305 string fromname
= from
.last_dentry();
7307 string toname
= to
.last_dentry();
7310 InodeRef fromdir
, todir
;
7312 std::scoped_lock
lock(client_lock
);
7313 int r
= path_walk(from
, &fromdir
, perm
);
7316 r
= path_walk(to
, &todir
, perm
);
7320 if (cct
->_conf
->client_permissions
) {
7321 int r
= may_delete(fromdir
.get(), fromname
.c_str(), perm
);
7324 r
= may_delete(todir
.get(), toname
.c_str(), perm
);
7325 if (r
< 0 && r
!= -CEPHFS_ENOENT
)
7328 r
= _rename(fromdir
.get(), fromname
.c_str(), todir
.get(), toname
.c_str(), perm
, std::move(alternate_name
));
7335 int Client::mkdir(const char *relpath
, mode_t mode
, const UserPerm
& perm
, std::string alternate_name
)
7337 return mkdirat(CEPHFS_AT_FDCWD
, relpath
, mode
, perm
, alternate_name
);
7340 int Client::mkdirat(int dirfd
, const char *relpath
, mode_t mode
, const UserPerm
& perm
,
7341 std::string alternate_name
)
7343 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7344 if (!mref_reader
.is_state_satisfied())
7345 return -CEPHFS_ENOTCONN
;
7347 tout(cct
) << __func__
<< std::endl
;
7348 tout(cct
) << dirfd
<< std::endl
;
7349 tout(cct
) << relpath
<< std::endl
;
7350 tout(cct
) << mode
<< std::endl
;
7351 ldout(cct
, 10) << __func__
<< ": " << relpath
<< dendl
;
7353 if (std::string(relpath
) == "/") {
7354 return -CEPHFS_EEXIST
;
7357 filepath
path(relpath
);
7358 string name
= path
.last_dentry();
7362 std::scoped_lock
lock(client_lock
);
7365 int r
= get_fd_inode(dirfd
, &dirinode
);
7370 r
= path_walk(path
, &dir
, perm
, true, 0, dirinode
);
7374 if (cct
->_conf
->client_permissions
) {
7375 r
= may_create(dir
.get(), perm
);
7380 return _mkdir(dir
.get(), name
.c_str(), mode
, perm
, 0, {}, std::move(alternate_name
));
7383 int Client::mkdirs(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
7385 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7386 if (!mref_reader
.is_state_satisfied())
7387 return -CEPHFS_ENOTCONN
;
7389 ldout(cct
, 10) << "Client::mkdirs " << relpath
<< dendl
;
7390 tout(cct
) << __func__
<< std::endl
;
7391 tout(cct
) << relpath
<< std::endl
;
7392 tout(cct
) << mode
<< std::endl
;
7394 //get through existing parts of path
7395 filepath
path(relpath
);
7397 int r
= 0, caps
= 0;
7400 std::scoped_lock
lock(client_lock
);
7402 for (i
=0; i
<path
.depth(); ++i
) {
7403 if (cct
->_conf
->client_permissions
) {
7404 r
= may_lookup(cur
.get(), perms
);
7407 caps
= CEPH_CAP_AUTH_SHARED
;
7409 r
= _lookup(cur
.get(), path
[i
].c_str(), caps
, &next
, perms
);
7414 if (r
!=-CEPHFS_ENOENT
) return r
;
7415 ldout(cct
, 20) << __func__
<< " got through " << i
<< " directories on path " << relpath
<< dendl
;
7416 //make new directory at each level
7417 for (; i
<path
.depth(); ++i
) {
7418 if (cct
->_conf
->client_permissions
) {
7419 r
= may_create(cur
.get(), perms
);
7424 r
= _mkdir(cur
.get(), path
[i
].c_str(), mode
, perms
, &next
);
7426 //check proper creation/existence
7427 if(-CEPHFS_EEXIST
== r
&& i
< path
.depth() - 1) {
7428 r
= _lookup(cur
.get(), path
[i
].c_str(), CEPH_CAP_AUTH_SHARED
, &next
, perms
);
7432 //move to new dir and continue
7434 ldout(cct
, 20) << __func__
<< ": successfully created directory "
7435 << filepath(cur
->ino
).get_path() << dendl
;
7440 int Client::rmdir(const char *relpath
, const UserPerm
& perms
)
7442 return unlinkat(CEPHFS_AT_FDCWD
, relpath
, AT_REMOVEDIR
, perms
);
7445 int Client::mknod(const char *relpath
, mode_t mode
, const UserPerm
& perms
, dev_t rdev
)
7447 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7448 if (!mref_reader
.is_state_satisfied())
7449 return -CEPHFS_ENOTCONN
;
7451 tout(cct
) << __func__
<< std::endl
;
7452 tout(cct
) << relpath
<< std::endl
;
7453 tout(cct
) << mode
<< std::endl
;
7454 tout(cct
) << rdev
<< std::endl
;
7456 if (std::string(relpath
) == "/")
7457 return -CEPHFS_EEXIST
;
7459 filepath
path(relpath
);
7460 string name
= path
.last_dentry();
7464 std::scoped_lock
lock(client_lock
);
7465 int r
= path_walk(path
, &dir
, perms
);
7468 if (cct
->_conf
->client_permissions
) {
7469 int r
= may_create(dir
.get(), perms
);
7473 return _mknod(dir
.get(), name
.c_str(), mode
, rdev
, perms
);
7478 int Client::symlink(const char *target
, const char *relpath
, const UserPerm
& perms
, std::string alternate_name
)
7480 return symlinkat(target
, CEPHFS_AT_FDCWD
, relpath
, perms
, alternate_name
);
7483 int Client::symlinkat(const char *target
, int dirfd
, const char *relpath
, const UserPerm
& perms
,
7484 std::string alternate_name
)
7486 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7487 if (!mref_reader
.is_state_satisfied()) {
7488 return -CEPHFS_ENOTCONN
;
7491 tout(cct
) << __func__
<< std::endl
;
7492 tout(cct
) << target
<< std::endl
;
7493 tout(cct
) << dirfd
<< std::endl
;
7494 tout(cct
) << relpath
<< std::endl
;
7496 if (std::string(relpath
) == "/") {
7497 return -CEPHFS_EEXIST
;
7500 filepath
path(relpath
);
7501 string name
= path
.last_dentry();
7505 std::scoped_lock
lock(client_lock
);
7508 int r
= get_fd_inode(dirfd
, &dirinode
);
7512 r
= path_walk(path
, &dir
, perms
, true, 0, dirinode
);
7516 if (cct
->_conf
->client_permissions
) {
7517 int r
= may_create(dir
.get(), perms
);
7522 return _symlink(dir
.get(), name
.c_str(), target
, perms
, std::move(alternate_name
));
7525 int Client::readlink(const char *relpath
, char *buf
, loff_t size
, const UserPerm
& perms
)
7527 return readlinkat(CEPHFS_AT_FDCWD
, relpath
, buf
, size
, perms
);
7530 int Client::readlinkat(int dirfd
, const char *relpath
, char *buf
, loff_t size
, const UserPerm
& perms
) {
7531 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7532 if (!mref_reader
.is_state_satisfied()) {
7533 return -CEPHFS_ENOTCONN
;
7536 tout(cct
) << __func__
<< std::endl
;
7537 tout(cct
) << dirfd
<< std::endl
;
7538 tout(cct
) << relpath
<< std::endl
;
7541 std::scoped_lock
lock(client_lock
);
7542 int r
= get_fd_inode(dirfd
, &dirinode
);
7548 filepath
path(relpath
);
7549 r
= path_walk(path
, &in
, perms
, false, 0, dirinode
);
7554 return _readlink(in
.get(), buf
, size
);
7557 int Client::_readlink(Inode
*in
, char *buf
, size_t size
)
7559 if (!in
->is_symlink())
7560 return -CEPHFS_EINVAL
;
7562 // copy into buf (at most size bytes)
7563 int r
= in
->symlink
.length();
7566 memcpy(buf
, in
->symlink
.c_str(), r
);
7573 int Client::_getattr(Inode
*in
, int mask
, const UserPerm
& perms
, bool force
)
7575 bool yes
= in
->caps_issued_mask(mask
, true);
7577 ldout(cct
, 10) << __func__
<< " mask " << ccap_string(mask
) << " issued=" << yes
<< dendl
;
7581 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
7583 in
->make_nosnap_relative_path(path
);
7584 req
->set_filepath(path
);
7586 req
->head
.args
.getattr
.mask
= mask
;
7588 int res
= make_request(req
, perms
);
7589 ldout(cct
, 10) << __func__
<< " result=" << res
<< dendl
;
7593 int Client::_getvxattr(
7595 const UserPerm
& perms
,
7596 const char *xattr_name
,
7601 if (!xattr_name
|| strlen(xattr_name
) <= 0 || strlen(xattr_name
) > 255) {
7602 return -CEPHFS_ENODATA
;
7605 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETVXATTR
);
7607 in
->make_nosnap_relative_path(path
);
7608 req
->set_filepath(path
);
7610 req
->set_string2(xattr_name
);
7613 int res
= make_request(req
, perms
, nullptr, nullptr, rank
, &bl
);
7614 ldout(cct
, 10) << __func__
<< " result=" << res
<< dendl
;
7621 auto p
= bl
.cbegin();
7627 ssize_t len
= buf
.length();
7629 res
= len
; // refer to man getxattr(2) for output buffer size == 0
7633 res
= -CEPHFS_ERANGE
; // insufficient output buffer space
7635 memcpy(value
, buf
.c_str(), len
);
7641 int Client::_do_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
7642 const UserPerm
& perms
, InodeRef
*inp
)
7644 int issued
= in
->caps_issued();
7645 union ceph_mds_request_args args
;
7646 bool kill_sguid
= false;
7649 ldout(cct
, 10) << __func__
<< " mask " << mask
<< " issued " <<
7650 ccap_string(issued
) << dendl
;
7652 if (in
->snapid
!= CEPH_NOSNAP
) {
7653 return -CEPHFS_EROFS
;
7655 if ((mask
& CEPH_SETATTR_SIZE
) &&
7656 (uint64_t)stx
->stx_size
> in
->size
&&
7657 is_quota_bytes_exceeded(in
, (uint64_t)stx
->stx_size
- in
->size
,
7659 return -CEPHFS_EDQUOT
;
7662 memset(&args
, 0, sizeof(args
));
7664 // make the change locally?
7665 if ((in
->cap_dirtier_uid
>= 0 && perms
.uid() != in
->cap_dirtier_uid
) ||
7666 (in
->cap_dirtier_gid
>= 0 && perms
.gid() != in
->cap_dirtier_gid
)) {
7667 ldout(cct
, 10) << __func__
<< " caller " << perms
.uid() << ":" << perms
.gid()
7668 << " != cap dirtier " << in
->cap_dirtier_uid
<< ":"
7669 << in
->cap_dirtier_gid
<< ", forcing sync setattr"
7672 * This works because we implicitly flush the caps as part of the
7673 * request, so the cap update check will happen with the writeback
7674 * cap context, and then the setattr check will happen with the
7677 * In reality this pattern is likely pretty rare (different users
7678 * setattr'ing the same file). If that turns out not to be the
7679 * case later, we can build a more complex pipelined cap writeback
7682 mask
|= CEPH_SETATTR_CTIME
;
7686 // caller just needs us to bump the ctime
7687 in
->ctime
= ceph_clock_now();
7688 in
->cap_dirtier_uid
= perms
.uid();
7689 in
->cap_dirtier_gid
= perms
.gid();
7690 if (issued
& CEPH_CAP_AUTH_EXCL
)
7691 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7692 else if (issued
& CEPH_CAP_FILE_EXCL
)
7693 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
7694 else if (issued
& CEPH_CAP_XATTR_EXCL
)
7695 in
->mark_caps_dirty(CEPH_CAP_XATTR_EXCL
);
7697 mask
|= CEPH_SETATTR_CTIME
;
7700 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
7701 kill_sguid
= mask
& (CEPH_SETATTR_SIZE
|CEPH_SETATTR_KILL_SGUID
);
7703 mask
&= ~CEPH_SETATTR_KILL_SGUID
;
7704 } else if (mask
& CEPH_SETATTR_SIZE
) {
7705 /* If we don't have Ax, then we must ask the server to clear them on truncate */
7706 mask
|= CEPH_SETATTR_KILL_SGUID
;
7707 inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7710 if (mask
& CEPH_SETATTR_UID
) {
7711 ldout(cct
,10) << "changing uid to " << stx
->stx_uid
<< dendl
;
7713 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
7714 in
->ctime
= ceph_clock_now();
7715 in
->cap_dirtier_uid
= perms
.uid();
7716 in
->cap_dirtier_gid
= perms
.gid();
7717 in
->uid
= stx
->stx_uid
;
7718 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7719 mask
&= ~CEPH_SETATTR_UID
;
7721 } else if (!in
->caps_issued_mask(CEPH_CAP_AUTH_SHARED
) ||
7722 in
->uid
!= stx
->stx_uid
) {
7723 args
.setattr
.uid
= stx
->stx_uid
;
7724 inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7726 mask
&= ~CEPH_SETATTR_UID
;
7730 if (mask
& CEPH_SETATTR_GID
) {
7731 ldout(cct
,10) << "changing gid to " << stx
->stx_gid
<< dendl
;
7733 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
7734 in
->ctime
= ceph_clock_now();
7735 in
->cap_dirtier_uid
= perms
.uid();
7736 in
->cap_dirtier_gid
= perms
.gid();
7737 in
->gid
= stx
->stx_gid
;
7738 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7739 mask
&= ~CEPH_SETATTR_GID
;
7741 } else if (!in
->caps_issued_mask(CEPH_CAP_AUTH_SHARED
) ||
7742 in
->gid
!= stx
->stx_gid
) {
7743 args
.setattr
.gid
= stx
->stx_gid
;
7744 inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7746 mask
&= ~CEPH_SETATTR_GID
;
7750 if (mask
& CEPH_SETATTR_MODE
) {
7751 ldout(cct
,10) << "changing mode to " << stx
->stx_mode
<< dendl
;
7753 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
7754 in
->ctime
= ceph_clock_now();
7755 in
->cap_dirtier_uid
= perms
.uid();
7756 in
->cap_dirtier_gid
= perms
.gid();
7757 in
->mode
= (in
->mode
& ~07777) | (stx
->stx_mode
& 07777);
7758 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7759 mask
&= ~CEPH_SETATTR_MODE
;
7760 } else if (!in
->caps_issued_mask(CEPH_CAP_AUTH_SHARED
) ||
7761 in
->mode
!= stx
->stx_mode
) {
7762 args
.setattr
.mode
= stx
->stx_mode
;
7763 inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7765 mask
&= ~CEPH_SETATTR_MODE
;
7767 } else if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
) &&
7768 kill_sguid
&& S_ISREG(in
->mode
) &&
7769 (in
->mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
7770 /* Must squash the any setuid/setgid bits with an ownership change */
7771 in
->mode
&= ~(S_ISUID
|S_ISGID
);
7772 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7775 if (mask
& CEPH_SETATTR_BTIME
) {
7776 ldout(cct
,10) << "changing btime to " << in
->btime
<< dendl
;
7778 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
7779 in
->ctime
= ceph_clock_now();
7780 in
->cap_dirtier_uid
= perms
.uid();
7781 in
->cap_dirtier_gid
= perms
.gid();
7782 in
->btime
= utime_t(stx
->stx_btime
);
7783 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7784 mask
&= ~CEPH_SETATTR_BTIME
;
7785 } else if (!in
->caps_issued_mask(CEPH_CAP_AUTH_SHARED
) ||
7786 in
->btime
!= utime_t(stx
->stx_btime
)) {
7787 args
.setattr
.btime
= utime_t(stx
->stx_btime
);
7788 inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7790 mask
&= ~CEPH_SETATTR_BTIME
;
7794 if (mask
& CEPH_SETATTR_SIZE
) {
7795 if ((uint64_t)stx
->stx_size
>= mdsmap
->get_max_filesize()) {
7797 ldout(cct
,10) << "unable to set size to " << stx
->stx_size
<< ". Too large!" << dendl
;
7798 return -CEPHFS_EFBIG
;
7801 ldout(cct
,10) << "changing size to " << stx
->stx_size
<< dendl
;
7802 if (in
->caps_issued_mask(CEPH_CAP_FILE_EXCL
) &&
7803 !(mask
& CEPH_SETATTR_KILL_SGUID
) &&
7804 stx
->stx_size
>= in
->size
) {
7805 if (stx
->stx_size
> in
->size
) {
7806 in
->size
= in
->reported_size
= stx
->stx_size
;
7807 in
->cap_dirtier_uid
= perms
.uid();
7808 in
->cap_dirtier_gid
= perms
.gid();
7809 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
7810 mask
&= ~(CEPH_SETATTR_SIZE
);
7811 mask
|= CEPH_SETATTR_MTIME
;
7813 // ignore it when size doesn't change
7814 mask
&= ~(CEPH_SETATTR_SIZE
);
7817 args
.setattr
.size
= stx
->stx_size
;
7818 inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
7823 if (mask
& CEPH_SETATTR_MTIME
) {
7824 if (in
->caps_issued_mask(CEPH_CAP_FILE_EXCL
)) {
7825 in
->mtime
= utime_t(stx
->stx_mtime
);
7826 in
->ctime
= ceph_clock_now();
7827 in
->cap_dirtier_uid
= perms
.uid();
7828 in
->cap_dirtier_gid
= perms
.gid();
7829 in
->time_warp_seq
++;
7830 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
7831 mask
&= ~CEPH_SETATTR_MTIME
;
7832 } else if (in
->caps_issued_mask(CEPH_CAP_FILE_WR
) &&
7833 utime_t(stx
->stx_mtime
) > in
->mtime
) {
7834 in
->mtime
= utime_t(stx
->stx_mtime
);
7835 in
->ctime
= ceph_clock_now();
7836 in
->cap_dirtier_uid
= perms
.uid();
7837 in
->cap_dirtier_gid
= perms
.gid();
7838 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
7839 mask
&= ~CEPH_SETATTR_MTIME
;
7840 } else if (!in
->caps_issued_mask(CEPH_CAP_FILE_SHARED
) ||
7841 in
->mtime
!= utime_t(stx
->stx_mtime
)) {
7842 args
.setattr
.mtime
= utime_t(stx
->stx_mtime
);
7843 inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
7846 mask
&= ~CEPH_SETATTR_MTIME
;
7850 if (mask
& CEPH_SETATTR_ATIME
) {
7851 if (in
->caps_issued_mask(CEPH_CAP_FILE_EXCL
)) {
7852 in
->atime
= utime_t(stx
->stx_atime
);
7853 in
->ctime
= ceph_clock_now();
7854 in
->cap_dirtier_uid
= perms
.uid();
7855 in
->cap_dirtier_gid
= perms
.gid();
7856 in
->time_warp_seq
++;
7857 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
7858 mask
&= ~CEPH_SETATTR_ATIME
;
7859 } else if (in
->caps_issued_mask(CEPH_CAP_FILE_WR
) &&
7860 utime_t(stx
->stx_atime
) > in
->atime
) {
7861 in
->atime
= utime_t(stx
->stx_atime
);
7862 in
->ctime
= ceph_clock_now();
7863 in
->cap_dirtier_uid
= perms
.uid();
7864 in
->cap_dirtier_gid
= perms
.gid();
7865 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
7866 mask
&= ~CEPH_SETATTR_ATIME
;
7867 } else if (!in
->caps_issued_mask(CEPH_CAP_FILE_SHARED
) ||
7868 in
->atime
!= utime_t(stx
->stx_atime
)) {
7869 args
.setattr
.atime
= utime_t(stx
->stx_atime
);
7870 inode_drop
|= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_RD
|
7873 mask
&= ~CEPH_SETATTR_ATIME
;
7882 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETATTR
);
7886 in
->make_nosnap_relative_path(path
);
7887 req
->set_filepath(path
);
7890 req
->head
.args
= args
;
7891 req
->inode_drop
= inode_drop
;
7892 req
->head
.args
.setattr
.mask
= mask
;
7893 req
->regetattr_mask
= mask
;
7895 int res
= make_request(req
, perms
, inp
);
7896 ldout(cct
, 10) << "_setattr result=" << res
<< dendl
;
7900 /* Note that we only care about attrs that setattr cares about */
7901 void Client::stat_to_statx(struct stat
*st
, struct ceph_statx
*stx
)
7903 stx
->stx_size
= st
->st_size
;
7904 stx
->stx_mode
= st
->st_mode
;
7905 stx
->stx_uid
= st
->st_uid
;
7906 stx
->stx_gid
= st
->st_gid
;
7908 stx
->stx_mtime
= st
->st_mtimespec
;
7909 stx
->stx_atime
= st
->st_atimespec
;
7911 stx
->stx_mtime
.tv_sec
= st
->st_mtime
;
7912 stx
->stx_atime
.tv_sec
= st
->st_atime
;
7914 stx
->stx_mtime
= st
->st_mtim
;
7915 stx
->stx_atime
= st
->st_atim
;
7919 int Client::__setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
7920 const UserPerm
& perms
, InodeRef
*inp
)
7922 int ret
= _do_setattr(in
, stx
, mask
, perms
, inp
);
7925 if (mask
& CEPH_SETATTR_MODE
)
7926 ret
= _posix_acl_chmod(in
, stx
->stx_mode
, perms
);
7930 int Client::_setattrx(InodeRef
&in
, struct ceph_statx
*stx
, int mask
,
7931 const UserPerm
& perms
)
7933 mask
&= (CEPH_SETATTR_MODE
| CEPH_SETATTR_UID
|
7934 CEPH_SETATTR_GID
| CEPH_SETATTR_MTIME
|
7935 CEPH_SETATTR_ATIME
| CEPH_SETATTR_SIZE
|
7936 CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
);
7937 if (cct
->_conf
->client_permissions
) {
7938 int r
= may_setattr(in
.get(), stx
, mask
, perms
);
7942 return __setattrx(in
.get(), stx
, mask
, perms
);
7945 int Client::_setattr(InodeRef
&in
, struct stat
*attr
, int mask
,
7946 const UserPerm
& perms
)
7948 struct ceph_statx stx
;
7950 stat_to_statx(attr
, &stx
);
7951 mask
&= ~CEPH_SETATTR_BTIME
;
7953 if ((mask
& CEPH_SETATTR_UID
) && attr
->st_uid
== static_cast<uid_t
>(-1)) {
7954 mask
&= ~CEPH_SETATTR_UID
;
7956 if ((mask
& CEPH_SETATTR_GID
) && attr
->st_gid
== static_cast<uid_t
>(-1)) {
7957 mask
&= ~CEPH_SETATTR_GID
;
7960 return _setattrx(in
, &stx
, mask
, perms
);
7963 int Client::setattr(const char *relpath
, struct stat
*attr
, int mask
,
7964 const UserPerm
& perms
)
7966 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7967 if (!mref_reader
.is_state_satisfied())
7968 return -CEPHFS_ENOTCONN
;
7970 tout(cct
) << __func__
<< std::endl
;
7971 tout(cct
) << relpath
<< std::endl
;
7972 tout(cct
) << mask
<< std::endl
;
7974 filepath
path(relpath
);
7977 std::scoped_lock
lock(client_lock
);
7978 int r
= path_walk(path
, &in
, perms
);
7981 return _setattr(in
, attr
, mask
, perms
);
7984 int Client::setattrx(const char *relpath
, struct ceph_statx
*stx
, int mask
,
7985 const UserPerm
& perms
, int flags
)
7987 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7988 if (!mref_reader
.is_state_satisfied())
7989 return -CEPHFS_ENOTCONN
;
7991 tout(cct
) << __func__
<< std::endl
;
7992 tout(cct
) << relpath
<< std::endl
;
7993 tout(cct
) << mask
<< std::endl
;
7995 filepath
path(relpath
);
7998 std::scoped_lock
lock(client_lock
);
7999 int r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
));
8002 return _setattrx(in
, stx
, mask
, perms
);
8005 int Client::fsetattr(int fd
, struct stat
*attr
, int mask
, const UserPerm
& perms
)
8007 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8008 if (!mref_reader
.is_state_satisfied())
8009 return -CEPHFS_ENOTCONN
;
8011 tout(cct
) << __func__
<< std::endl
;
8012 tout(cct
) << fd
<< std::endl
;
8013 tout(cct
) << mask
<< std::endl
;
8015 std::scoped_lock
lock(client_lock
);
8016 Fh
*f
= get_filehandle(fd
);
8018 return -CEPHFS_EBADF
;
8019 #if defined(__linux__) && defined(O_PATH)
8020 if (f
->flags
& O_PATH
)
8021 return -CEPHFS_EBADF
;
8023 return _setattr(f
->inode
, attr
, mask
, perms
);
8026 int Client::fsetattrx(int fd
, struct ceph_statx
*stx
, int mask
, const UserPerm
& perms
)
8028 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8029 if (!mref_reader
.is_state_satisfied())
8030 return -CEPHFS_ENOTCONN
;
8032 tout(cct
) << __func__
<< std::endl
;
8033 tout(cct
) << fd
<< std::endl
;
8034 tout(cct
) << mask
<< std::endl
;
8036 std::scoped_lock
lock(client_lock
);
8037 Fh
*f
= get_filehandle(fd
);
8039 return -CEPHFS_EBADF
;
8040 #if defined(__linux__) && defined(O_PATH)
8041 if (f
->flags
& O_PATH
)
8042 return -CEPHFS_EBADF
;
8044 return _setattrx(f
->inode
, stx
, mask
, perms
);
8047 int Client::stat(const char *relpath
, struct stat
*stbuf
, const UserPerm
& perms
,
8048 frag_info_t
*dirstat
, int mask
)
8050 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8051 if (!mref_reader
.is_state_satisfied())
8052 return -CEPHFS_ENOTCONN
;
8054 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
8055 tout(cct
) << "stat" << std::endl
;
8056 tout(cct
) << relpath
<< std::endl
;
8058 filepath
path(relpath
);
8061 std::scoped_lock
lock(client_lock
);
8062 int r
= path_walk(path
, &in
, perms
, true, mask
);
8065 r
= _getattr(in
, mask
, perms
);
8067 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
8070 fill_stat(in
, stbuf
, dirstat
);
8071 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
8075 unsigned Client::statx_to_mask(unsigned int flags
, unsigned int want
)
8079 /* The AT_STATX_FORCE_SYNC is always in higher priority than AT_STATX_DONT_SYNC. */
8080 if ((flags
& AT_STATX_SYNC_TYPE
) == AT_STATX_DONT_SYNC
)
8083 /* Always set PIN to distinguish from AT_STATX_DONT_SYNC case */
8084 mask
|= CEPH_CAP_PIN
;
8085 if (want
& (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
8086 mask
|= CEPH_CAP_AUTH_SHARED
;
8087 if (want
& (CEPH_STATX_NLINK
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
8088 mask
|= CEPH_CAP_LINK_SHARED
;
8089 if (want
& (CEPH_STATX_NLINK
|CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|CEPH_STATX_CTIME
|CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
|CEPH_STATX_VERSION
))
8090 mask
|= CEPH_CAP_FILE_SHARED
;
8091 if (want
& (CEPH_STATX_VERSION
|CEPH_STATX_CTIME
))
8092 mask
|= CEPH_CAP_XATTR_SHARED
;
8097 int Client::statx(const char *relpath
, struct ceph_statx
*stx
,
8098 const UserPerm
& perms
,
8099 unsigned int want
, unsigned int flags
)
8101 return statxat(CEPHFS_AT_FDCWD
, relpath
, stx
, perms
, want
, flags
);
8104 int Client::lstat(const char *relpath
, struct stat
*stbuf
,
8105 const UserPerm
& perms
, frag_info_t
*dirstat
, int mask
)
8107 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8108 if (!mref_reader
.is_state_satisfied())
8109 return -CEPHFS_ENOTCONN
;
8111 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
8112 tout(cct
) << __func__
<< std::endl
;
8113 tout(cct
) << relpath
<< std::endl
;
8115 filepath
path(relpath
);
8118 std::scoped_lock
lock(client_lock
);
8119 // don't follow symlinks
8120 int r
= path_walk(path
, &in
, perms
, false, mask
);
8123 r
= _getattr(in
, mask
, perms
);
8125 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
8128 fill_stat(in
, stbuf
, dirstat
);
8129 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
8133 int Client::fill_stat(Inode
*in
, struct stat
*st
, frag_info_t
*dirstat
, nest_info_t
*rstat
)
8135 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
8136 << " mode 0" << oct
<< in
->mode
<< dec
8137 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
8138 memset(st
, 0, sizeof(struct stat
));
8139 if (use_faked_inos())
8140 st
->st_ino
= in
->faked_ino
;
8142 st
->st_ino
= in
->ino
;
8143 st
->st_dev
= in
->snapid
;
8144 st
->st_mode
= in
->mode
;
8145 st
->st_rdev
= in
->rdev
;
8147 switch (in
->nlink
) {
8149 st
->st_nlink
= 0; /* dir is unlinked */
8152 st
->st_nlink
= 1 /* parent dentry */
8154 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
8160 st
->st_nlink
= in
->nlink
;
8162 st
->st_uid
= in
->uid
;
8163 st
->st_gid
= in
->gid
;
8164 if (in
->ctime
> in
->mtime
) {
8165 stat_set_ctime_sec(st
, in
->ctime
.sec());
8166 stat_set_ctime_nsec(st
, in
->ctime
.nsec());
8168 stat_set_ctime_sec(st
, in
->mtime
.sec());
8169 stat_set_ctime_nsec(st
, in
->mtime
.nsec());
8171 stat_set_atime_sec(st
, in
->atime
.sec());
8172 stat_set_atime_nsec(st
, in
->atime
.nsec());
8173 stat_set_mtime_sec(st
, in
->mtime
.sec());
8174 stat_set_mtime_nsec(st
, in
->mtime
.nsec());
8176 if (cct
->_conf
->client_dirsize_rbytes
)
8177 st
->st_size
= in
->rstat
.rbytes
;
8179 st
->st_size
= in
->dirstat
.size();
8180 // The Windows "stat" structure provides just a subset of the fields that are
8181 // available on Linux.
8186 st
->st_size
= in
->size
;
8188 st
->st_blocks
= (in
->size
+ 511) >> 9;
8192 st
->st_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
8196 *dirstat
= in
->dirstat
;
8200 return in
->caps_issued();
8203 void Client::fill_statx(Inode
*in
, unsigned int mask
, struct ceph_statx
*stx
)
8205 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
8206 << " mode 0" << oct
<< in
->mode
<< dec
8207 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
8208 memset(stx
, 0, sizeof(struct ceph_statx
));
8211 * If mask is 0, then the caller set AT_STATX_DONT_SYNC. Reset the mask
8212 * so that all bits are set.
8217 /* These are always considered to be available */
8218 stx
->stx_dev
= in
->snapid
;
8219 stx
->stx_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
8221 /* Type bits are always set, even when CEPH_STATX_MODE is not */
8222 stx
->stx_mode
= S_IFMT
& in
->mode
;
8223 stx
->stx_ino
= use_faked_inos() ? in
->faked_ino
: (ino_t
)in
->ino
;
8224 stx
->stx_rdev
= in
->rdev
;
8225 stx
->stx_mask
|= (CEPH_STATX_INO
|CEPH_STATX_RDEV
);
8227 if (mask
& CEPH_CAP_AUTH_SHARED
) {
8228 stx
->stx_uid
= in
->uid
;
8229 stx
->stx_gid
= in
->gid
;
8230 stx
->stx_mode
= in
->mode
;
8231 in
->btime
.to_timespec(&stx
->stx_btime
);
8232 stx
->stx_mask
|= (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
);
8235 if (mask
& CEPH_CAP_LINK_SHARED
) {
8237 switch (in
->nlink
) {
8239 stx
->stx_nlink
= 0; /* dir is unlinked */
8242 stx
->stx_nlink
= 1 /* parent dentry */
8244 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
8250 stx
->stx_nlink
= in
->nlink
;
8252 stx
->stx_mask
|= CEPH_STATX_NLINK
;
8255 if (mask
& CEPH_CAP_FILE_SHARED
) {
8257 in
->atime
.to_timespec(&stx
->stx_atime
);
8258 in
->mtime
.to_timespec(&stx
->stx_mtime
);
8261 if (cct
->_conf
->client_dirsize_rbytes
)
8262 stx
->stx_size
= in
->rstat
.rbytes
;
8264 stx
->stx_size
= in
->dirstat
.size();
8265 stx
->stx_blocks
= 1;
8267 stx
->stx_size
= in
->size
;
8268 stx
->stx_blocks
= (in
->size
+ 511) >> 9;
8270 stx
->stx_mask
|= (CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|
8271 CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
);
8274 /* Change time and change_attr both require all shared caps to view */
8275 if ((mask
& CEPH_STAT_CAP_INODE_ALL
) == CEPH_STAT_CAP_INODE_ALL
) {
8276 stx
->stx_version
= in
->change_attr
;
8277 if (in
->ctime
> in
->mtime
)
8278 in
->ctime
.to_timespec(&stx
->stx_ctime
);
8280 in
->mtime
.to_timespec(&stx
->stx_ctime
);
8281 stx
->stx_mask
|= (CEPH_STATX_CTIME
|CEPH_STATX_VERSION
);
8286 void Client::touch_dn(Dentry
*dn
)
8291 int Client::chmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
8293 return chmodat(CEPHFS_AT_FDCWD
, relpath
, mode
, 0, perms
);
8296 int Client::fchmod(int fd
, mode_t mode
, const UserPerm
& perms
)
8298 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8299 if (!mref_reader
.is_state_satisfied())
8300 return -CEPHFS_ENOTCONN
;
8302 tout(cct
) << __func__
<< std::endl
;
8303 tout(cct
) << fd
<< std::endl
;
8304 tout(cct
) << mode
<< std::endl
;
8306 std::scoped_lock
lock(client_lock
);
8307 Fh
*f
= get_filehandle(fd
);
8309 return -CEPHFS_EBADF
;
8310 #if defined(__linux__) && defined(O_PATH)
8311 if (f
->flags
& O_PATH
)
8312 return -CEPHFS_EBADF
;
8315 attr
.st_mode
= mode
;
8316 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MODE
, perms
);
8319 int Client::chmodat(int dirfd
, const char *relpath
, mode_t mode
, int flags
,
8320 const UserPerm
& perms
) {
8321 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8322 if (!mref_reader
.is_state_satisfied()) {
8323 return -CEPHFS_ENOTCONN
;
8326 tout(cct
) << __func__
<< std::endl
;
8327 tout(cct
) << dirfd
<< std::endl
;
8328 tout(cct
) << relpath
<< std::endl
;
8329 tout(cct
) << mode
<< std::endl
;
8330 tout(cct
) << flags
<< std::endl
;
8332 filepath
path(relpath
);
8336 std::scoped_lock
lock(client_lock
);
8337 int r
= get_fd_inode(dirfd
, &dirinode
);
8342 r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), 0, dirinode
);
8347 attr
.st_mode
= mode
;
8348 return _setattr(in
, &attr
, CEPH_SETATTR_MODE
, perms
);
8351 int Client::lchmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
8353 return chmodat(CEPHFS_AT_FDCWD
, relpath
, mode
, AT_SYMLINK_NOFOLLOW
, perms
);
8356 int Client::chown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
8357 const UserPerm
& perms
)
8359 return chownat(CEPHFS_AT_FDCWD
, relpath
, new_uid
, new_gid
, 0, perms
);
8362 int Client::fchown(int fd
, uid_t new_uid
, gid_t new_gid
, const UserPerm
& perms
)
8364 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8365 if (!mref_reader
.is_state_satisfied())
8366 return -CEPHFS_ENOTCONN
;
8368 tout(cct
) << __func__
<< std::endl
;
8369 tout(cct
) << fd
<< std::endl
;
8370 tout(cct
) << new_uid
<< std::endl
;
8371 tout(cct
) << new_gid
<< std::endl
;
8373 std::scoped_lock
lock(client_lock
);
8374 Fh
*f
= get_filehandle(fd
);
8376 return -CEPHFS_EBADF
;
8377 #if defined(__linux__) && defined(O_PATH)
8378 if (f
->flags
& O_PATH
)
8379 return -CEPHFS_EBADF
;
8382 attr
.st_uid
= new_uid
;
8383 attr
.st_gid
= new_gid
;
8385 if (new_uid
!= static_cast<uid_t
>(-1)) mask
|= CEPH_SETATTR_UID
;
8386 if (new_gid
!= static_cast<gid_t
>(-1)) mask
|= CEPH_SETATTR_GID
;
8387 return _setattr(f
->inode
, &attr
, mask
, perms
);
8390 int Client::lchown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
8391 const UserPerm
& perms
)
8393 return chownat(CEPHFS_AT_FDCWD
, relpath
, new_uid
, new_gid
, AT_SYMLINK_NOFOLLOW
, perms
);
8396 int Client::chownat(int dirfd
, const char *relpath
, uid_t new_uid
, gid_t new_gid
,
8397 int flags
, const UserPerm
& perms
) {
8398 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8399 if (!mref_reader
.is_state_satisfied()) {
8400 return -CEPHFS_ENOTCONN
;
8403 tout(cct
) << __func__
<< std::endl
;
8404 tout(cct
) << dirfd
<< std::endl
;
8405 tout(cct
) << relpath
<< std::endl
;
8406 tout(cct
) << new_uid
<< std::endl
;
8407 tout(cct
) << new_gid
<< std::endl
;
8408 tout(cct
) << flags
<< std::endl
;
8410 filepath
path(relpath
);
8414 std::scoped_lock
lock(client_lock
);
8415 int r
= get_fd_inode(dirfd
, &dirinode
);
8420 r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), 0, dirinode
);
8425 attr
.st_uid
= new_uid
;
8426 attr
.st_gid
= new_gid
;
8427 return _setattr(in
, &attr
, CEPH_SETATTR_UID
|CEPH_SETATTR_GID
, perms
);
8430 static void attr_set_atime_and_mtime(struct stat
*attr
,
8431 const utime_t
&atime
,
8432 const utime_t
&mtime
)
8434 stat_set_atime_sec(attr
, atime
.tv
.tv_sec
);
8435 stat_set_atime_nsec(attr
, atime
.tv
.tv_nsec
);
8436 stat_set_mtime_sec(attr
, mtime
.tv
.tv_sec
);
8437 stat_set_mtime_nsec(attr
, mtime
.tv
.tv_nsec
);
8440 // for [l]utime() invoke the timeval variant as the timespec
8441 // variant are not yet implemented. for futime[s](), invoke
8442 // the timespec variant.
8443 int Client::utime(const char *relpath
, struct utimbuf
*buf
,
8444 const UserPerm
& perms
)
8446 struct timeval tv
[2];
8447 tv
[0].tv_sec
= buf
->actime
;
8449 tv
[1].tv_sec
= buf
->modtime
;
8452 return utimes(relpath
, tv
, perms
);
8455 int Client::lutime(const char *relpath
, struct utimbuf
*buf
,
8456 const UserPerm
& perms
)
8458 struct timeval tv
[2];
8459 tv
[0].tv_sec
= buf
->actime
;
8461 tv
[1].tv_sec
= buf
->modtime
;
8464 return lutimes(relpath
, tv
, perms
);
8467 int Client::futime(int fd
, struct utimbuf
*buf
, const UserPerm
& perms
)
8469 struct timespec ts
[2];
8470 ts
[0].tv_sec
= buf
->actime
;
8472 ts
[1].tv_sec
= buf
->modtime
;
8475 return futimens(fd
, ts
, perms
);
8478 int Client::utimes(const char *relpath
, struct timeval times
[2],
8479 const UserPerm
& perms
)
8481 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8482 if (!mref_reader
.is_state_satisfied())
8483 return -CEPHFS_ENOTCONN
;
8485 tout(cct
) << __func__
<< std::endl
;
8486 tout(cct
) << relpath
<< std::endl
;
8487 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
8489 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
8492 filepath
path(relpath
);
8495 std::scoped_lock
lock(client_lock
);
8496 int r
= path_walk(path
, &in
, perms
);
8500 utime_t
atime(times
[0]);
8501 utime_t
mtime(times
[1]);
8503 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
8504 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
8507 int Client::lutimes(const char *relpath
, struct timeval times
[2],
8508 const UserPerm
& perms
)
8510 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8511 if (!mref_reader
.is_state_satisfied())
8512 return -CEPHFS_ENOTCONN
;
8514 tout(cct
) << __func__
<< std::endl
;
8515 tout(cct
) << relpath
<< std::endl
;
8516 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
8518 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
8521 filepath
path(relpath
);
8524 std::scoped_lock
lock(client_lock
);
8525 int r
= path_walk(path
, &in
, perms
, false);
8529 utime_t
atime(times
[0]);
8530 utime_t
mtime(times
[1]);
8532 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
8533 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
8536 int Client::futimes(int fd
, struct timeval times
[2], const UserPerm
& perms
)
8538 struct timespec ts
[2];
8539 ts
[0].tv_sec
= times
[0].tv_sec
;
8540 ts
[0].tv_nsec
= times
[0].tv_usec
* 1000;
8541 ts
[1].tv_sec
= times
[1].tv_sec
;
8542 ts
[1].tv_nsec
= times
[1].tv_usec
* 1000;
8544 return futimens(fd
, ts
, perms
);
8547 int Client::futimens(int fd
, struct timespec times
[2], const UserPerm
& perms
)
8549 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8550 if (!mref_reader
.is_state_satisfied())
8551 return -CEPHFS_ENOTCONN
;
8553 tout(cct
) << __func__
<< std::endl
;
8554 tout(cct
) << fd
<< std::endl
;
8555 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_nsec
8557 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_nsec
8560 std::scoped_lock
lock(client_lock
);
8561 Fh
*f
= get_filehandle(fd
);
8563 return -CEPHFS_EBADF
;
8564 #if defined(__linux__) && defined(O_PATH)
8565 if (f
->flags
& O_PATH
)
8566 return -CEPHFS_EBADF
;
8569 utime_t
atime(times
[0]);
8570 utime_t
mtime(times
[1]);
8572 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
8573 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
8576 int Client::utimensat(int dirfd
, const char *relpath
, struct timespec times
[2], int flags
,
8577 const UserPerm
& perms
) {
8578 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8579 if (!mref_reader
.is_state_satisfied()) {
8580 return -CEPHFS_ENOTCONN
;
8583 tout(cct
) << __func__
<< std::endl
;
8584 tout(cct
) << dirfd
<< std::endl
;
8585 tout(cct
) << relpath
<< std::endl
;
8586 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_nsec
8588 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_nsec
8590 tout(cct
) << flags
<< std::endl
;
8592 filepath
path(relpath
);
8596 std::scoped_lock
lock(client_lock
);
8597 int r
= get_fd_inode(dirfd
, &dirinode
);
8602 #if defined(__linux__) && defined(O_PATH)
8603 if (flags
& O_PATH
) {
8604 return -CEPHFS_EBADF
;
8608 r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), 0, dirinode
);
8613 utime_t
atime(times
[0]);
8614 utime_t
mtime(times
[1]);
8616 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
8617 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
8620 int Client::flock(int fd
, int operation
, uint64_t owner
)
8622 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8623 if (!mref_reader
.is_state_satisfied())
8624 return -CEPHFS_ENOTCONN
;
8626 tout(cct
) << __func__
<< std::endl
;
8627 tout(cct
) << fd
<< std::endl
;
8628 tout(cct
) << operation
<< std::endl
;
8629 tout(cct
) << owner
<< std::endl
;
8631 std::scoped_lock
lock(client_lock
);
8632 Fh
*f
= get_filehandle(fd
);
8634 return -CEPHFS_EBADF
;
8636 return _flock(f
, operation
, owner
);
8639 int Client::opendir(const char *relpath
, dir_result_t
**dirpp
, const UserPerm
& perms
)
8641 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8642 if (!mref_reader
.is_state_satisfied())
8643 return -CEPHFS_ENOTCONN
;
8645 tout(cct
) << __func__
<< std::endl
;
8646 tout(cct
) << relpath
<< std::endl
;
8648 filepath
path(relpath
);
8651 std::scoped_lock
lock(client_lock
);
8652 int r
= path_walk(path
, &in
, perms
, true);
8655 if (cct
->_conf
->client_permissions
) {
8656 int r
= may_open(in
.get(), O_RDONLY
, perms
);
8660 r
= _opendir(in
.get(), dirpp
, perms
);
8661 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
8662 if (r
!= -CEPHFS_ENOTDIR
)
8663 tout(cct
) << (uintptr_t)*dirpp
<< std::endl
;
8667 int Client::fdopendir(int dirfd
, dir_result_t
**dirpp
, const UserPerm
&perms
) {
8668 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8669 if (!mref_reader
.is_state_satisfied()) {
8670 return -CEPHFS_ENOTCONN
;
8673 tout(cct
) << __func__
<< std::endl
;
8674 tout(cct
) << dirfd
<< std::endl
;
8677 std::scoped_lock
locker(client_lock
);
8678 int r
= get_fd_inode(dirfd
, &dirinode
);
8683 if (cct
->_conf
->client_permissions
) {
8684 r
= may_open(dirinode
.get(), O_RDONLY
, perms
);
8689 r
= _opendir(dirinode
.get(), dirpp
, perms
);
8690 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
8691 if (r
!= -CEPHFS_ENOTDIR
) {
8692 tout(cct
) << (uintptr_t)*dirpp
<< std::endl
;
8697 int Client::_opendir(Inode
*in
, dir_result_t
**dirpp
, const UserPerm
& perms
)
8700 return -CEPHFS_ENOTDIR
;
8701 *dirpp
= new dir_result_t(in
, perms
);
8702 opened_dirs
.insert(*dirpp
);
8703 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ") = " << 0 << " (" << *dirpp
<< ")" << dendl
;
8708 int Client::closedir(dir_result_t
*dir
)
8710 tout(cct
) << __func__
<< std::endl
;
8711 tout(cct
) << (uintptr_t)dir
<< std::endl
;
8713 ldout(cct
, 3) << __func__
<< "(" << dir
<< ") = 0" << dendl
;
8714 std::scoped_lock
lock(client_lock
);
8719 void Client::_closedir(dir_result_t
*dirp
)
8721 ldout(cct
, 10) << __func__
<< "(" << dirp
<< ")" << dendl
;
8724 ldout(cct
, 10) << __func__
<< " detaching inode " << dirp
->inode
<< dendl
;
8725 dirp
->inode
.reset();
8727 _readdir_drop_dirp_buffer(dirp
);
8728 opened_dirs
.erase(dirp
);
8732 void Client::rewinddir(dir_result_t
*dirp
)
8734 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ")" << dendl
;
8736 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8737 if (!mref_reader
.is_state_satisfied())
8740 std::scoped_lock
lock(client_lock
);
8741 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
8742 _readdir_drop_dirp_buffer(d
);
8746 loff_t
Client::telldir(dir_result_t
*dirp
)
8748 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
8749 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ") = " << d
->offset
<< dendl
;
8753 void Client::seekdir(dir_result_t
*dirp
, loff_t offset
)
8755 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ", " << offset
<< ")" << dendl
;
8757 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8758 if (!mref_reader
.is_state_satisfied())
8761 std::scoped_lock
lock(client_lock
);
8763 if (offset
== dirp
->offset
)
8766 if (offset
> dirp
->offset
)
8767 dirp
->release_count
= 0; // bump if we do a forward seek
8769 dirp
->ordered_count
= 0; // disable filling readdir cache
8771 if (dirp
->hash_order()) {
8772 if (dirp
->offset
> offset
) {
8773 _readdir_drop_dirp_buffer(dirp
);
8778 dirp
->buffer_frag
!= frag_t(dir_result_t::fpos_high(offset
)) ||
8779 dirp
->offset_low() > dir_result_t::fpos_low(offset
)) {
8780 _readdir_drop_dirp_buffer(dirp
);
8785 dirp
->offset
= offset
;
8790 // ino_t d_ino; /* inode number */
8791 // off_t d_off; /* offset to the next dirent */
8792 // unsigned short d_reclen; /* length of this record */
8793 // unsigned char d_type; /* type of file */
8794 // char d_name[256]; /* filename */
8796 void Client::fill_dirent(struct dirent
*de
, const char *name
, int type
, uint64_t ino
, loff_t next_off
)
8798 strncpy(de
->d_name
, name
, 255);
8799 de
->d_name
[255] = '\0';
8800 #if !defined(__CYGWIN__) && !(defined(_WIN32))
8802 #if !defined(__APPLE__) && !defined(__FreeBSD__)
8803 de
->d_off
= next_off
;
8806 de
->d_type
= IFTODT(type
);
8807 ldout(cct
, 10) << __func__
<< " '" << de
->d_name
<< "' -> " << inodeno_t(de
->d_ino
)
8808 << " type " << (int)de
->d_type
<< " w/ next_off " << hex
<< next_off
<< dec
<< dendl
;
8812 void Client::_readdir_next_frag(dir_result_t
*dirp
)
8814 frag_t fg
= dirp
->buffer_frag
;
8816 if (fg
.is_rightmost()) {
8817 ldout(cct
, 10) << __func__
<< " advance from " << fg
<< " to END" << dendl
;
8824 ldout(cct
, 10) << __func__
<< " advance from " << dirp
->buffer_frag
<< " to " << fg
<< dendl
;
8826 if (dirp
->hash_order()) {
8828 int64_t new_offset
= dir_result_t::make_fpos(fg
.value(), 2, true);
8829 if (dirp
->offset
< new_offset
) // don't decrease offset
8830 dirp
->offset
= new_offset
;
8832 dirp
->last_name
.clear();
8833 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
8834 _readdir_rechoose_frag(dirp
);
8838 void Client::_readdir_rechoose_frag(dir_result_t
*dirp
)
8840 ceph_assert(dirp
->inode
);
8842 if (dirp
->hash_order())
8845 frag_t cur
= frag_t(dirp
->offset_high());
8846 frag_t fg
= dirp
->inode
->dirfragtree
[cur
.value()];
8848 ldout(cct
, 10) << __func__
<< " frag " << cur
<< " maps to " << fg
<< dendl
;
8849 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
8850 dirp
->last_name
.clear();
8851 dirp
->next_offset
= 2;
8855 void Client::_readdir_drop_dirp_buffer(dir_result_t
*dirp
)
8857 ldout(cct
, 10) << __func__
<< " " << dirp
<< dendl
;
8858 dirp
->buffer
.clear();
8861 int Client::_readdir_get_frag(dir_result_t
*dirp
)
8864 ceph_assert(dirp
->inode
);
8866 // get the current frag.
8868 if (dirp
->hash_order())
8869 fg
= dirp
->inode
->dirfragtree
[dirp
->offset_high()];
8871 fg
= frag_t(dirp
->offset_high());
8873 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " fg " << fg
8874 << " offset " << hex
<< dirp
->offset
<< dec
<< dendl
;
8876 int op
= CEPH_MDS_OP_READDIR
;
8877 if (dirp
->inode
&& dirp
->inode
->snapid
== CEPH_SNAPDIR
)
8878 op
= CEPH_MDS_OP_LSSNAP
;
8880 InodeRef
& diri
= dirp
->inode
;
8882 MetaRequest
*req
= new MetaRequest(op
);
8884 diri
->make_nosnap_relative_path(path
);
8885 req
->set_filepath(path
);
8886 req
->set_inode(diri
.get());
8887 req
->head
.args
.readdir
.frag
= fg
;
8888 req
->head
.args
.readdir
.flags
= CEPH_READDIR_REPLY_BITFLAGS
;
8889 if (dirp
->last_name
.length()) {
8890 req
->path2
.set_path(dirp
->last_name
);
8891 } else if (dirp
->hash_order()) {
8892 req
->head
.args
.readdir
.offset_hash
= dirp
->offset_high();
8897 int res
= make_request(req
, dirp
->perms
, NULL
, NULL
, -1, &dirbl
);
8899 if (res
== -CEPHFS_EAGAIN
) {
8900 ldout(cct
, 10) << __func__
<< " got EAGAIN, retrying" << dendl
;
8901 _readdir_rechoose_frag(dirp
);
8902 return _readdir_get_frag(dirp
);
8906 ldout(cct
, 10) << __func__
<< " " << dirp
<< " got frag " << dirp
->buffer_frag
8907 << " size " << dirp
->buffer
.size() << dendl
;
8909 ldout(cct
, 10) << __func__
<< " got error " << res
<< ", setting end flag" << dendl
;
8916 struct dentry_off_lt
{
8917 bool operator()(const Dentry
* dn
, int64_t off
) const {
8918 return dir_result_t::fpos_cmp(dn
->offset
, off
) < 0;
8922 int Client::_readdir_cache_cb(dir_result_t
*dirp
, add_dirent_cb_t cb
, void *p
,
8923 int caps
, bool getref
)
8925 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
8926 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
8927 << " last_name " << dirp
->last_name
<< " offset " << hex
<< dirp
->offset
<< dec
8929 Dir
*dir
= dirp
->inode
->dir
;
8932 ldout(cct
, 10) << " dir is empty" << dendl
;
8937 vector
<Dentry
*>::iterator pd
= std::lower_bound(dir
->readdir_cache
.begin(),
8938 dir
->readdir_cache
.end(),
8939 dirp
->offset
, dentry_off_lt());
8944 if (!dirp
->inode
->is_complete_and_ordered())
8945 return -CEPHFS_EAGAIN
;
8946 if (pd
== dir
->readdir_cache
.end())
8949 if (dn
->inode
== NULL
) {
8950 ldout(cct
, 15) << " skipping null '" << dn
->name
<< "'" << dendl
;
8954 if (dn
->cap_shared_gen
!= dir
->parent_inode
->shared_gen
) {
8955 ldout(cct
, 15) << " skipping mismatch shared gen '" << dn
->name
<< "'" << dendl
;
8960 int idx
= pd
- dir
->readdir_cache
.begin();
8961 if (dn
->inode
->is_dir()) {
8962 mask
|= CEPH_STAT_RSTAT
;
8964 int r
= _getattr(dn
->inode
, mask
, dirp
->perms
);
8968 // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
8969 pd
= dir
->readdir_cache
.begin() + idx
;
8970 if (pd
>= dir
->readdir_cache
.end() || *pd
!= dn
)
8971 return -CEPHFS_EAGAIN
;
8973 struct ceph_statx stx
;
8975 fill_statx(dn
->inode
, caps
, &stx
);
8977 uint64_t next_off
= dn
->offset
+ 1;
8978 fill_dirent(&de
, dn
->name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
8980 if (pd
== dir
->readdir_cache
.end())
8981 next_off
= dir_result_t::END
;
8985 in
= dn
->inode
.get();
8989 dn_name
= dn
->name
; // fill in name while we have lock
8991 client_lock
.unlock();
8992 r
= cb(p
, &de
, &stx
, next_off
, in
); // _next_ offset
8994 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< dn
->offset
<< dec
8995 << " = " << r
<< dendl
;
9000 dirp
->offset
= next_off
;
9002 dirp
->next_offset
= 2;
9004 dirp
->next_offset
= dirp
->offset_low();
9005 dirp
->last_name
= dn_name
; // we successfully returned this one; update!
9006 dirp
->release_count
= 0; // last_name no longer match cache index
9011 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " at end" << dendl
;
9016 int Client::readdir_r_cb(dir_result_t
*d
, add_dirent_cb_t cb
, void *p
,
9017 unsigned want
, unsigned flags
, bool getref
)
9019 int caps
= statx_to_mask(flags
, want
);
9021 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9022 if (!mref_reader
.is_state_satisfied())
9023 return -CEPHFS_ENOTCONN
;
9025 std::unique_lock
cl(client_lock
);
9027 dir_result_t
*dirp
= static_cast<dir_result_t
*>(d
);
9029 ldout(cct
, 10) << __func__
<< " " << *dirp
->inode
<< " offset " << hex
<< dirp
->offset
9030 << dec
<< " at_end=" << dirp
->at_end()
9031 << " hash_order=" << dirp
->hash_order() << dendl
;
9034 struct ceph_statx stx
;
9035 memset(&de
, 0, sizeof(de
));
9036 memset(&stx
, 0, sizeof(stx
));
9038 InodeRef
& diri
= dirp
->inode
;
9043 if (dirp
->offset
== 0) {
9044 ldout(cct
, 15) << " including ." << dendl
;
9045 ceph_assert(diri
->dentries
.size() < 2); // can't have multiple hard-links to a dir
9046 uint64_t next_off
= 1;
9049 r
= _getattr(diri
, caps
| CEPH_STAT_RSTAT
, dirp
->perms
);
9053 fill_statx(diri
, caps
, &stx
);
9054 fill_dirent(&de
, ".", S_IFDIR
, stx
.stx_ino
, next_off
);
9056 Inode
*inode
= NULL
;
9063 r
= cb(p
, &de
, &stx
, next_off
, inode
);
9068 dirp
->offset
= next_off
;
9072 if (dirp
->offset
== 1) {
9073 ldout(cct
, 15) << " including .." << dendl
;
9074 uint64_t next_off
= 2;
9076 if (diri
->dentries
.empty())
9079 in
= diri
->get_first_parent()->dir
->parent_inode
;
9082 r
= _getattr(in
, caps
| CEPH_STAT_RSTAT
, dirp
->perms
);
9086 fill_statx(in
, caps
, &stx
);
9087 fill_dirent(&de
, "..", S_IFDIR
, stx
.stx_ino
, next_off
);
9089 Inode
*inode
= NULL
;
9096 r
= cb(p
, &de
, &stx
, next_off
, inode
);
9101 dirp
->offset
= next_off
;
9106 // can we read from our cache?
9107 ldout(cct
, 10) << "offset " << hex
<< dirp
->offset
<< dec
9108 << " snapid " << dirp
->inode
->snapid
<< " (complete && ordered) "
9109 << dirp
->inode
->is_complete_and_ordered()
9110 << " issued " << ccap_string(dirp
->inode
->caps_issued())
9112 if (dirp
->inode
->snapid
!= CEPH_SNAPDIR
&&
9113 dirp
->inode
->is_complete_and_ordered() &&
9114 dirp
->inode
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
9115 int err
= _readdir_cache_cb(dirp
, cb
, p
, caps
, getref
);
9116 if (err
!= -CEPHFS_EAGAIN
)
9124 bool check_caps
= true;
9125 if (!dirp
->is_cached()) {
9126 int r
= _readdir_get_frag(dirp
);
9129 // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
9130 // different than the requested one. (our dirfragtree was outdated)
9133 frag_t fg
= dirp
->buffer_frag
;
9135 ldout(cct
, 10) << "frag " << fg
<< " buffer size " << dirp
->buffer
.size()
9136 << " offset " << hex
<< dirp
->offset
<< dendl
;
9138 for (auto it
= std::lower_bound(dirp
->buffer
.begin(), dirp
->buffer
.end(),
9139 dirp
->offset
, dir_result_t::dentry_off_lt());
9140 it
!= dirp
->buffer
.end();
9142 dir_result_t::dentry
&entry
= *it
;
9144 uint64_t next_off
= entry
.offset
+ 1;
9149 if(entry
.inode
->is_dir()){
9150 mask
|= CEPH_STAT_RSTAT
;
9152 r
= _getattr(entry
.inode
, mask
, dirp
->perms
);
9157 fill_statx(entry
.inode
, caps
, &stx
);
9158 fill_dirent(&de
, entry
.name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
9160 Inode
*inode
= NULL
;
9162 inode
= entry
.inode
.get();
9167 r
= cb(p
, &de
, &stx
, next_off
, inode
); // _next_ offset
9170 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< next_off
- 1 << dec
9171 << " = " << r
<< dendl
;
9175 dirp
->offset
= next_off
;
9180 if (dirp
->next_offset
> 2) {
9181 ldout(cct
, 10) << " fetching next chunk of this frag" << dendl
;
9182 _readdir_drop_dirp_buffer(dirp
);
9186 if (!fg
.is_rightmost()) {
9188 _readdir_next_frag(dirp
);
9192 if (diri
->shared_gen
== dirp
->start_shared_gen
&&
9193 diri
->dir_release_count
== dirp
->release_count
) {
9194 if (diri
->dir_ordered_count
== dirp
->ordered_count
) {
9195 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
9197 ceph_assert(diri
->dir
->readdir_cache
.size() >= dirp
->cache_index
);
9198 diri
->dir
->readdir_cache
.resize(dirp
->cache_index
);
9200 diri
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
9202 ldout(cct
, 10) << " marking I_COMPLETE on " << *diri
<< dendl
;
9203 diri
->flags
|= I_COMPLETE
;
9215 int Client::readdir_r(dir_result_t
*d
, struct dirent
*de
)
9217 return readdirplus_r(d
, de
, 0, 0, 0, NULL
);
9224 * 1 if we got a dirent
9225 * 0 for end of directory
9229 struct single_readdir
{
9231 struct ceph_statx
*stx
;
9236 static int _readdir_single_dirent_cb(void *p
, struct dirent
*de
,
9237 struct ceph_statx
*stx
, off_t off
,
9240 single_readdir
*c
= static_cast<single_readdir
*>(p
);
9243 return -1; // already filled this dirent
9253 struct dirent
*Client::readdir(dir_result_t
*d
)
9263 // our callback fills the dirent and sets sr.full=true on first
9264 // call, and returns -1 the second time around.
9265 ret
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
);
9267 errno
= -ret
; // this sucks.
9268 return (dirent
*) NULL
;
9273 return (dirent
*) NULL
;
9276 int Client::readdirplus_r(dir_result_t
*d
, struct dirent
*de
,
9277 struct ceph_statx
*stx
, unsigned want
,
9278 unsigned flags
, Inode
**out
)
9286 // our callback fills the dirent and sets sr.full=true on first
9287 // call, and returns -1 the second time around.
9288 int r
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
, want
, flags
, out
);
9300 struct getdents_result
{
9307 static int _readdir_getdent_cb(void *p
, struct dirent
*de
,
9308 struct ceph_statx
*stx
, off_t off
, Inode
*in
)
9310 struct getdents_result
*c
= static_cast<getdents_result
*>(p
);
9316 dlen
= strlen(de
->d_name
) + 1;
9318 if (c
->pos
+ dlen
> c
->buflen
)
9319 return -1; // doesn't fit
9322 memcpy(c
->buf
+ c
->pos
, de
, sizeof(*de
));
9324 memcpy(c
->buf
+ c
->pos
, de
->d_name
, dlen
);
9330 int Client::_getdents(dir_result_t
*dir
, char *buf
, int buflen
, bool fullent
)
9335 gr
.fullent
= fullent
;
9338 int r
= readdir_r_cb(dir
, _readdir_getdent_cb
, (void *)&gr
);
9340 if (r
< 0) { // some error
9341 if (r
== -1) { // buffer ran out of space
9342 if (gr
.pos
) { // but we got some entries already!
9344 } // or we need a larger buffer
9345 return -CEPHFS_ERANGE
;
9346 } else { // actual error, return it
9355 struct getdir_result
{
9356 list
<string
> *contents
;
9360 static int _getdir_cb(void *p
, struct dirent
*de
, struct ceph_statx
*stx
, off_t off
, Inode
*in
)
9362 getdir_result
*r
= static_cast<getdir_result
*>(p
);
9364 r
->contents
->push_back(de
->d_name
);
9369 int Client::getdir(const char *relpath
, list
<string
>& contents
,
9370 const UserPerm
& perms
)
9372 ldout(cct
, 3) << "getdir(" << relpath
<< ")" << dendl
;
9373 tout(cct
) << "getdir" << std::endl
;
9374 tout(cct
) << relpath
<< std::endl
;
9377 int r
= opendir(relpath
, &d
, perms
);
9382 gr
.contents
= &contents
;
9384 r
= readdir_r_cb(d
, _getdir_cb
, (void *)&gr
);
9394 /****** file i/o **********/
9396 // common parts for open and openat. call with client_lock locked.
9397 int Client::create_and_open(int dirfd
, const char *relpath
, int flags
,
9398 const UserPerm
& perms
, mode_t mode
, int stripe_unit
,
9399 int stripe_count
, int object_size
, const char *data_pool
,
9400 std::string alternate_name
) {
9401 ceph_assert(ceph_mutex_is_locked(client_lock
));
9402 int cflags
= ceph_flags_sys2wire(flags
);
9403 tout(cct
) << cflags
<< std::endl
;
9407 #if defined(__linux__) && defined(O_PATH)
9408 /* When the O_PATH is being specified, others flags than O_DIRECTORY
9409 * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
9410 * in kernel (fs/open.c). */
9412 flags
&= O_DIRECTORY
| O_NOFOLLOW
| O_PATH
;
9415 filepath
path(relpath
);
9417 bool created
= false;
9418 /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
9419 bool followsym
= !((flags
& O_NOFOLLOW
) || ((flags
& O_CREAT
) && (flags
& O_EXCL
)));
9420 int mask
= ceph_caps_for_mode(ceph_flags_to_mode(cflags
));
9422 InodeRef dirinode
= nullptr;
9423 int r
= get_fd_inode(dirfd
, &dirinode
);
9428 r
= path_walk(path
, &in
, perms
, followsym
, mask
, dirinode
);
9429 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
9430 return -CEPHFS_EEXIST
;
9432 #if defined(__linux__) && defined(O_PATH)
9433 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
) && !(flags
& O_PATH
))
9435 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
))
9437 return -CEPHFS_ELOOP
;
9439 if (r
== -CEPHFS_ENOENT
&& (flags
& O_CREAT
)) {
9440 filepath dirpath
= path
;
9441 string dname
= dirpath
.last_dentry();
9442 dirpath
.pop_dentry();
9444 r
= path_walk(dirpath
, &dir
, perms
, true,
9445 cct
->_conf
->client_permissions
? CEPH_CAP_AUTH_SHARED
: 0, dirinode
);
9449 if (cct
->_conf
->client_permissions
) {
9450 r
= may_create(dir
.get(), perms
);
9454 r
= _create(dir
.get(), dname
.c_str(), flags
, mode
, &in
, &fh
, stripe_unit
,
9455 stripe_count
, object_size
, data_pool
, &created
, perms
,
9456 std::move(alternate_name
));
9462 // posix says we can only check permissions of existing files
9463 if (cct
->_conf
->client_permissions
) {
9464 r
= may_open(in
.get(), flags
, perms
);
9471 r
= _open(in
.get(), flags
, mode
, &fh
, perms
);
9473 // allocate a integer file descriptor
9476 ceph_assert(fd_map
.count(r
) == 0);
9484 int Client::open(const char *relpath
, int flags
, const UserPerm
& perms
,
9485 mode_t mode
, int stripe_unit
, int stripe_count
,
9486 int object_size
, const char *data_pool
, std::string alternate_name
)
9488 return openat(CEPHFS_AT_FDCWD
, relpath
, flags
, perms
, mode
, stripe_unit
,
9489 stripe_count
, object_size
, data_pool
, alternate_name
);
9492 int Client::openat(int dirfd
, const char *relpath
, int flags
, const UserPerm
& perms
,
9493 mode_t mode
, int stripe_unit
, int stripe_count
, int object_size
,
9494 const char *data_pool
, std::string alternate_name
) {
9495 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9496 if (!mref_reader
.is_state_satisfied()) {
9497 return -CEPHFS_ENOTCONN
;
9500 ldout(cct
, 3) << "openat enter(" << relpath
<< ")" << dendl
;
9501 tout(cct
) << dirfd
<< std::endl
;
9502 tout(cct
) << relpath
<< std::endl
;
9503 tout(cct
) << flags
<< std::endl
;
9504 tout(cct
) << mode
<< std::endl
;
9506 std::scoped_lock
locker(client_lock
);
9507 int r
= create_and_open(dirfd
, relpath
, flags
, perms
, mode
, stripe_unit
, stripe_count
,
9508 object_size
, data_pool
, alternate_name
);
9510 tout(cct
) << r
<< std::endl
;
9511 ldout(cct
, 3) << "openat exit(" << relpath
<< ")" << dendl
;
9515 int Client::lookup_hash(inodeno_t ino
, inodeno_t dirino
, const char *name
,
9516 const UserPerm
& perms
)
9518 ldout(cct
, 3) << __func__
<< " enter(" << ino
<< ", #" << dirino
<< "/" << name
<< ")" << dendl
;
9520 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9521 if (!mref_reader
.is_state_satisfied())
9522 return -CEPHFS_ENOTCONN
;
9524 std::scoped_lock
lock(client_lock
);
9525 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPHASH
);
9527 req
->set_filepath(path
);
9529 uint32_t h
= ceph_str_hash(CEPH_STR_HASH_RJENKINS
, name
, strlen(name
));
9531 sprintf(f
, "%u", h
);
9532 filepath
path2(dirino
);
9533 path2
.push_dentry(string(f
));
9534 req
->set_filepath2(path2
);
9536 int r
= make_request(req
, perms
, NULL
, NULL
,
9537 rand() % mdsmap
->get_num_in_mds());
9538 ldout(cct
, 3) << __func__
<< " exit(" << ino
<< ", #" << dirino
<< "/" << name
<< ") = " << r
<< dendl
;
9544 * Load inode into local cache.
9546  * If the inode pointer is non-NULL, also take a reference on
9547  * the resulting Inode object in the same operation, so that the
9548  * caller can safely assume the inode will still be there after return.
9550 int Client::_lookup_vino(vinodeno_t vino
, const UserPerm
& perms
, Inode
**inode
)
9552 ldout(cct
, 8) << __func__
<< " enter(" << vino
<< ")" << dendl
;
9554 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9555 if (!mref_reader
.is_state_satisfied())
9556 return -CEPHFS_ENOTCONN
;
9558 if (is_reserved_vino(vino
))
9559 return -CEPHFS_ESTALE
;
9561 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPINO
);
9562 filepath
path(vino
.ino
);
9563 req
->set_filepath(path
);
9566 * The MDS expects either a "real" snapid here or 0. The special value
9567 * carveouts for the snapid are all at the end of the range so we can
9568 * just look for any snapid below this value.
9570 if (vino
.snapid
< CEPH_NOSNAP
)
9571 req
->head
.args
.lookupino
.snapid
= vino
.snapid
;
9573 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
9574 if (r
== 0 && inode
!= NULL
) {
9575 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
9576 ceph_assert(p
!= inode_map
.end());
9580 ldout(cct
, 8) << __func__
<< " exit(" << vino
<< ") = " << r
<< dendl
;
9584 int Client::lookup_ino(inodeno_t ino
, const UserPerm
& perms
, Inode
**inode
)
9586 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
9587 std::scoped_lock
lock(client_lock
);
9588 return _lookup_vino(vino
, perms
, inode
);
9592 * Find the parent inode of `ino` and insert it into
9593 * our cache. Conditionally also set `parent` to a referenced
9594 * Inode* if caller provides non-NULL value.
9596 int Client::_lookup_parent(Inode
*ino
, const UserPerm
& perms
, Inode
**parent
)
9598 ldout(cct
, 8) << __func__
<< " enter(" << ino
->ino
<< ")" << dendl
;
9600 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
9601 filepath
path(ino
->ino
);
9602 req
->set_filepath(path
);
9605 int r
= make_request(req
, perms
, &target
, NULL
, rand() % mdsmap
->get_num_in_mds());
9606 // Give caller a reference to the parent ino if they provided a pointer.
9607 if (parent
!= NULL
) {
9609 *parent
= target
.get();
9611 ldout(cct
, 8) << __func__
<< " found parent " << (*parent
)->ino
<< dendl
;
9616 ldout(cct
, 8) << __func__
<< " exit(" << ino
->ino
<< ") = " << r
<< dendl
;
9621 * Populate the parent dentry for `ino`, provided it is
9622 * a child of `parent`.
9624 int Client::_lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
9626 ceph_assert(parent
->is_dir());
9627 ldout(cct
, 3) << __func__
<< " enter(" << ino
->ino
<< ")" << dendl
;
9629 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9630 if (!mref_reader
.is_state_satisfied())
9631 return -CEPHFS_ENOTCONN
;
9633 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
9634 req
->set_filepath2(filepath(parent
->ino
));
9635 req
->set_filepath(filepath(ino
->ino
));
9636 req
->set_inode(ino
);
9638 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
9639 ldout(cct
, 3) << __func__
<< " exit(" << ino
->ino
<< ") = " << r
<< dendl
;
9643 int Client::lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
9645 std::scoped_lock
lock(client_lock
);
9646 return _lookup_name(ino
, parent
, perms
);
9649 Fh
*Client::_create_fh(Inode
*in
, int flags
, int cmode
, const UserPerm
& perms
)
9652 Fh
*f
= new Fh(in
, flags
, cmode
, fd_gen
, perms
);
9654 ldout(cct
, 10) << __func__
<< " " << in
->ino
<< " mode " << cmode
<< dendl
;
9656 if (in
->snapid
!= CEPH_NOSNAP
) {
9657 in
->snap_cap_refs
++;
9658 ldout(cct
, 5) << "open success, fh is " << f
<< " combined IMMUTABLE SNAP caps "
9659 << ccap_string(in
->caps_issued()) << dendl
;
9662 const auto& conf
= cct
->_conf
;
9663 f
->readahead
.set_trigger_requests(1);
9664 f
->readahead
.set_min_readahead_size(conf
->client_readahead_min
);
9665 uint64_t max_readahead
= Readahead::NO_LIMIT
;
9666 if (conf
->client_readahead_max_bytes
) {
9667 max_readahead
= std::min(max_readahead
, (uint64_t)conf
->client_readahead_max_bytes
);
9669 if (conf
->client_readahead_max_periods
) {
9670 max_readahead
= std::min(max_readahead
, in
->layout
.get_period()*(uint64_t)conf
->client_readahead_max_periods
);
9672 f
->readahead
.set_max_readahead_size(max_readahead
);
9673 vector
<uint64_t> alignments
;
9674 alignments
.push_back(in
->layout
.get_period());
9675 alignments
.push_back(in
->layout
.stripe_unit
);
9676 f
->readahead
.set_alignments(alignments
);
9681 int Client::_release_fh(Fh
*f
)
9683 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
9684 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
9685 Inode
*in
= f
->inode
.get();
9686 ldout(cct
, 8) << __func__
<< " " << f
<< " mode " << f
->mode
<< " on " << *in
<< dendl
;
9690 if (in
->snapid
== CEPH_NOSNAP
) {
9691 if (in
->put_open_ref(f
->mode
)) {
9692 _flush(in
, new C_Client_FlushComplete(this, in
));
9696 ceph_assert(in
->snap_cap_refs
> 0);
9697 in
->snap_cap_refs
--;
9700 _release_filelocks(f
);
9702 // Finally, read any async err (i.e. from flushes)
9703 int err
= f
->take_async_err();
9705 ldout(cct
, 1) << __func__
<< " " << f
<< " on inode " << *in
<< " caught async_err = "
9706 << cpp_strerror(err
) << dendl
;
9708 ldout(cct
, 10) << __func__
<< " " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
9716 void Client::_put_fh(Fh
*f
)
9718 int left
= f
->put();
9724 int Client::_open(Inode
*in
, int flags
, mode_t mode
, Fh
**fhp
,
9725 const UserPerm
& perms
)
9727 if (in
->snapid
!= CEPH_NOSNAP
&&
9728 (flags
& (O_WRONLY
| O_RDWR
| O_CREAT
| O_TRUNC
| O_APPEND
))) {
9729 return -CEPHFS_EROFS
;
9732 // use normalized flags to generate cmode
9733 int cflags
= ceph_flags_sys2wire(flags
);
9734 if (cct
->_conf
.get_val
<bool>("client_force_lazyio"))
9735 cflags
|= CEPH_O_LAZY
;
9737 int cmode
= ceph_flags_to_mode(cflags
);
9738 int want
= ceph_caps_for_mode(cmode
);
9741 in
->get_open_ref(cmode
); // make note of pending open, since it effects _wanted_ caps.
9743 if ((flags
& O_TRUNC
) == 0 && in
->caps_issued_mask(want
)) {
9745 check_caps(in
, CHECK_CAPS_NODELAY
);
9748 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
9750 in
->make_nosnap_relative_path(path
);
9751 req
->set_filepath(path
);
9752 req
->head
.args
.open
.flags
= cflags
& ~CEPH_O_CREAT
;
9753 req
->head
.args
.open
.mode
= mode
;
9754 req
->head
.args
.open
.pool
= -1;
9755 if (cct
->_conf
->client_debug_getattr_caps
)
9756 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
9758 req
->head
.args
.open
.mask
= 0;
9759 req
->head
.args
.open
.old_size
= in
->size
; // for O_TRUNC
9761 result
= make_request(req
, perms
);
9764 * NFS expects that delegations will be broken on a conflicting open,
9765 * not just when there is actual conflicting access to the file. SMB leases
9766 * and oplocks also have similar semantics.
9768 * Ensure that clients that have delegations enabled will wait on minimal
9769 * caps during open, just to ensure that other clients holding delegations
9770 * return theirs first.
9772 if (deleg_timeout
&& result
== 0) {
9775 if (cmode
& CEPH_FILE_MODE_WR
)
9776 need
|= CEPH_CAP_FILE_WR
;
9777 if (cmode
& CEPH_FILE_MODE_RD
)
9778 need
|= CEPH_CAP_FILE_RD
;
9780 Fh
fh(in
, flags
, cmode
, fd_gen
, perms
);
9781 result
= get_caps(&fh
, need
, want
, &have
, -1);
9783 ldout(cct
, 8) << "Unable to get caps after open of inode " << *in
<<
9784 " . Denying open: " <<
9785 cpp_strerror(result
) << dendl
;
9787 put_cap_ref(in
, need
);
9795 *fhp
= _create_fh(in
, flags
, cmode
, perms
);
9797 in
->put_open_ref(cmode
);
9805 int Client::_renew_caps(Inode
*in
)
9807 int wanted
= in
->caps_file_wanted();
9808 if (in
->is_any_caps() &&
9809 ((wanted
& CEPH_CAP_ANY_WR
) == 0 || in
->auth_cap
)) {
9810 check_caps(in
, CHECK_CAPS_NODELAY
);
9815 if ((wanted
& CEPH_CAP_FILE_RD
) && (wanted
& CEPH_CAP_FILE_WR
))
9817 else if (wanted
& CEPH_CAP_FILE_RD
)
9819 else if (wanted
& CEPH_CAP_FILE_WR
)
9822 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
9824 in
->make_nosnap_relative_path(path
);
9825 req
->set_filepath(path
);
9826 req
->head
.args
.open
.flags
= flags
;
9827 req
->head
.args
.open
.pool
= -1;
9828 if (cct
->_conf
->client_debug_getattr_caps
)
9829 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
9831 req
->head
.args
.open
.mask
= 0;
9834 // duplicate in case Cap goes away; not sure if that race is a concern?
9835 const UserPerm
*pperm
= in
->get_best_perms();
9839 int ret
= make_request(req
, perms
);
9843 int Client::_close(int fd
)
9845 ldout(cct
, 3) << "close enter(" << fd
<< ")" << dendl
;
9846 tout(cct
) << "close" << std::endl
;
9847 tout(cct
) << fd
<< std::endl
;
9849 Fh
*fh
= get_filehandle(fd
);
9851 return -CEPHFS_EBADF
;
9852 int err
= _release_fh(fh
);
9855 ldout(cct
, 3) << "close exit(" << fd
<< ")" << dendl
;
9859 int Client::close(int fd
) {
9860 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9861 if (!mref_reader
.is_state_satisfied())
9862 return -CEPHFS_ENOTCONN
;
9864 std::scoped_lock
lock(client_lock
);
9871 loff_t
Client::lseek(int fd
, loff_t offset
, int whence
)
9873 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9874 if (!mref_reader
.is_state_satisfied())
9875 return -CEPHFS_ENOTCONN
;
9877 tout(cct
) << "lseek" << std::endl
;
9878 tout(cct
) << fd
<< std::endl
;
9879 tout(cct
) << offset
<< std::endl
;
9880 tout(cct
) << whence
<< std::endl
;
9882 std::scoped_lock
lock(client_lock
);
9883 Fh
*f
= get_filehandle(fd
);
9885 return -CEPHFS_EBADF
;
9886 #if defined(__linux__) && defined(O_PATH)
9887 if (f
->flags
& O_PATH
)
9888 return -CEPHFS_EBADF
;
9890 return _lseek(f
, offset
, whence
);
9893 loff_t
Client::_lseek(Fh
*f
, loff_t offset
, int whence
)
9895 Inode
*in
= f
->inode
.get();
9896 bool whence_check
= false;
9901 whence_check
= true;
9906 whence_check
= true;
9912 whence_check
= true;
9918 int r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
9929 pos
= f
->pos
+ offset
;
9933 pos
= in
->size
+ offset
;
9938 if (offset
< 0 || static_cast<uint64_t>(offset
) >= in
->size
)
9939 return -CEPHFS_ENXIO
;
9946 if (offset
< 0 || static_cast<uint64_t>(offset
) >= in
->size
)
9947 return -CEPHFS_ENXIO
;
9953 ldout(cct
, 1) << __func__
<< ": invalid whence value " << whence
<< dendl
;
9954 return -CEPHFS_EINVAL
;
9958 return -CEPHFS_EINVAL
;
9963 ldout(cct
, 8) << "_lseek(" << f
<< ", " << offset
<< ", " << whence
<< ") = " << f
->pos
<< dendl
;
9968 void Client::lock_fh_pos(Fh
*f
)
9970 ldout(cct
, 10) << __func__
<< " " << f
<< dendl
;
9972 if (f
->pos_locked
|| !f
->pos_waiters
.empty()) {
9973 ceph::condition_variable cond
;
9974 f
->pos_waiters
.push_back(&cond
);
9975 ldout(cct
, 10) << __func__
<< " BLOCKING on " << f
<< dendl
;
9976 std::unique_lock l
{client_lock
, std::adopt_lock
};
9977 cond
.wait(l
, [f
, me
=&cond
] {
9978 return !f
->pos_locked
&& f
->pos_waiters
.front() == me
;
9981 ldout(cct
, 10) << __func__
<< " UNBLOCKING on " << f
<< dendl
;
9982 ceph_assert(f
->pos_waiters
.front() == &cond
);
9983 f
->pos_waiters
.pop_front();
9986 f
->pos_locked
= true;
9989 void Client::unlock_fh_pos(Fh
*f
)
9991 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
9993 ldout(cct
, 10) << __func__
<< " " << f
<< dendl
;
9994 f
->pos_locked
= false;
9995 if (!f
->pos_waiters
.empty()) {
9996 // only wake up the oldest waiter
9997 auto cond
= f
->pos_waiters
.front();
10002 int Client::uninline_data(Inode
*in
, Context
*onfinish
)
10004 if (!in
->inline_data
.length()) {
10005 onfinish
->complete(0);
10010 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (long long unsigned)in
->ino
);
10011 object_t oid
= oid_buf
;
10013 ObjectOperation create_ops
;
10014 create_ops
.create(false);
10016 objecter
->mutate(oid
,
10017 OSDMap::file_to_object_locator(in
->layout
),
10019 in
->snaprealm
->get_snap_context(),
10020 ceph::real_clock::now(),
10024 bufferlist inline_version_bl
;
10025 encode(in
->inline_version
, inline_version_bl
);
10027 ObjectOperation uninline_ops
;
10028 uninline_ops
.cmpxattr("inline_version",
10029 CEPH_OSD_CMPXATTR_OP_GT
,
10030 CEPH_OSD_CMPXATTR_MODE_U64
,
10031 inline_version_bl
);
10032 bufferlist inline_data
= in
->inline_data
;
10033 uninline_ops
.write(0, inline_data
, in
->truncate_size
, in
->truncate_seq
);
10034 uninline_ops
.setxattr("inline_version", stringify(in
->inline_version
));
10036 objecter
->mutate(oid
,
10037 OSDMap::file_to_object_locator(in
->layout
),
10039 in
->snaprealm
->get_snap_context(),
10040 ceph::real_clock::now(),
10049 // blocking osd interface
10051 int Client::read(int fd
, char *buf
, loff_t size
, loff_t offset
)
10053 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10054 if (!mref_reader
.is_state_satisfied())
10055 return -CEPHFS_ENOTCONN
;
10057 tout(cct
) << "read" << std::endl
;
10058 tout(cct
) << fd
<< std::endl
;
10059 tout(cct
) << size
<< std::endl
;
10060 tout(cct
) << offset
<< std::endl
;
10062 std::unique_lock
lock(client_lock
);
10063 Fh
*f
= get_filehandle(fd
);
10065 return -CEPHFS_EBADF
;
10066 #if defined(__linux__) && defined(O_PATH)
10067 if (f
->flags
& O_PATH
)
10068 return -CEPHFS_EBADF
;
10071 /* We can't return bytes written larger than INT_MAX, clamp size to that */
10072 size
= std::min(size
, (loff_t
)INT_MAX
);
10073 int r
= _read(f
, offset
, size
, &bl
);
10074 ldout(cct
, 3) << "read(" << fd
<< ", " << (void*)buf
<< ", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
10077 bl
.begin().copy(bl
.length(), buf
);
10083 int Client::preadv(int fd
, const struct iovec
*iov
, int iovcnt
, loff_t offset
)
10086 return -CEPHFS_EINVAL
;
10087 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, false);
10090 int64_t Client::_read(Fh
*f
, int64_t offset
, uint64_t size
, bufferlist
*bl
)
10092 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
10094 int want
, have
= 0;
10095 bool movepos
= false;
10096 std::unique_ptr
<C_SaferCond
> onuninline
;
10098 const auto& conf
= cct
->_conf
;
10099 Inode
*in
= f
->inode
.get();
10101 utime_t start
= ceph_clock_now();
10103 if ((f
->mode
& CEPH_FILE_MODE_RD
) == 0)
10104 return -CEPHFS_EBADF
;
10105 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
10112 loff_t start_pos
= offset
;
10114 if (in
->inline_version
== 0) {
10115 auto r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
10120 ceph_assert(in
->inline_version
> 0);
10124 if (f
->mode
& CEPH_FILE_MODE_LAZY
)
10125 want
= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
;
10127 want
= CEPH_CAP_FILE_CACHE
;
10129 auto r
= get_caps(f
, CEPH_CAP_FILE_RD
, want
, &have
, -1);
10135 if (f
->flags
& O_DIRECT
)
10136 have
&= ~(CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
);
10138 if (in
->inline_version
< CEPH_INLINE_NONE
) {
10139 if (!(have
& CEPH_CAP_FILE_CACHE
)) {
10140 onuninline
.reset(new C_SaferCond("Client::_read_uninline_data flock"));
10141 uninline_data(in
, onuninline
.get());
10143 uint32_t len
= in
->inline_data
.length();
10144 uint64_t endoff
= offset
+ size
;
10145 if (endoff
> in
->size
)
10148 if (offset
< len
) {
10149 if (endoff
<= len
) {
10150 bl
->substr_of(in
->inline_data
, offset
, endoff
- offset
);
10152 bl
->substr_of(in
->inline_data
, offset
, len
- offset
);
10153 bl
->append_zero(endoff
- len
);
10155 rc
= endoff
- offset
;
10156 } else if ((uint64_t)offset
< endoff
) {
10157 bl
->append_zero(endoff
- offset
);
10158 rc
= endoff
- offset
;
10166 if (!conf
->client_debug_force_sync_read
&&
10168 (have
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
))) {
10170 if (f
->flags
& O_RSYNC
) {
10171 _flush_range(in
, offset
, size
);
10173 rc
= _read_async(f
, offset
, size
, bl
);
10177 if (f
->flags
& O_DIRECT
)
10178 _flush_range(in
, offset
, size
);
10180 bool checkeof
= false;
10181 rc
= _read_sync(f
, offset
, size
, bl
, &checkeof
);
10188 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
10192 auto r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
10199 // eof? short read.
10200 if ((uint64_t)offset
< in
->size
)
10206 ceph_assert(rc
>= 0);
10207 update_read_io_size(bl
->length());
10210 f
->pos
= start_pos
+ rc
;
10213 lat
= ceph_clock_now();
10217 update_io_stat_read(lat
);
10223 client_lock
.unlock();
10224 int ret
= onuninline
->wait();
10225 client_lock
.lock();
10226 if (ret
>= 0 || ret
== -CEPHFS_ECANCELED
) {
10227 in
->inline_data
.clear();
10228 in
->inline_version
= CEPH_INLINE_NONE
;
10229 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
10235 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
10243 Client::C_Readahead::C_Readahead(Client
*c
, Fh
*f
) :
10246 f
->readahead
.inc_pending();
10249 Client::C_Readahead::~C_Readahead() {
10250 f
->readahead
.dec_pending();
10251 client
->_put_fh(f
);
10254 void Client::C_Readahead::finish(int r
) {
10255 lgeneric_subdout(client
->cct
, client
, 20) << "client." << client
->get_nodeid() << " " << "C_Readahead on " << f
->inode
<< dendl
;
10256 client
->put_cap_ref(f
->inode
.get(), CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
10258 client
->update_read_io_size(r
);
10262 int Client::_read_async(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
)
10264 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
10266 const auto& conf
= cct
->_conf
;
10267 Inode
*in
= f
->inode
.get();
10269 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
10271 // trim read based on file size?
10272 if (off
>= in
->size
)
10276 if (off
+ len
> in
->size
) {
10277 len
= in
->size
- off
;
10280 ldout(cct
, 10) << " min_bytes=" << f
->readahead
.get_min_readahead_size()
10281 << " max_bytes=" << f
->readahead
.get_max_readahead_size()
10282 << " max_periods=" << conf
->client_readahead_max_periods
<< dendl
;
10284 // read (and possibly block)
10286 C_SaferCond
onfinish("Client::_read_async flock");
10287 r
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
10288 off
, len
, bl
, 0, &onfinish
);
10290 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
10291 client_lock
.unlock();
10292 r
= onfinish
.wait();
10293 client_lock
.lock();
10294 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
10295 update_read_io_size(bl
->length());
10298 if(f
->readahead
.get_min_readahead_size() > 0) {
10299 pair
<uint64_t, uint64_t> readahead_extent
= f
->readahead
.update(off
, len
, in
->size
);
10300 if (readahead_extent
.second
> 0) {
10301 ldout(cct
, 20) << "readahead " << readahead_extent
.first
<< "~" << readahead_extent
.second
10302 << " (caller wants " << off
<< "~" << len
<< ")" << dendl
;
10303 Context
*onfinish2
= new C_Readahead(this, f
);
10304 int r2
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
10305 readahead_extent
.first
, readahead_extent
.second
,
10306 NULL
, 0, onfinish2
);
10308 ldout(cct
, 20) << "readahead initiated, c " << onfinish2
<< dendl
;
10309 get_cap_ref(in
, CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
10311 ldout(cct
, 20) << "readahead was no-op, already cached" << dendl
;
10320 int Client::_read_sync(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
,
10323 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
10325 Inode
*in
= f
->inode
.get();
10326 uint64_t pos
= off
;
10330 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
10332 // 0 success, 1 continue and < 0 error happen.
10333 auto wait_and_copy
= [&](C_SaferCond
&onfinish
, bufferlist
&tbl
, int wanted
) {
10334 int r
= onfinish
.wait();
10336 // if we get ENOENT from OSD, assume 0 bytes returned
10337 if (r
== -CEPHFS_ENOENT
)
10342 if (tbl
.length()) {
10348 bl
->claim_append(tbl
);
10351 if (r
>= 0 && r
< wanted
) {
10352 if (pos
< in
->size
) {
10353 // zero up to known EOF
10354 int64_t some
= in
->size
- pos
;
10357 auto z
= buffer::ptr_node::create(some
);
10359 bl
->push_back(std::move(z
));
10374 C_SaferCond
onfinish("Client::_read_sync flock");
10378 filer
->read_trunc(in
->ino
, &in
->layout
, in
->snapid
,
10379 pos
, left
, &tbl
, 0,
10380 in
->truncate_size
, in
->truncate_seq
,
10382 client_lock
.unlock();
10383 int r
= wait_and_copy(onfinish
, tbl
, wanted
);
10384 client_lock
.lock();
10393 int Client::write(int fd
, const char *buf
, loff_t size
, loff_t offset
)
10395 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10396 if (!mref_reader
.is_state_satisfied())
10397 return -CEPHFS_ENOTCONN
;
10399 tout(cct
) << "write" << std::endl
;
10400 tout(cct
) << fd
<< std::endl
;
10401 tout(cct
) << size
<< std::endl
;
10402 tout(cct
) << offset
<< std::endl
;
10404 std::scoped_lock
lock(client_lock
);
10405 Fh
*fh
= get_filehandle(fd
);
10407 return -CEPHFS_EBADF
;
10408 #if defined(__linux__) && defined(O_PATH)
10409 if (fh
->flags
& O_PATH
)
10410 return -CEPHFS_EBADF
;
10412 /* We can't return bytes written larger than INT_MAX, clamp size to that */
10413 size
= std::min(size
, (loff_t
)INT_MAX
);
10414 int r
= _write(fh
, offset
, size
, buf
, NULL
, false);
10415 ldout(cct
, 3) << "write(" << fd
<< ", \"...\", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
10419 int Client::pwritev(int fd
, const struct iovec
*iov
, int iovcnt
, int64_t offset
)
10422 return -CEPHFS_EINVAL
;
10423 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, true);
10426 int64_t Client::_preadv_pwritev_locked(Fh
*fh
, const struct iovec
*iov
,
10427 unsigned iovcnt
, int64_t offset
,
10428 bool write
, bool clamp_to_int
)
10430 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
10432 #if defined(__linux__) && defined(O_PATH)
10433 if (fh
->flags
& O_PATH
)
10434 return -CEPHFS_EBADF
;
10436 loff_t totallen
= 0;
10437 for (unsigned i
= 0; i
< iovcnt
; i
++) {
10438 totallen
+= iov
[i
].iov_len
;
10442 * Some of the API functions take 64-bit size values, but only return
10443 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
10444 * we don't do I/Os larger than the values we can return.
10446 if (clamp_to_int
) {
10447 totallen
= std::min(totallen
, (loff_t
)INT_MAX
);
10450 int64_t w
= _write(fh
, offset
, totallen
, NULL
, iov
, iovcnt
);
10451 ldout(cct
, 3) << "pwritev(" << fh
<< ", \"...\", " << totallen
<< ", " << offset
<< ") = " << w
<< dendl
;
10455 int64_t r
= _read(fh
, offset
, totallen
, &bl
);
10456 ldout(cct
, 3) << "preadv(" << fh
<< ", " << offset
<< ") = " << r
<< dendl
;
10460 client_lock
.unlock();
10461 auto iter
= bl
.cbegin();
10462 for (unsigned j
= 0, resid
= r
; j
< iovcnt
&& resid
> 0; j
++) {
10464 * This piece of code aims to handle the case that bufferlist
10465 * does not have enough data to fill in the iov
10467 const auto round_size
= std::min
<unsigned>(resid
, iov
[j
].iov_len
);
10468 iter
.copy(round_size
, reinterpret_cast<char*>(iov
[j
].iov_base
));
10469 resid
-= round_size
;
10470 /* iter is self-updating */
10472 client_lock
.lock();
10477 int Client::_preadv_pwritev(int fd
, const struct iovec
*iov
, unsigned iovcnt
, int64_t offset
, bool write
)
10479 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10480 if (!mref_reader
.is_state_satisfied())
10481 return -CEPHFS_ENOTCONN
;
10483 tout(cct
) << fd
<< std::endl
;
10484 tout(cct
) << offset
<< std::endl
;
10486 std::scoped_lock
cl(client_lock
);
10487 Fh
*fh
= get_filehandle(fd
);
10489 return -CEPHFS_EBADF
;
10490 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, offset
, write
, true);
10493 int64_t Client::_write(Fh
*f
, int64_t offset
, uint64_t size
, const char *buf
,
10494 const struct iovec
*iov
, int iovcnt
)
10496 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
10499 Inode
*in
= f
->inode
.get();
10501 if ( (uint64_t)(offset
+size
) > mdsmap
->get_max_filesize() && //exceeds config
10502 (uint64_t)(offset
+size
) > in
->size
) { //exceeds filesize
10503 return -CEPHFS_EFBIG
;
10505 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
10507 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
10508 return -CEPHFS_ENOSPC
;
10511 ceph_assert(in
->snapid
== CEPH_NOSNAP
);
10513 // was Fh opened as writeable?
10514 if ((f
->mode
& CEPH_FILE_MODE_WR
) == 0)
10515 return -CEPHFS_EBADF
;
10517 // use/adjust fd pos?
10521 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
10522 * change out from under us.
10524 if (f
->flags
& O_APPEND
) {
10525 auto r
= _lseek(f
, 0, SEEK_END
);
10532 fpos
= offset
+size
;
10537 uint64_t endoff
= offset
+ size
;
10538 if (endoff
> in
->size
&& is_quota_bytes_exceeded(in
, endoff
- in
->size
,
10540 return -CEPHFS_EDQUOT
;
10543 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
10545 ldout(cct
, 10) << "cur file size is " << in
->size
<< dendl
;
10548 utime_t start
= ceph_clock_now();
10550 if (in
->inline_version
== 0) {
10551 int r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
10554 ceph_assert(in
->inline_version
> 0);
10557 // copy into fresh buffer (since our write may be resub, async)
10561 bl
.append(buf
, size
);
10563 for (int i
= 0; i
< iovcnt
; i
++) {
10564 if (iov
[i
].iov_len
> 0) {
10565 bl
.append((const char *)iov
[i
].iov_base
, iov
[i
].iov_len
);
10571 uint64_t totalwritten
;
10573 if (f
->mode
& CEPH_FILE_MODE_LAZY
)
10574 want
= CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
;
10576 want
= CEPH_CAP_FILE_BUFFER
;
10577 int r
= get_caps(f
, CEPH_CAP_FILE_WR
|CEPH_CAP_AUTH_SHARED
, want
, &have
, endoff
);
10581 /* clear the setuid/setgid bits, if any */
10582 if (unlikely(in
->mode
& (S_ISUID
|S_ISGID
)) && size
> 0) {
10583 struct ceph_statx stx
= { 0 };
10585 put_cap_ref(in
, CEPH_CAP_AUTH_SHARED
);
10586 r
= __setattrx(in
, &stx
, CEPH_SETATTR_KILL_SGUID
, f
->actor_perms
);
10590 put_cap_ref(in
, CEPH_CAP_AUTH_SHARED
);
10593 if (f
->flags
& O_DIRECT
)
10594 have
&= ~(CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
);
10596 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
10598 std::unique_ptr
<C_SaferCond
> onuninline
= nullptr;
10600 if (in
->inline_version
< CEPH_INLINE_NONE
) {
10601 if (endoff
> cct
->_conf
->client_max_inline_size
||
10602 endoff
> CEPH_INLINE_MAX_SIZE
||
10603 !(have
& CEPH_CAP_FILE_BUFFER
)) {
10604 onuninline
.reset(new C_SaferCond("Client::_write_uninline_data flock"));
10605 uninline_data(in
, onuninline
.get());
10607 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
10609 uint32_t len
= in
->inline_data
.length();
10612 in
->inline_data
.begin(endoff
).copy(len
- endoff
, bl
); // XXX
10615 in
->inline_data
.splice(offset
, len
- offset
);
10616 else if (offset
> len
)
10617 in
->inline_data
.append_zero(offset
- len
);
10619 in
->inline_data
.append(bl
);
10620 in
->inline_version
++;
10622 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
10628 if (cct
->_conf
->client_oc
&&
10629 (have
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
))) {
10630 // do buffered write
10631 if (!in
->oset
.dirty_or_tx
)
10632 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
10634 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
10636 // async, caching, non-blocking.
10637 r
= objectcacher
->file_write(&in
->oset
, &in
->layout
,
10638 in
->snaprealm
->get_snap_context(),
10639 offset
, size
, bl
, ceph::real_clock::now(),
10641 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
10646 // flush cached write if O_SYNC is set on file fh
10647 // O_DSYNC == O_SYNC on linux < 2.6.33
10648 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
10649 if ((f
->flags
& O_SYNC
) || (f
->flags
& O_DSYNC
)) {
10650 _flush_range(in
, offset
, size
);
10653 if (f
->flags
& O_DIRECT
)
10654 _flush_range(in
, offset
, size
);
10656 // simple, non-atomic sync write
10657 C_SaferCond
onfinish("Client::_write flock");
10658 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
10660 filer
->write_trunc(in
->ino
, &in
->layout
, in
->snaprealm
->get_snap_context(),
10661 offset
, size
, bl
, ceph::real_clock::now(), 0,
10662 in
->truncate_size
, in
->truncate_seq
,
10664 client_lock
.unlock();
10665 r
= onfinish
.wait();
10666 client_lock
.lock();
10667 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
10672 // if we get here, write was successful, update client metadata
10674 update_write_io_size(size
);
10676 lat
= ceph_clock_now();
10679 ++nr_write_request
;
10680 update_io_stat_write(lat
);
10687 totalwritten
= size
;
10688 r
= (int64_t)totalwritten
;
10691 if (totalwritten
+ offset
> in
->size
) {
10692 in
->size
= totalwritten
+ offset
;
10693 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
10695 if (is_quota_bytes_approaching(in
, f
->actor_perms
)) {
10696 check_caps(in
, CHECK_CAPS_NODELAY
);
10697 } else if (is_max_size_approaching(in
)) {
10701 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", extending file size" << dendl
;
10703 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", leaving file size at " << in
->size
<< dendl
;
10707 in
->mtime
= in
->ctime
= ceph_clock_now();
10709 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
10713 if (nullptr != onuninline
) {
10714 client_lock
.unlock();
10715 int uninline_ret
= onuninline
->wait();
10716 client_lock
.lock();
10718 if (uninline_ret
>= 0 || uninline_ret
== -CEPHFS_ECANCELED
) {
10719 in
->inline_data
.clear();
10720 in
->inline_version
= CEPH_INLINE_NONE
;
10721 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
10727 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
10731 int Client::_flush(Fh
*f
)
10733 Inode
*in
= f
->inode
.get();
10734 int err
= f
->take_async_err();
10736 ldout(cct
, 1) << __func__
<< ": " << f
<< " on inode " << *in
<< " caught async_err = "
10737 << cpp_strerror(err
) << dendl
;
10739 ldout(cct
, 10) << __func__
<< ": " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
10745 int Client::truncate(const char *relpath
, loff_t length
, const UserPerm
& perms
)
10747 struct ceph_statx stx
;
10748 stx
.stx_size
= length
;
10749 return setattrx(relpath
, &stx
, CEPH_SETATTR_SIZE
, perms
);
10752 int Client::ftruncate(int fd
, loff_t length
, const UserPerm
& perms
)
10754 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10755 if (!mref_reader
.is_state_satisfied())
10756 return -CEPHFS_ENOTCONN
;
10758 tout(cct
) << __func__
<< std::endl
;
10759 tout(cct
) << fd
<< std::endl
;
10760 tout(cct
) << length
<< std::endl
;
10762 std::scoped_lock
lock(client_lock
);
10763 Fh
*f
= get_filehandle(fd
);
10765 return -CEPHFS_EBADF
;
10766 #if defined(__linux__) && defined(O_PATH)
10767 if (f
->flags
& O_PATH
)
10768 return -CEPHFS_EBADF
;
10770 if ((f
->mode
& CEPH_FILE_MODE_WR
) == 0)
10771 return -CEPHFS_EBADF
;
10773 attr
.st_size
= length
;
10774 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_SIZE
, perms
);
10777 int Client::fsync(int fd
, bool syncdataonly
)
10779 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10780 if (!mref_reader
.is_state_satisfied())
10781 return -CEPHFS_ENOTCONN
;
10783 tout(cct
) << "fsync" << std::endl
;
10784 tout(cct
) << fd
<< std::endl
;
10785 tout(cct
) << syncdataonly
<< std::endl
;
10787 std::scoped_lock
lock(client_lock
);
10788 Fh
*f
= get_filehandle(fd
);
10790 return -CEPHFS_EBADF
;
10791 #if defined(__linux__) && defined(O_PATH)
10792 if (f
->flags
& O_PATH
)
10793 return -CEPHFS_EBADF
;
10795 int r
= _fsync(f
, syncdataonly
);
10797 // The IOs in this fsync were okay, but maybe something happened
10798 // in the background that we shoudl be reporting?
10799 r
= f
->take_async_err();
10800 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
10801 << ") = 0, async_err = " << r
<< dendl
;
10803 // Assume that an error we encountered during fsync, even reported
10804 // synchronously, would also have applied the error to the Fh, and we
10805 // should clear it here to avoid returning the same error again on next
10807 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
<< ") = "
10809 f
->take_async_err();
10814 int Client::_fsync(Inode
*in
, bool syncdataonly
)
10816 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
10819 std::unique_ptr
<C_SaferCond
> object_cacher_completion
= nullptr;
10820 ceph_tid_t flush_tid
= 0;
10823 utime_t start
= ceph_clock_now();
10825 ldout(cct
, 8) << "_fsync on " << *in
<< " " << (syncdataonly
? "(dataonly)":"(data+metadata)") << dendl
;
10827 if (cct
->_conf
->client_oc
) {
10828 object_cacher_completion
.reset(new C_SaferCond("Client::_fsync::lock"));
10829 tmp_ref
= in
; // take a reference; C_SaferCond doesn't and _flush won't either
10830 _flush(in
, object_cacher_completion
.get());
10831 ldout(cct
, 15) << "using return-valued form of _fsync" << dendl
;
10834 if (!syncdataonly
&& in
->dirty_caps
) {
10835 check_caps(in
, CHECK_CAPS_NODELAY
|CHECK_CAPS_SYNCHRONOUS
);
10836 if (in
->flushing_caps
)
10837 flush_tid
= last_flush_tid
;
10838 } else ldout(cct
, 10) << "no metadata needs to commit" << dendl
;
10840 if (!syncdataonly
&& !in
->unsafe_ops
.empty()) {
10841 flush_mdlog_sync(in
);
10843 MetaRequest
*req
= in
->unsafe_ops
.back();
10844 ldout(cct
, 15) << "waiting on unsafe requests, last tid " << req
->get_tid() << dendl
;
10847 wait_on_list(req
->waitfor_safe
);
10851 if (nullptr != object_cacher_completion
) { // wait on a real reply instead of guessing
10852 client_lock
.unlock();
10853 ldout(cct
, 15) << "waiting on data to flush" << dendl
;
10854 r
= object_cacher_completion
->wait();
10855 client_lock
.lock();
10856 ldout(cct
, 15) << "got " << r
<< " from flush writeback" << dendl
;
10858 // FIXME: this can starve
10859 while (in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] > 0) {
10860 ldout(cct
, 10) << "ino " << in
->ino
<< " has " << in
->cap_refs
[CEPH_CAP_FILE_BUFFER
]
10861 << " uncommitted, waiting" << dendl
;
10862 wait_on_list(in
->waitfor_commit
);
10868 wait_sync_caps(in
, flush_tid
);
10870 ldout(cct
, 10) << "ino " << in
->ino
<< " has no uncommitted writes" << dendl
;
10872 ldout(cct
, 8) << "ino " << in
->ino
<< " failed to commit to disk! "
10873 << cpp_strerror(-r
) << dendl
;
10876 lat
= ceph_clock_now();
10878 logger
->tinc(l_c_fsync
, lat
);
10883 int Client::_fsync(Fh
*f
, bool syncdataonly
)
10885 ldout(cct
, 8) << "_fsync(" << f
<< ", " << (syncdataonly
? "dataonly)":"data+metadata)") << dendl
;
10886 return _fsync(f
->inode
.get(), syncdataonly
);
10889 int Client::fstat(int fd
, struct stat
*stbuf
, const UserPerm
& perms
, int mask
)
10891 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10892 if (!mref_reader
.is_state_satisfied())
10893 return -CEPHFS_ENOTCONN
;
10895 tout(cct
) << "fstat mask " << hex
<< mask
<< dec
<< std::endl
;
10896 tout(cct
) << fd
<< std::endl
;
10898 std::scoped_lock
lock(client_lock
);
10899 Fh
*f
= get_filehandle(fd
);
10901 return -CEPHFS_EBADF
;
10902 int r
= _getattr(f
->inode
, mask
, perms
);
10905 fill_stat(f
->inode
, stbuf
, NULL
);
10906 ldout(cct
, 5) << "fstat(" << fd
<< ", " << stbuf
<< ") = " << r
<< dendl
;
10910 int Client::fstatx(int fd
, struct ceph_statx
*stx
, const UserPerm
& perms
,
10911 unsigned int want
, unsigned int flags
)
10913 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10914 if (!mref_reader
.is_state_satisfied())
10915 return -CEPHFS_ENOTCONN
;
10917 tout(cct
) << "fstatx flags " << hex
<< flags
<< " want " << want
<< dec
<< std::endl
;
10918 tout(cct
) << fd
<< std::endl
;
10920 std::scoped_lock
lock(client_lock
);
10921 Fh
*f
= get_filehandle(fd
);
10923 return -CEPHFS_EBADF
;
10925 unsigned mask
= statx_to_mask(flags
, want
);
10929 r
= _getattr(f
->inode
, mask
, perms
);
10931 ldout(cct
, 3) << "fstatx exit on error!" << dendl
;
10936 fill_statx(f
->inode
, mask
, stx
);
10937 ldout(cct
, 3) << "fstatx(" << fd
<< ", " << stx
<< ") = " << r
<< dendl
;
10941 int Client::statxat(int dirfd
, const char *relpath
,
10942 struct ceph_statx
*stx
, const UserPerm
& perms
,
10943 unsigned int want
, unsigned int flags
) {
10944 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10945 if (!mref_reader
.is_state_satisfied()) {
10946 return -CEPHFS_ENOTCONN
;
10949 tout(cct
) << __func__
<< " flags " << hex
<< flags
<< " want " << want
<< dec
<< std::endl
;
10950 tout(cct
) << dirfd
<< std::endl
;
10951 tout(cct
) << relpath
<< std::endl
;
10953 unsigned mask
= statx_to_mask(flags
, want
);
10956 std::scoped_lock
lock(client_lock
);
10957 int r
= get_fd_inode(dirfd
, &dirinode
);
10963 filepath
path(relpath
);
10964 r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
, dirinode
);
10968 r
= _getattr(in
, mask
, perms
);
10970 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
10974 fill_statx(in
, mask
, stx
);
10975 ldout(cct
, 3) << __func__
<< " dirfd" << dirfd
<< ", r= " << r
<< dendl
;
10979 // not written yet, but i want to link!
10981 int Client::chdir(const char *relpath
, std::string
&new_cwd
,
10982 const UserPerm
& perms
)
10984 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10985 if (!mref_reader
.is_state_satisfied())
10986 return -CEPHFS_ENOTCONN
;
10988 tout(cct
) << "chdir" << std::endl
;
10989 tout(cct
) << relpath
<< std::endl
;
10991 filepath
path(relpath
);
10994 std::scoped_lock
lock(client_lock
);
10995 int r
= path_walk(path
, &in
, perms
);
10999 if (!(in
.get()->is_dir()))
11000 return -CEPHFS_ENOTDIR
;
11004 ldout(cct
, 3) << "chdir(" << relpath
<< ") cwd now " << cwd
->ino
<< dendl
;
11006 _getcwd(new_cwd
, perms
);
11010 void Client::_getcwd(string
& dir
, const UserPerm
& perms
)
11013 ldout(cct
, 10) << __func__
<< " " << *cwd
<< dendl
;
11015 Inode
*in
= cwd
.get();
11016 while (in
!= root
.get()) {
11017 ceph_assert(in
->dentries
.size() < 2); // dirs can't be hard-linked
11019 // A cwd or ancester is unlinked
11020 if (in
->dentries
.empty()) {
11024 Dentry
*dn
= in
->get_first_parent();
11029 ldout(cct
, 10) << __func__
<< " looking up parent for " << *in
<< dendl
;
11030 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
11031 filepath
path(in
->ino
);
11032 req
->set_filepath(path
);
11033 req
->set_inode(in
);
11034 int res
= make_request(req
, perms
);
11043 path
.push_front_dentry(dn
->name
);
11044 in
= dn
->dir
->parent_inode
;
11047 dir
+= path
.get_path();
11050 void Client::getcwd(string
& dir
, const UserPerm
& perms
)
11052 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11053 if (!mref_reader
.is_state_satisfied())
11056 std::scoped_lock
l(client_lock
);
11058 _getcwd(dir
, perms
);
11061 int Client::statfs(const char *path
, struct statvfs
*stbuf
,
11062 const UserPerm
& perms
)
11064 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11065 if (!mref_reader
.is_state_satisfied())
11066 return -CEPHFS_ENOTCONN
;
11068 tout(cct
) << __func__
<< std::endl
;
11069 unsigned long int total_files_on_fs
;
11074 std::unique_lock
lock(client_lock
);
11075 const vector
<int64_t> &data_pools
= mdsmap
->get_data_pools();
11076 if (data_pools
.size() == 1) {
11077 objecter
->get_fs_stats(stats
, data_pools
[0], &cond
);
11079 objecter
->get_fs_stats(stats
, std::optional
<int64_t>(), &cond
);
11083 int rval
= cond
.wait();
11087 total_files_on_fs
= root
->rstat
.rfiles
+ root
->rstat
.rsubdirs
;
11090 ldout(cct
, 1) << "underlying call to statfs returned error: "
11091 << cpp_strerror(rval
)
11096 memset(stbuf
, 0, sizeof(*stbuf
));
11099 * we're going to set a block size of 4MB so we can represent larger
11100 * FSes without overflowing. Additionally convert the space
11101 * measurements from KB to bytes while making them in terms of
11102 * blocks. We use 4MB only because it is big enough, and because it
11103 * actually *is* the (ceph) default block size.
11105 const int CEPH_BLOCK_SHIFT
= 22;
11106 stbuf
->f_frsize
= 1 << CEPH_BLOCK_SHIFT
;
11107 stbuf
->f_bsize
= 1 << CEPH_BLOCK_SHIFT
;
11108 stbuf
->f_files
= total_files_on_fs
;
11109 stbuf
->f_ffree
= -1;
11110 stbuf
->f_favail
= -1;
11111 stbuf
->f_fsid
= -1; // ??
11112 stbuf
->f_flag
= 0; // ??
11113 stbuf
->f_namemax
= NAME_MAX
;
11115 // Usually quota_root will == root_ancestor, but if the mount root has no
11116 // quota but we can see a parent of it that does have a quota, we'll
11117 // respect that one instead.
11118 ceph_assert(root
!= nullptr);
11119 InodeRef quota_root
= root
->quota
.is_enable() ? root
: get_quota_root(root
.get(), perms
);
11121 // get_quota_root should always give us something if client quotas are
11123 ceph_assert(cct
->_conf
.get_val
<bool>("client_quota") == false || quota_root
!= nullptr);
11125 if (quota_root
&& cct
->_conf
->client_quota_df
&& quota_root
->quota
.max_bytes
) {
11127 // Skip the getattr if any sessions are stale, as we don't want to
11128 // block `df` if this client has e.g. been evicted, or if the MDS cluster
11130 if (!_any_stale_sessions()) {
11131 int r
= _getattr(quota_root
, 0, perms
, true);
11133 // Ignore return value: error getting latest inode metadata is not a good
11134 // reason to break "df".
11135 lderr(cct
) << "Error in getattr on quota root 0x"
11136 << std::hex
<< quota_root
->ino
<< std::dec
11137 << " statfs result may be outdated" << dendl
;
11141 // Special case: if there is a size quota set on the Inode acting
11142 // as the root for this client mount, then report the quota status
11143 // as the filesystem statistics.
11144 const fsblkcnt_t total
= quota_root
->quota
.max_bytes
>> CEPH_BLOCK_SHIFT
;
11145 const fsblkcnt_t used
= quota_root
->rstat
.rbytes
>> CEPH_BLOCK_SHIFT
;
11146 // It is possible for a quota to be exceeded: arithmetic here must
11147 // handle case where used > total.
11148 const fsblkcnt_t free
= total
> used
? total
- used
: 0;
11150 stbuf
->f_blocks
= total
;
11151 stbuf
->f_bfree
= free
;
11152 stbuf
->f_bavail
= free
;
11154 // General case: report the cluster statistics returned from RADOS. Because
11155 // multiple pools may be used without one filesystem namespace via
11156 // layouts, this is the most correct thing we can do.
11157 stbuf
->f_blocks
= stats
.kb
>> (CEPH_BLOCK_SHIFT
- 10);
11158 stbuf
->f_bfree
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
11159 stbuf
->f_bavail
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
11165 int Client::_do_filelock(Inode
*in
, Fh
*fh
, int lock_type
, int op
, int sleep
,
11166 struct flock
*fl
, uint64_t owner
, bool removing
)
11168 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
11169 << (lock_type
== CEPH_LOCK_FCNTL
? " fcntl" : " flock")
11170 << " type " << fl
->l_type
<< " owner " << owner
11171 << " " << fl
->l_start
<< "~" << fl
->l_len
<< dendl
;
11173 if (in
->flags
& I_ERROR_FILELOCK
)
11174 return -CEPHFS_EIO
;
11177 if (F_RDLCK
== fl
->l_type
)
11178 lock_cmd
= CEPH_LOCK_SHARED
;
11179 else if (F_WRLCK
== fl
->l_type
)
11180 lock_cmd
= CEPH_LOCK_EXCL
;
11181 else if (F_UNLCK
== fl
->l_type
)
11182 lock_cmd
= CEPH_LOCK_UNLOCK
;
11184 return -CEPHFS_EIO
;
11186 if (op
!= CEPH_MDS_OP_SETFILELOCK
|| lock_cmd
== CEPH_LOCK_UNLOCK
)
11190 * Set the most significant bit, so that MDS knows the 'owner'
11191 * is sufficient to identify the owner of lock. (old code uses
11192 * both 'owner' and 'pid')
11194 owner
|= (1ULL << 63);
11196 MetaRequest
*req
= new MetaRequest(op
);
11198 in
->make_nosnap_relative_path(path
);
11199 req
->set_filepath(path
);
11200 req
->set_inode(in
);
11202 req
->head
.args
.filelock_change
.rule
= lock_type
;
11203 req
->head
.args
.filelock_change
.type
= lock_cmd
;
11204 req
->head
.args
.filelock_change
.owner
= owner
;
11205 req
->head
.args
.filelock_change
.pid
= fl
->l_pid
;
11206 req
->head
.args
.filelock_change
.start
= fl
->l_start
;
11207 req
->head
.args
.filelock_change
.length
= fl
->l_len
;
11208 req
->head
.args
.filelock_change
.wait
= sleep
;
11213 if (sleep
&& switch_interrupt_cb
) {
11214 // enable interrupt
11215 switch_interrupt_cb(callback_handle
, req
->get());
11216 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
11217 // disable interrupt
11218 switch_interrupt_cb(callback_handle
, NULL
);
11219 if (ret
== 0 && req
->aborted()) {
11220 // effect of this lock request has been revoked by the 'lock intr' request
11221 ret
= req
->get_abort_code();
11225 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
11229 if (op
== CEPH_MDS_OP_GETFILELOCK
) {
11230 ceph_filelock filelock
;
11231 auto p
= bl
.cbegin();
11232 decode(filelock
, p
);
11234 if (CEPH_LOCK_SHARED
== filelock
.type
)
11235 fl
->l_type
= F_RDLCK
;
11236 else if (CEPH_LOCK_EXCL
== filelock
.type
)
11237 fl
->l_type
= F_WRLCK
;
11239 fl
->l_type
= F_UNLCK
;
11241 fl
->l_whence
= SEEK_SET
;
11242 fl
->l_start
= filelock
.start
;
11243 fl
->l_len
= filelock
.length
;
11244 fl
->l_pid
= filelock
.pid
;
11245 } else if (op
== CEPH_MDS_OP_SETFILELOCK
) {
11246 ceph_lock_state_t
*lock_state
;
11247 if (lock_type
== CEPH_LOCK_FCNTL
) {
11248 if (!in
->fcntl_locks
)
11249 in
->fcntl_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
));
11250 lock_state
= in
->fcntl_locks
.get();
11251 } else if (lock_type
== CEPH_LOCK_FLOCK
) {
11252 if (!in
->flock_locks
)
11253 in
->flock_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
));
11254 lock_state
= in
->flock_locks
.get();
11257 return -CEPHFS_EINVAL
;
11259 _update_lock_state(fl
, owner
, lock_state
);
11262 if (lock_type
== CEPH_LOCK_FCNTL
) {
11263 if (!fh
->fcntl_locks
)
11264 fh
->fcntl_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
));
11265 lock_state
= fh
->fcntl_locks
.get();
11267 if (!fh
->flock_locks
)
11268 fh
->flock_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
));
11269 lock_state
= fh
->flock_locks
.get();
11271 _update_lock_state(fl
, owner
, lock_state
);
11279 int Client::_interrupt_filelock(MetaRequest
*req
)
11281 // Set abort code, but do not kick. The abort code prevents the request
11282 // from being re-sent.
11283 req
->abort(-CEPHFS_EINTR
);
11285 return 0; // haven't sent the request
11287 Inode
*in
= req
->inode();
11290 if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FLOCK
)
11291 lock_type
= CEPH_LOCK_FLOCK_INTR
;
11292 else if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FCNTL
)
11293 lock_type
= CEPH_LOCK_FCNTL_INTR
;
11296 return -CEPHFS_EINVAL
;
11299 MetaRequest
*intr_req
= new MetaRequest(CEPH_MDS_OP_SETFILELOCK
);
11301 in
->make_nosnap_relative_path(path
);
11302 intr_req
->set_filepath(path
);
11303 intr_req
->set_inode(in
);
11304 intr_req
->head
.args
.filelock_change
= req
->head
.args
.filelock_change
;
11305 intr_req
->head
.args
.filelock_change
.rule
= lock_type
;
11306 intr_req
->head
.args
.filelock_change
.type
= CEPH_LOCK_UNLOCK
;
11308 UserPerm
perms(req
->get_uid(), req
->get_gid());
11309 return make_request(intr_req
, perms
, NULL
, NULL
, -1);
11312 void Client::_encode_filelocks(Inode
*in
, bufferlist
& bl
)
11314 if (!in
->fcntl_locks
&& !in
->flock_locks
)
11317 unsigned nr_fcntl_locks
= in
->fcntl_locks
? in
->fcntl_locks
->held_locks
.size() : 0;
11318 encode(nr_fcntl_locks
, bl
);
11319 if (nr_fcntl_locks
) {
11320 auto &lock_state
= in
->fcntl_locks
;
11321 for(auto p
= lock_state
->held_locks
.begin();
11322 p
!= lock_state
->held_locks
.end();
11324 encode(p
->second
, bl
);
11327 unsigned nr_flock_locks
= in
->flock_locks
? in
->flock_locks
->held_locks
.size() : 0;
11328 encode(nr_flock_locks
, bl
);
11329 if (nr_flock_locks
) {
11330 auto &lock_state
= in
->flock_locks
;
11331 for(auto p
= lock_state
->held_locks
.begin();
11332 p
!= lock_state
->held_locks
.end();
11334 encode(p
->second
, bl
);
11337 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< ", " << nr_fcntl_locks
11338 << " fcntl locks, " << nr_flock_locks
<< " flock locks" << dendl
;
11341 void Client::_release_filelocks(Fh
*fh
)
11343 if (!fh
->fcntl_locks
&& !fh
->flock_locks
)
11346 Inode
*in
= fh
->inode
.get();
11347 ldout(cct
, 10) << __func__
<< " " << fh
<< " ino " << in
->ino
<< dendl
;
11349 list
<ceph_filelock
> activated_locks
;
11351 list
<pair
<int, ceph_filelock
> > to_release
;
11353 if (fh
->fcntl_locks
) {
11354 auto &lock_state
= fh
->fcntl_locks
;
11355 for(auto p
= lock_state
->held_locks
.begin(); p
!= lock_state
->held_locks
.end(); ) {
11357 if (in
->flags
& I_ERROR_FILELOCK
) {
11358 lock_state
->remove_lock(q
->second
, activated_locks
);
11360 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FCNTL
, q
->second
));
11363 lock_state
.reset();
11365 if (fh
->flock_locks
) {
11366 auto &lock_state
= fh
->flock_locks
;
11367 for(auto p
= lock_state
->held_locks
.begin(); p
!= lock_state
->held_locks
.end(); ) {
11369 if (in
->flags
& I_ERROR_FILELOCK
) {
11370 lock_state
->remove_lock(q
->second
, activated_locks
);
11372 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FLOCK
, q
->second
));
11375 lock_state
.reset();
11378 if ((in
->flags
& I_ERROR_FILELOCK
) && !in
->has_any_filelocks())
11379 in
->flags
&= ~I_ERROR_FILELOCK
;
11381 if (to_release
.empty())
11385 memset(&fl
, 0, sizeof(fl
));
11386 fl
.l_whence
= SEEK_SET
;
11387 fl
.l_type
= F_UNLCK
;
11389 for (list
<pair
<int, ceph_filelock
> >::iterator p
= to_release
.begin();
11390 p
!= to_release
.end();
11392 fl
.l_start
= p
->second
.start
;
11393 fl
.l_len
= p
->second
.length
;
11394 fl
.l_pid
= p
->second
.pid
;
11395 _do_filelock(in
, fh
, p
->first
, CEPH_MDS_OP_SETFILELOCK
, 0, &fl
,
11396 p
->second
.owner
, true);
11400 void Client::_update_lock_state(struct flock
*fl
, uint64_t owner
,
11401 ceph_lock_state_t
*lock_state
)
11404 if (F_RDLCK
== fl
->l_type
)
11405 lock_cmd
= CEPH_LOCK_SHARED
;
11406 else if (F_WRLCK
== fl
->l_type
)
11407 lock_cmd
= CEPH_LOCK_EXCL
;
11409 lock_cmd
= CEPH_LOCK_UNLOCK
;;
11411 ceph_filelock filelock
;
11412 filelock
.start
= fl
->l_start
;
11413 filelock
.length
= fl
->l_len
;
11414 filelock
.client
= 0;
11415 // see comment in _do_filelock()
11416 filelock
.owner
= owner
| (1ULL << 63);
11417 filelock
.pid
= fl
->l_pid
;
11418 filelock
.type
= lock_cmd
;
11420 if (filelock
.type
== CEPH_LOCK_UNLOCK
) {
11421 list
<ceph_filelock
> activated_locks
;
11422 lock_state
->remove_lock(filelock
, activated_locks
);
11424 bool r
= lock_state
->add_lock(filelock
, false, false, NULL
);
11429 int Client::_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
11431 Inode
*in
= fh
->inode
.get();
11432 ldout(cct
, 10) << "_getlk " << fh
<< " ino " << in
->ino
<< dendl
;
11433 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_GETFILELOCK
, 0, fl
, owner
);
11437 int Client::_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
11439 Inode
*in
= fh
->inode
.get();
11440 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< dendl
;
11441 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_SETFILELOCK
, sleep
, fl
, owner
);
11442 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
11446 int Client::_flock(Fh
*fh
, int cmd
, uint64_t owner
)
11448 Inode
*in
= fh
->inode
.get();
11449 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< dendl
;
11451 int sleep
= !(cmd
& LOCK_NB
);
11466 return -CEPHFS_EINVAL
;
11470 memset(&fl
, 0, sizeof(fl
));
11472 fl
.l_whence
= SEEK_SET
;
11474 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FLOCK
, CEPH_MDS_OP_SETFILELOCK
, sleep
, &fl
, owner
);
11475 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
11479 int Client::get_snap_info(const char *path
, const UserPerm
&perms
, SnapInfo
*snap_info
) {
11480 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11481 if (!mref_reader
.is_state_satisfied()) {
11482 return -CEPHFS_ENOTCONN
;
11485 std::scoped_lock
lock(client_lock
);
11487 int r
= Client::path_walk(path
, &in
, perms
, true);
11492 if (in
->snapid
== CEPH_NOSNAP
) {
11493 return -CEPHFS_EINVAL
;
11496 snap_info
->id
= in
->snapid
;
11497 snap_info
->metadata
= in
->snap_metadata
;
11501 int Client::ll_statfs(Inode
*in
, struct statvfs
*stbuf
, const UserPerm
& perms
)
11503 /* Since the only thing this does is wrap a call to statfs, and
11504 statfs takes a lock, it doesn't seem we have a need to split it
11506 return statfs(0, stbuf
, perms
);
11509 void Client::_ll_register_callbacks(struct ceph_client_callback_args
*args
)
11514 ldout(cct
, 10) << __func__
<< " cb " << args
->handle
11515 << " invalidate_ino_cb " << args
->ino_cb
11516 << " invalidate_dentry_cb " << args
->dentry_cb
11517 << " switch_interrupt_cb " << args
->switch_intr_cb
11518 << " remount_cb " << args
->remount_cb
11520 callback_handle
= args
->handle
;
11521 if (args
->ino_cb
) {
11522 ino_invalidate_cb
= args
->ino_cb
;
11523 async_ino_invalidator
.start();
11525 if (args
->dentry_cb
) {
11526 dentry_invalidate_cb
= args
->dentry_cb
;
11527 async_dentry_invalidator
.start();
11529 if (args
->switch_intr_cb
) {
11530 switch_interrupt_cb
= args
->switch_intr_cb
;
11531 interrupt_finisher
.start();
11533 if (args
->remount_cb
) {
11534 remount_cb
= args
->remount_cb
;
11535 remount_finisher
.start();
11537 if (args
->ino_release_cb
) {
11538 ino_release_cb
= args
->ino_release_cb
;
11539 async_ino_releasor
.start();
11541 if (args
->umask_cb
)
11542 umask_cb
= args
->umask_cb
;
11545 // This is deprecated, use ll_register_callbacks2() instead.
11546 void Client::ll_register_callbacks(struct ceph_client_callback_args
*args
)
11548 ceph_assert(!is_mounting() && !is_mounted() && !is_unmounting());
11550 _ll_register_callbacks(args
);
11553 int Client::ll_register_callbacks2(struct ceph_client_callback_args
*args
)
11555 if (is_mounting() || is_mounted() || is_unmounting())
11556 return -CEPHFS_EBUSY
;
11558 _ll_register_callbacks(args
);
11562 std::pair
<int, bool> Client::test_dentry_handling(bool can_invalidate
)
11564 std::pair
<int, bool> r(0, false);
11566 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
11567 if (!iref_reader
.is_state_satisfied())
11568 return std::make_pair(-CEPHFS_ENOTCONN
, false);
11570 can_invalidate_dentries
= can_invalidate
;
11572 if (can_invalidate_dentries
) {
11573 ceph_assert(dentry_invalidate_cb
);
11574 ldout(cct
, 1) << "using dentry_invalidate_cb" << dendl
;
11576 ceph_assert(remount_cb
);
11577 ldout(cct
, 1) << "using remount_cb" << dendl
;
11578 r
= _do_remount(false);
11584 int Client::_sync_fs()
11586 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
11588 ldout(cct
, 10) << __func__
<< dendl
;
11591 std::unique_ptr
<C_SaferCond
> cond
= nullptr;
11592 if (cct
->_conf
->client_oc
) {
11593 cond
.reset(new C_SaferCond("Client::_sync_fs:lock"));
11594 objectcacher
->flush_all(cond
.get());
11599 ceph_tid_t flush_tid
= last_flush_tid
;
11601 // wait for unsafe mds requests
11602 wait_unsafe_requests();
11604 wait_sync_caps(flush_tid
);
11606 if (nullptr != cond
) {
11607 client_lock
.unlock();
11608 ldout(cct
, 15) << __func__
<< " waiting on data to flush" << dendl
;
11610 ldout(cct
, 15) << __func__
<< " flush finished" << dendl
;
11611 client_lock
.lock();
11617 int Client::sync_fs()
11619 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11620 if (!mref_reader
.is_state_satisfied())
11621 return -CEPHFS_ENOTCONN
;
11623 std::scoped_lock
l(client_lock
);
11628 int64_t Client::drop_caches()
11630 std::scoped_lock
l(client_lock
);
11631 return objectcacher
->release_all();
11634 int Client::_lazyio(Fh
*fh
, int enable
)
11636 Inode
*in
= fh
->inode
.get();
11637 ldout(cct
, 20) << __func__
<< " " << *in
<< " " << !!enable
<< dendl
;
11639 if (!!(fh
->mode
& CEPH_FILE_MODE_LAZY
) == !!enable
)
11642 int orig_mode
= fh
->mode
;
11644 fh
->mode
|= CEPH_FILE_MODE_LAZY
;
11645 in
->get_open_ref(fh
->mode
);
11646 in
->put_open_ref(orig_mode
);
11647 check_caps(in
, CHECK_CAPS_NODELAY
);
11649 fh
->mode
&= ~CEPH_FILE_MODE_LAZY
;
11650 in
->get_open_ref(fh
->mode
);
11651 in
->put_open_ref(orig_mode
);
11658 int Client::lazyio(int fd
, int enable
)
11660 std::scoped_lock
l(client_lock
);
11661 Fh
*f
= get_filehandle(fd
);
11663 return -CEPHFS_EBADF
;
11665 return _lazyio(f
, enable
);
11668 int Client::ll_lazyio(Fh
*fh
, int enable
)
11670 ldout(cct
, 3) << __func__
<< " " << fh
<< " " << fh
->inode
->ino
<< " " << !!enable
<< dendl
;
11671 tout(cct
) << __func__
<< std::endl
;
11673 std::scoped_lock
lock(client_lock
);
11674 return _lazyio(fh
, enable
);
11677 int Client::lazyio_propagate(int fd
, loff_t offset
, size_t count
)
11679 std::scoped_lock
l(client_lock
);
11680 ldout(cct
, 3) << "op: client->lazyio_propagate(" << fd
11681 << ", " << offset
<< ", " << count
<< ")" << dendl
;
11683 Fh
*f
= get_filehandle(fd
);
11685 return -CEPHFS_EBADF
;
11693 int Client::lazyio_synchronize(int fd
, loff_t offset
, size_t count
)
11695 std::scoped_lock
l(client_lock
);
11696 ldout(cct
, 3) << "op: client->lazyio_synchronize(" << fd
11697 << ", " << offset
<< ", " << count
<< ")" << dendl
;
11699 Fh
*f
= get_filehandle(fd
);
11701 return -CEPHFS_EBADF
;
11702 Inode
*in
= f
->inode
.get();
11705 if (_release(in
)) {
11706 int r
=_getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
11714 // =============================
11717 int Client::mksnap(const char *relpath
, const char *name
, const UserPerm
& perm
,
11718 mode_t mode
, const std::map
<std::string
, std::string
> &metadata
)
11720 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11721 if (!mref_reader
.is_state_satisfied())
11722 return -CEPHFS_ENOTCONN
;
11724 std::scoped_lock
l(client_lock
);
11726 filepath
path(relpath
);
11728 int r
= path_walk(path
, &in
, perm
);
11731 if (cct
->_conf
->client_permissions
) {
11732 r
= may_create(in
.get(), perm
);
11736 Inode
*snapdir
= open_snapdir(in
.get());
11737 return _mkdir(snapdir
, name
, mode
, perm
, nullptr, metadata
);
11740 int Client::rmsnap(const char *relpath
, const char *name
, const UserPerm
& perms
, bool check_perms
)
11742 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11743 if (!mref_reader
.is_state_satisfied())
11744 return -CEPHFS_ENOTCONN
;
11746 std::scoped_lock
l(client_lock
);
11748 filepath
path(relpath
);
11750 int r
= path_walk(path
, &in
, perms
);
11753 Inode
*snapdir
= open_snapdir(in
.get());
11754 if (cct
->_conf
->client_permissions
) {
11755 r
= may_delete(snapdir
, check_perms
? name
: NULL
, perms
);
11759 return _rmdir(snapdir
, name
, perms
);
11762 // =============================
11765 int Client::get_caps_issued(int fd
)
11767 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11768 if (!mref_reader
.is_state_satisfied())
11769 return -CEPHFS_ENOTCONN
;
11771 std::scoped_lock
lock(client_lock
);
11773 Fh
*f
= get_filehandle(fd
);
11775 return -CEPHFS_EBADF
;
11777 return f
->inode
->caps_issued();
11780 int Client::get_caps_issued(const char *path
, const UserPerm
& perms
)
11782 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11783 if (!mref_reader
.is_state_satisfied())
11784 return -CEPHFS_ENOTCONN
;
11786 std::scoped_lock
lock(client_lock
);
11790 int r
= path_walk(p
, &in
, perms
, true);
11793 return in
->caps_issued();
11796 // =========================================
11799 Inode
*Client::open_snapdir(Inode
*diri
)
11802 vinodeno_t
vino(diri
->ino
, CEPH_SNAPDIR
);
11803 if (!inode_map
.count(vino
)) {
11804 in
= new Inode(this, vino
, &diri
->layout
);
11806 in
->ino
= diri
->ino
;
11807 in
->snapid
= CEPH_SNAPDIR
;
11808 in
->mode
= diri
->mode
;
11809 in
->uid
= diri
->uid
;
11810 in
->gid
= diri
->gid
;
11812 in
->mtime
= diri
->mtime
;
11813 in
->ctime
= diri
->ctime
;
11814 in
->btime
= diri
->btime
;
11815 in
->atime
= diri
->atime
;
11816 in
->size
= diri
->size
;
11817 in
->change_attr
= diri
->change_attr
;
11819 in
->dirfragtree
.clear();
11820 in
->snapdir_parent
= diri
;
11821 diri
->flags
|= I_SNAPDIR_OPEN
;
11822 inode_map
[vino
] = in
;
11823 if (use_faked_inos())
11824 _assign_faked_ino(in
);
11825 ldout(cct
, 10) << "open_snapdir created snapshot inode " << *in
<< dendl
;
11827 in
= inode_map
[vino
];
11828 ldout(cct
, 10) << "open_snapdir had snapshot inode " << *in
<< dendl
;
11833 int Client::ll_lookup(Inode
*parent
, const char *name
, struct stat
*attr
,
11834 Inode
**out
, const UserPerm
& perms
)
11836 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11837 if (!mref_reader
.is_state_satisfied())
11838 return -CEPHFS_ENOTCONN
;
11840 vinodeno_t vparent
= _get_vino(parent
);
11841 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
<< dendl
;
11842 tout(cct
) << __func__
<< std::endl
;
11843 tout(cct
) << name
<< std::endl
;
11845 std::scoped_lock
lock(client_lock
);
11848 if (!fuse_default_permissions
) {
11849 if (strcmp(name
, ".") && strcmp(name
, "..")) {
11850 r
= may_lookup(parent
, perms
);
11856 string
dname(name
);
11859 r
= _lookup(parent
, dname
, CEPH_STAT_CAP_INODE_ALL
, &in
, perms
);
11866 fill_stat(in
, attr
);
11870 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
11871 << " -> " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
11872 tout(cct
) << attr
->st_ino
<< std::endl
;
11877 int Client::ll_lookup_vino(
11879 const UserPerm
& perms
,
11882 ceph_assert(inode
!= NULL
);
11883 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11884 if (!mref_reader
.is_state_satisfied())
11885 return -CEPHFS_ENOTCONN
;
11887 if (is_reserved_vino(vino
))
11888 return -CEPHFS_ESTALE
;
11890 std::scoped_lock
lock(client_lock
);
11891 ldout(cct
, 3) << __func__
<< " " << vino
<< dendl
;
11893 // Check the cache first
11894 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
11895 if (p
!= inode_map
.end()) {
11896 *inode
= p
->second
;
11901 uint64_t snapid
= vino
.snapid
;
11903 // for snapdir, find the non-snapped dir inode
11904 if (snapid
== CEPH_SNAPDIR
)
11905 vino
.snapid
= CEPH_NOSNAP
;
11907 int r
= _lookup_vino(vino
, perms
, inode
);
11910 ceph_assert(*inode
!= NULL
);
11912 if (snapid
== CEPH_SNAPDIR
) {
11913 Inode
*tmp
= *inode
;
11915 // open the snapdir and put the inode ref
11916 *inode
= open_snapdir(tmp
);
11917 _ll_forget(tmp
, 1);
11923 int Client::ll_lookup_inode(
11924 struct inodeno_t ino
,
11925 const UserPerm
& perms
,
11928 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
11929 return ll_lookup_vino(vino
, perms
, inode
);
11932 int Client::ll_lookupx(Inode
*parent
, const char *name
, Inode
**out
,
11933 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
11934 const UserPerm
& perms
)
11936 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11937 if (!mref_reader
.is_state_satisfied())
11938 return -CEPHFS_ENOTCONN
;
11940 vinodeno_t vparent
= _get_vino(parent
);
11941 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
<< dendl
;
11942 tout(cct
) << "ll_lookupx" << std::endl
;
11943 tout(cct
) << name
<< std::endl
;
11945 std::scoped_lock
lock(client_lock
);
11948 if (!fuse_default_permissions
) {
11949 r
= may_lookup(parent
, perms
);
11954 string
dname(name
);
11957 unsigned mask
= statx_to_mask(flags
, want
);
11958 r
= _lookup(parent
, dname
, mask
, &in
, perms
);
11964 fill_statx(in
, mask
, stx
);
11968 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
11969 << " -> " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
11970 tout(cct
) << stx
->stx_ino
<< std::endl
;
11975 int Client::ll_walk(const char* name
, Inode
**out
, struct ceph_statx
*stx
,
11976 unsigned int want
, unsigned int flags
, const UserPerm
& perms
)
11978 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11979 if (!mref_reader
.is_state_satisfied())
11980 return -CEPHFS_ENOTCONN
;
11982 filepath
fp(name
, 0);
11985 unsigned mask
= statx_to_mask(flags
, want
);
11987 ldout(cct
, 3) << __func__
<< " " << name
<< dendl
;
11988 tout(cct
) << __func__
<< std::endl
;
11989 tout(cct
) << name
<< std::endl
;
11991 std::scoped_lock
lock(client_lock
);
11992 rc
= path_walk(fp
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
);
11994 /* zero out mask, just in case... */
12001 fill_statx(in
, mask
, stx
);
12008 void Client::_ll_get(Inode
*in
)
12010 if (in
->ll_ref
== 0) {
12012 if (in
->is_dir() && !in
->dentries
.empty()) {
12013 ceph_assert(in
->dentries
.size() == 1); // dirs can't be hard-linked
12014 in
->get_first_parent()->get(); // pin dentry
12016 if (in
->snapid
!= CEPH_NOSNAP
)
12017 ll_snap_ref
[in
->snapid
]++;
12020 ldout(cct
, 20) << __func__
<< " " << in
<< " " << in
->ino
<< " -> " << in
->ll_ref
<< dendl
;
12023 int Client::_ll_put(Inode
*in
, uint64_t num
)
12026 ldout(cct
, 20) << __func__
<< " " << in
<< " " << in
->ino
<< " " << num
<< " -> " << in
->ll_ref
<< dendl
;
12027 if (in
->ll_ref
== 0) {
12028 if (in
->is_dir() && !in
->dentries
.empty()) {
12029 ceph_assert(in
->dentries
.size() == 1); // dirs can't be hard-linked
12030 in
->get_first_parent()->put(); // unpin dentry
12032 if (in
->snapid
!= CEPH_NOSNAP
) {
12033 auto p
= ll_snap_ref
.find(in
->snapid
);
12034 ceph_assert(p
!= ll_snap_ref
.end());
12035 ceph_assert(p
->second
> 0);
12036 if (--p
->second
== 0)
12037 ll_snap_ref
.erase(p
);
12046 void Client::_ll_drop_pins()
12048 ldout(cct
, 10) << __func__
<< dendl
;
12049 std::set
<InodeRef
> to_be_put
; //this set will be deconstructed item by item when exit
12050 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator next
;
12051 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
12052 it
!= inode_map
.end();
12054 Inode
*in
= it
->second
;
12058 to_be_put
.insert(in
);
12059 _ll_put(in
, in
->ll_ref
);
12064 bool Client::_ll_forget(Inode
*in
, uint64_t count
)
12066 inodeno_t ino
= in
->ino
;
12068 ldout(cct
, 8) << __func__
<< " " << ino
<< " " << count
<< dendl
;
12069 tout(cct
) << __func__
<< std::endl
;
12070 tout(cct
) << ino
.val
<< std::endl
;
12071 tout(cct
) << count
<< std::endl
;
12073 // Ignore forget if we're no longer mounted
12074 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12075 if (!mref_reader
.is_state_satisfied())
12078 if (ino
== 1) return true; // ignore forget on root.
12081 if (in
->ll_ref
< count
) {
12082 ldout(cct
, 1) << "WARNING: ll_forget on " << ino
<< " " << count
12083 << ", which only has ll_ref=" << in
->ll_ref
<< dendl
;
12084 _ll_put(in
, in
->ll_ref
);
12087 if (_ll_put(in
, count
) == 0)
12094 bool Client::ll_forget(Inode
*in
, uint64_t count
)
12096 std::scoped_lock
lock(client_lock
);
12097 return _ll_forget(in
, count
);
12100 bool Client::ll_put(Inode
*in
)
12102 /* ll_forget already takes the lock */
12103 return ll_forget(in
, 1);
12106 int Client::ll_get_snap_ref(snapid_t snap
)
12108 std::scoped_lock
lock(client_lock
);
12109 auto p
= ll_snap_ref
.find(snap
);
12110 if (p
!= ll_snap_ref
.end())
12115 snapid_t
Client::ll_get_snapid(Inode
*in
)
12117 std::scoped_lock
lock(client_lock
);
12121 Inode
*Client::ll_get_inode(ino_t ino
)
12123 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12124 if (!mref_reader
.is_state_satisfied())
12127 std::scoped_lock
lock(client_lock
);
12129 vinodeno_t vino
= _map_faked_ino(ino
);
12130 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
12131 if (p
== inode_map
.end())
12133 Inode
*in
= p
->second
;
12138 Inode
*Client::ll_get_inode(vinodeno_t vino
)
12140 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12141 if (!mref_reader
.is_state_satisfied())
12144 if (is_reserved_vino(vino
))
12147 std::scoped_lock
lock(client_lock
);
12149 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
12150 if (p
== inode_map
.end())
12152 Inode
*in
= p
->second
;
12157 int Client::_ll_getattr(Inode
*in
, int caps
, const UserPerm
& perms
)
12159 vinodeno_t vino
= _get_vino(in
);
12161 ldout(cct
, 8) << __func__
<< " " << vino
<< dendl
;
12162 tout(cct
) << __func__
<< std::endl
;
12163 tout(cct
) << vino
.ino
.val
<< std::endl
;
12165 if (vino
.snapid
< CEPH_NOSNAP
)
12168 return _getattr(in
, caps
, perms
);
12171 int Client::ll_getattr(Inode
*in
, struct stat
*attr
, const UserPerm
& perms
)
12173 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12174 if (!mref_reader
.is_state_satisfied())
12175 return -CEPHFS_ENOTCONN
;
12177 std::scoped_lock
lock(client_lock
);
12179 int res
= _ll_getattr(in
, CEPH_STAT_CAP_INODE_ALL
, perms
);
12182 fill_stat(in
, attr
);
12183 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
12187 int Client::ll_getattrx(Inode
*in
, struct ceph_statx
*stx
, unsigned int want
,
12188 unsigned int flags
, const UserPerm
& perms
)
12190 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12191 if (!mref_reader
.is_state_satisfied())
12192 return -CEPHFS_ENOTCONN
;
12194 std::scoped_lock
lock(client_lock
);
12197 unsigned mask
= statx_to_mask(flags
, want
);
12199 if (mask
&& !in
->caps_issued_mask(mask
, true))
12200 res
= _ll_getattr(in
, mask
, perms
);
12203 fill_statx(in
, mask
, stx
);
12204 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
12208 int Client::_ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
12209 const UserPerm
& perms
, InodeRef
*inp
)
12211 vinodeno_t vino
= _get_vino(in
);
12213 ldout(cct
, 8) << __func__
<< " " << vino
<< " mask " << hex
<< mask
<< dec
12215 tout(cct
) << __func__
<< std::endl
;
12216 tout(cct
) << vino
.ino
.val
<< std::endl
;
12217 tout(cct
) << stx
->stx_mode
<< std::endl
;
12218 tout(cct
) << stx
->stx_uid
<< std::endl
;
12219 tout(cct
) << stx
->stx_gid
<< std::endl
;
12220 tout(cct
) << stx
->stx_size
<< std::endl
;
12221 tout(cct
) << stx
->stx_mtime
<< std::endl
;
12222 tout(cct
) << stx
->stx_atime
<< std::endl
;
12223 tout(cct
) << stx
->stx_btime
<< std::endl
;
12224 tout(cct
) << mask
<< std::endl
;
12226 if (!fuse_default_permissions
) {
12227 int res
= may_setattr(in
, stx
, mask
, perms
);
12232 mask
&= ~(CEPH_SETATTR_MTIME_NOW
| CEPH_SETATTR_ATIME_NOW
);
12234 return __setattrx(in
, stx
, mask
, perms
, inp
);
12237 int Client::ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
12238 const UserPerm
& perms
)
12240 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12241 if (!mref_reader
.is_state_satisfied())
12242 return -CEPHFS_ENOTCONN
;
12244 std::scoped_lock
lock(client_lock
);
12246 InodeRef
target(in
);
12247 int res
= _ll_setattrx(in
, stx
, mask
, perms
, &target
);
12249 ceph_assert(in
== target
.get());
12250 fill_statx(in
, in
->caps_issued(), stx
);
12253 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
12257 int Client::ll_setattr(Inode
*in
, struct stat
*attr
, int mask
,
12258 const UserPerm
& perms
)
12260 struct ceph_statx stx
;
12261 stat_to_statx(attr
, &stx
);
12263 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12264 if (!mref_reader
.is_state_satisfied())
12265 return -CEPHFS_ENOTCONN
;
12267 std::scoped_lock
lock(client_lock
);
12269 InodeRef
target(in
);
12270 int res
= _ll_setattrx(in
, &stx
, mask
, perms
, &target
);
12272 ceph_assert(in
== target
.get());
12273 fill_stat(in
, attr
);
12276 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
12284 int Client::getxattr(const char *path
, const char *name
, void *value
, size_t size
,
12285 const UserPerm
& perms
)
12287 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12288 if (!mref_reader
.is_state_satisfied())
12289 return -CEPHFS_ENOTCONN
;
12291 std::scoped_lock
lock(client_lock
);
12294 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
12297 return _getxattr(in
, name
, value
, size
, perms
);
12300 int Client::lgetxattr(const char *path
, const char *name
, void *value
, size_t size
,
12301 const UserPerm
& perms
)
12303 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12304 if (!mref_reader
.is_state_satisfied())
12305 return -CEPHFS_ENOTCONN
;
12307 std::scoped_lock
lock(client_lock
);
12310 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
12313 return _getxattr(in
, name
, value
, size
, perms
);
12316 int Client::fgetxattr(int fd
, const char *name
, void *value
, size_t size
,
12317 const UserPerm
& perms
)
12319 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12320 if (!mref_reader
.is_state_satisfied())
12321 return -CEPHFS_ENOTCONN
;
12323 std::scoped_lock
lock(client_lock
);
12325 Fh
*f
= get_filehandle(fd
);
12327 return -CEPHFS_EBADF
;
12328 return _getxattr(f
->inode
, name
, value
, size
, perms
);
12331 int Client::listxattr(const char *path
, char *list
, size_t size
,
12332 const UserPerm
& perms
)
12334 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12335 if (!mref_reader
.is_state_satisfied())
12336 return -CEPHFS_ENOTCONN
;
12338 std::scoped_lock
lock(client_lock
);
12341 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
12344 return Client::_listxattr(in
.get(), list
, size
, perms
);
12347 int Client::llistxattr(const char *path
, char *list
, size_t size
,
12348 const UserPerm
& perms
)
12350 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12351 if (!mref_reader
.is_state_satisfied())
12352 return -CEPHFS_ENOTCONN
;
12354 std::scoped_lock
lock(client_lock
);
12357 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
12360 return Client::_listxattr(in
.get(), list
, size
, perms
);
12363 int Client::flistxattr(int fd
, char *list
, size_t size
, const UserPerm
& perms
)
12365 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12366 if (!mref_reader
.is_state_satisfied())
12367 return -CEPHFS_ENOTCONN
;
12369 std::scoped_lock
lock(client_lock
);
12371 Fh
*f
= get_filehandle(fd
);
12373 return -CEPHFS_EBADF
;
12374 return Client::_listxattr(f
->inode
.get(), list
, size
, perms
);
12377 int Client::removexattr(const char *path
, const char *name
,
12378 const UserPerm
& perms
)
12380 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12381 if (!mref_reader
.is_state_satisfied())
12382 return -CEPHFS_ENOTCONN
;
12384 std::scoped_lock
lock(client_lock
);
12387 int r
= Client::path_walk(path
, &in
, perms
, true);
12390 return _removexattr(in
, name
, perms
);
12393 int Client::lremovexattr(const char *path
, const char *name
,
12394 const UserPerm
& perms
)
12396 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12397 if (!mref_reader
.is_state_satisfied())
12398 return -CEPHFS_ENOTCONN
;
12400 std::scoped_lock
lock(client_lock
);
12403 int r
= Client::path_walk(path
, &in
, perms
, false);
12406 return _removexattr(in
, name
, perms
);
12409 int Client::fremovexattr(int fd
, const char *name
, const UserPerm
& perms
)
12411 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12412 if (!mref_reader
.is_state_satisfied())
12413 return -CEPHFS_ENOTCONN
;
12415 std::scoped_lock
lock(client_lock
);
12417 Fh
*f
= get_filehandle(fd
);
12419 return -CEPHFS_EBADF
;
12420 return _removexattr(f
->inode
, name
, perms
);
12423 int Client::setxattr(const char *path
, const char *name
, const void *value
,
12424 size_t size
, int flags
, const UserPerm
& perms
)
12426 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12427 if (!mref_reader
.is_state_satisfied())
12428 return -CEPHFS_ENOTCONN
;
12430 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
12432 std::scoped_lock
lock(client_lock
);
12435 int r
= Client::path_walk(path
, &in
, perms
, true);
12438 return _setxattr(in
, name
, value
, size
, flags
, perms
);
12441 int Client::lsetxattr(const char *path
, const char *name
, const void *value
,
12442 size_t size
, int flags
, const UserPerm
& perms
)
12444 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12445 if (!mref_reader
.is_state_satisfied())
12446 return -CEPHFS_ENOTCONN
;
12448 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
12450 std::scoped_lock
lock(client_lock
);
12453 int r
= Client::path_walk(path
, &in
, perms
, false);
12456 return _setxattr(in
, name
, value
, size
, flags
, perms
);
12459 int Client::fsetxattr(int fd
, const char *name
, const void *value
, size_t size
,
12460 int flags
, const UserPerm
& perms
)
12462 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12463 if (!mref_reader
.is_state_satisfied())
12464 return -CEPHFS_ENOTCONN
;
12466 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
12468 std::scoped_lock
lock(client_lock
);
12470 Fh
*f
= get_filehandle(fd
);
12472 return -CEPHFS_EBADF
;
12473 return _setxattr(f
->inode
, name
, value
, size
, flags
, perms
);
12476 int Client::_getxattr(Inode
*in
, const char *name
, void *value
, size_t size
,
12477 const UserPerm
& perms
)
12480 const VXattr
*vxattr
= nullptr;
12482 vxattr
= _match_vxattr(in
, name
);
12484 r
= -CEPHFS_ENODATA
;
12486 // Do a force getattr to get the latest quota before returning
12487 // a value to userspace.
12489 if (vxattr
->flags
& VXATTR_RSTAT
) {
12490 flags
|= CEPH_STAT_RSTAT
;
12492 if (vxattr
->flags
& VXATTR_DIRSTAT
) {
12493 flags
|= CEPH_CAP_FILE_SHARED
;
12495 r
= _getattr(in
, flags
| CEPH_STAT_CAP_XATTR
, perms
, true);
12497 // Error from getattr!
12501 // call pointer-to-member function
12503 if (!(vxattr
->exists_cb
&& !(this->*(vxattr
->exists_cb
))(in
))) {
12504 r
= (this->*(vxattr
->getxattr_cb
))(in
, buf
, sizeof(buf
));
12506 r
= -CEPHFS_ENODATA
;
12510 if (r
> (int)size
) {
12511 r
= -CEPHFS_ERANGE
;
12512 } else if (r
> 0) {
12513 memcpy(value
, buf
, r
);
12519 if (!strncmp(name
, "ceph.", 5)) {
12520 r
= _getvxattr(in
, perms
, name
, size
, value
, MDS_RANK_NONE
);
12524 if (acl_type
== NO_ACL
&& !strncmp(name
, "system.", 7)) {
12525 r
= -CEPHFS_EOPNOTSUPP
;
12529 r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
12532 r
= -CEPHFS_ENODATA
;
12533 if (in
->xattrs
.count(n
)) {
12534 r
= in
->xattrs
[n
].length();
12535 if (r
> 0 && size
!= 0) {
12536 if (size
>= (unsigned)r
)
12537 memcpy(value
, in
->xattrs
[n
].c_str(), r
);
12539 r
= -CEPHFS_ERANGE
;
12544 ldout(cct
, 8) << "_getxattr(" << in
->ino
<< ", \"" << name
<< "\", " << size
<< ") = " << r
<< dendl
;
12548 int Client::_getxattr(InodeRef
&in
, const char *name
, void *value
, size_t size
,
12549 const UserPerm
& perms
)
12551 if (cct
->_conf
->client_permissions
) {
12552 int r
= xattr_permission(in
.get(), name
, MAY_READ
, perms
);
12556 return _getxattr(in
.get(), name
, value
, size
, perms
);
12559 int Client::ll_getxattr(Inode
*in
, const char *name
, void *value
,
12560 size_t size
, const UserPerm
& perms
)
12562 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12563 if (!mref_reader
.is_state_satisfied())
12564 return -CEPHFS_ENOTCONN
;
12566 vinodeno_t vino
= _get_vino(in
);
12568 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
12569 tout(cct
) << __func__
<< std::endl
;
12570 tout(cct
) << vino
.ino
.val
<< std::endl
;
12571 tout(cct
) << name
<< std::endl
;
12573 std::scoped_lock
lock(client_lock
);
12574 if (!fuse_default_permissions
) {
12575 int r
= xattr_permission(in
, name
, MAY_READ
, perms
);
12580 return _getxattr(in
, name
, value
, size
, perms
);
12583 int Client::_listxattr(Inode
*in
, char *name
, size_t size
,
12584 const UserPerm
& perms
)
12586 bool len_only
= (size
== 0);
12587 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
12593 for ([[maybe_unused
]] const auto &[xattr_name
, xattr_value_bl
] : in
->xattrs
) {
12594 if (xattr_name
.rfind("ceph.", 0) == 0) {
12598 size_t this_len
= xattr_name
.length() + 1;
12603 if (this_len
> size
) {
12604 r
= -CEPHFS_ERANGE
;
12608 memcpy(name
, xattr_name
.c_str(), this_len
);
12613 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ", " << size
<< ") = " << r
<< dendl
;
12617 int Client::ll_listxattr(Inode
*in
, char *names
, size_t size
,
12618 const UserPerm
& perms
)
12620 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12621 if (!mref_reader
.is_state_satisfied())
12622 return -CEPHFS_ENOTCONN
;
12624 vinodeno_t vino
= _get_vino(in
);
12626 ldout(cct
, 3) << __func__
<< " " << vino
<< " size " << size
<< dendl
;
12627 tout(cct
) << __func__
<< std::endl
;
12628 tout(cct
) << vino
.ino
.val
<< std::endl
;
12629 tout(cct
) << size
<< std::endl
;
12631 std::scoped_lock
lock(client_lock
);
12632 return _listxattr(in
, names
, size
, perms
);
12635 int Client::_do_setxattr(Inode
*in
, const char *name
, const void *value
,
12636 size_t size
, int flags
, const UserPerm
& perms
)
12639 int xattr_flags
= 0;
12641 xattr_flags
|= CEPH_XATTR_REMOVE
;
12642 if (flags
& XATTR_CREATE
)
12643 xattr_flags
|= CEPH_XATTR_CREATE
;
12644 if (flags
& XATTR_REPLACE
)
12645 xattr_flags
|= CEPH_XATTR_REPLACE
;
12647 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETXATTR
);
12649 in
->make_nosnap_relative_path(path
);
12650 req
->set_filepath(path
);
12651 req
->set_string2(name
);
12652 req
->set_inode(in
);
12653 req
->head
.args
.setxattr
.flags
= xattr_flags
;
12656 ceph_assert(value
|| size
== 0);
12657 bl
.append((const char*)value
, size
);
12660 int res
= make_request(req
, perms
);
12663 ldout(cct
, 3) << __func__
<< "(" << in
->ino
<< ", \"" << name
<< "\") = " <<
12668 int Client::_setxattr(Inode
*in
, const char *name
, const void *value
,
12669 size_t size
, int flags
, const UserPerm
& perms
)
12671 if (in
->snapid
!= CEPH_NOSNAP
) {
12672 return -CEPHFS_EROFS
;
12677 } else if (value
== NULL
) {
12678 return -CEPHFS_EINVAL
;
12681 bool posix_acl_xattr
= false;
12682 if (acl_type
== POSIX_ACL
)
12683 posix_acl_xattr
= !strncmp(name
, "system.", 7);
12685 if (strncmp(name
, "user.", 5) &&
12686 strncmp(name
, "security.", 9) &&
12687 strncmp(name
, "trusted.", 8) &&
12688 strncmp(name
, "ceph.", 5) &&
12690 return -CEPHFS_EOPNOTSUPP
;
12692 bool check_realm
= false;
12694 if (posix_acl_xattr
) {
12695 if (!strcmp(name
, ACL_EA_ACCESS
)) {
12696 mode_t new_mode
= in
->mode
;
12698 int ret
= posix_acl_equiv_mode(value
, size
, &new_mode
);
12705 if (new_mode
!= in
->mode
) {
12706 struct ceph_statx stx
;
12707 stx
.stx_mode
= new_mode
;
12708 ret
= _do_setattr(in
, &stx
, CEPH_SETATTR_MODE
, perms
, NULL
);
12713 } else if (!strcmp(name
, ACL_EA_DEFAULT
)) {
12715 if (!S_ISDIR(in
->mode
))
12716 return -CEPHFS_EACCES
;
12717 int ret
= posix_acl_check(value
, size
);
12719 return -CEPHFS_EINVAL
;
12726 return -CEPHFS_EOPNOTSUPP
;
12729 const VXattr
*vxattr
= _match_vxattr(in
, name
);
12731 if (vxattr
->readonly
)
12732 return -CEPHFS_EOPNOTSUPP
;
12733 if (vxattr
->name
.compare(0, 10, "ceph.quota") == 0 && value
)
12734 check_realm
= true;
12738 int ret
= _do_setxattr(in
, name
, value
, size
, flags
, perms
);
12739 if (ret
>= 0 && check_realm
) {
12740 // check if snaprealm was created for quota inode
12741 if (in
->quota
.is_enable() &&
12742 !(in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
))
12743 ret
= -CEPHFS_EOPNOTSUPP
;
12749 int Client::_setxattr(InodeRef
&in
, const char *name
, const void *value
,
12750 size_t size
, int flags
, const UserPerm
& perms
)
12752 if (cct
->_conf
->client_permissions
) {
12753 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
12757 return _setxattr(in
.get(), name
, value
, size
, flags
, perms
);
12760 int Client::_setxattr_check_data_pool(string
& name
, string
& value
, const OSDMap
*osdmap
)
12763 if (name
== "layout") {
12764 string::iterator begin
= value
.begin();
12765 string::iterator end
= value
.end();
12766 keys_and_values
<string::iterator
> p
; // create instance of parser
12767 std::map
<string
, string
> m
; // map to receive results
12768 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
12769 return -CEPHFS_EINVAL
;
12772 return -CEPHFS_EINVAL
;
12773 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
12774 if (q
->first
== "pool") {
12779 } else if (name
== "layout.pool") {
12783 if (tmp
.length()) {
12786 pool
= boost::lexical_cast
<unsigned>(tmp
);
12787 if (!osdmap
->have_pg_pool(pool
))
12788 return -CEPHFS_ENOENT
;
12789 } catch (boost::bad_lexical_cast
const&) {
12790 pool
= osdmap
->lookup_pg_pool_name(tmp
);
12792 return -CEPHFS_ENOENT
;
12800 void Client::_setxattr_maybe_wait_for_osdmap(const char *name
, const void *value
, size_t size
)
12802 // For setting pool of layout, MetaRequest need osdmap epoch.
12803 // There is a race which create a new data pool but client and mds both don't have.
12804 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
12805 ldout(cct
, 15) << __func__
<< ": name = " << name
<< dendl
;
12806 if (strcmp(name
, "ceph.file.layout.pool") == 0 || strcmp(name
, "ceph.dir.layout.pool") == 0 ||
12807 strcmp(name
, "ceph.file.layout") == 0 || strcmp(name
, "ceph.dir.layout") == 0) {
12808 string
rest(strstr(name
, "layout"));
12809 string
v((const char*)value
, size
);
12810 int r
= objecter
->with_osdmap([&](const OSDMap
& o
) {
12811 return _setxattr_check_data_pool(rest
, v
, &o
);
12814 if (r
== -CEPHFS_ENOENT
) {
12816 ldout(cct
, 20) << __func__
<< ": waiting for latest osdmap" << dendl
;
12817 objecter
->wait_for_latest_osdmap(ca::use_blocked
[ec
]);
12818 ldout(cct
, 20) << __func__
<< ": got latest osdmap: " << ec
<< dendl
;
12823 int Client::ll_setxattr(Inode
*in
, const char *name
, const void *value
,
12824 size_t size
, int flags
, const UserPerm
& perms
)
12826 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12827 if (!mref_reader
.is_state_satisfied())
12828 return -CEPHFS_ENOTCONN
;
12830 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
12832 vinodeno_t vino
= _get_vino(in
);
12834 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
12835 tout(cct
) << __func__
<< std::endl
;
12836 tout(cct
) << vino
.ino
.val
<< std::endl
;
12837 tout(cct
) << name
<< std::endl
;
12839 std::scoped_lock
lock(client_lock
);
12840 if (!fuse_default_permissions
) {
12841 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
12845 return _setxattr(in
, name
, value
, size
, flags
, perms
);
12848 int Client::_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
12850 if (in
->snapid
!= CEPH_NOSNAP
) {
12851 return -CEPHFS_EROFS
;
12854 // same xattrs supported by kernel client
12855 if (strncmp(name
, "user.", 5) &&
12856 strncmp(name
, "system.", 7) &&
12857 strncmp(name
, "security.", 9) &&
12858 strncmp(name
, "trusted.", 8) &&
12859 strncmp(name
, "ceph.", 5))
12860 return -CEPHFS_EOPNOTSUPP
;
12862 const VXattr
*vxattr
= _match_vxattr(in
, name
);
12863 if (vxattr
&& vxattr
->readonly
)
12864 return -CEPHFS_EOPNOTSUPP
;
12866 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_RMXATTR
);
12868 in
->make_nosnap_relative_path(path
);
12869 req
->set_filepath(path
);
12870 req
->set_filepath2(name
);
12871 req
->set_inode(in
);
12873 int res
= make_request(req
, perms
);
12876 ldout(cct
, 8) << "_removexattr(" << in
->ino
<< ", \"" << name
<< "\") = " << res
<< dendl
;
12880 int Client::_removexattr(InodeRef
&in
, const char *name
, const UserPerm
& perms
)
12882 if (cct
->_conf
->client_permissions
) {
12883 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
12887 return _removexattr(in
.get(), name
, perms
);
12890 int Client::ll_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
12892 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12893 if (!mref_reader
.is_state_satisfied())
12894 return -CEPHFS_ENOTCONN
;
12896 vinodeno_t vino
= _get_vino(in
);
12898 ldout(cct
, 3) << "ll_removexattr " << vino
<< " " << name
<< dendl
;
12899 tout(cct
) << "ll_removexattr" << std::endl
;
12900 tout(cct
) << vino
.ino
.val
<< std::endl
;
12901 tout(cct
) << name
<< std::endl
;
12903 std::scoped_lock
lock(client_lock
);
12904 if (!fuse_default_permissions
) {
12905 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
12910 return _removexattr(in
, name
, perms
);
12913 bool Client::_vxattrcb_quota_exists(Inode
*in
)
12915 return in
->quota
.is_enable() &&
12916 (in
->snapid
!= CEPH_NOSNAP
||
12917 (in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
));
12919 size_t Client::_vxattrcb_quota(Inode
*in
, char *val
, size_t size
)
12921 return snprintf(val
, size
,
12922 "max_bytes=%lld max_files=%lld",
12923 (long long int)in
->quota
.max_bytes
,
12924 (long long int)in
->quota
.max_files
);
12926 size_t Client::_vxattrcb_quota_max_bytes(Inode
*in
, char *val
, size_t size
)
12928 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_bytes
);
12930 size_t Client::_vxattrcb_quota_max_files(Inode
*in
, char *val
, size_t size
)
12932 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_files
);
12935 bool Client::_vxattrcb_layout_exists(Inode
*in
)
12937 return in
->layout
!= file_layout_t();
12939 size_t Client::_vxattrcb_layout(Inode
*in
, char *val
, size_t size
)
12941 int r
= snprintf(val
, size
,
12942 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
12943 (unsigned long long)in
->layout
.stripe_unit
,
12944 (unsigned long long)in
->layout
.stripe_count
,
12945 (unsigned long long)in
->layout
.object_size
);
12946 objecter
->with_osdmap([&](const OSDMap
& o
) {
12947 if (o
.have_pg_pool(in
->layout
.pool_id
))
12948 r
+= snprintf(val
+ r
, size
- r
, "%s",
12949 o
.get_pool_name(in
->layout
.pool_id
).c_str());
12951 r
+= snprintf(val
+ r
, size
- r
, "%" PRIu64
,
12952 (uint64_t)in
->layout
.pool_id
);
12954 if (in
->layout
.pool_ns
.length())
12955 r
+= snprintf(val
+ r
, size
- r
, " pool_namespace=%s",
12956 in
->layout
.pool_ns
.c_str());
12959 size_t Client::_vxattrcb_layout_stripe_unit(Inode
*in
, char *val
, size_t size
)
12961 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_unit
);
12963 size_t Client::_vxattrcb_layout_stripe_count(Inode
*in
, char *val
, size_t size
)
12965 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_count
);
12967 size_t Client::_vxattrcb_layout_object_size(Inode
*in
, char *val
, size_t size
)
12969 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.object_size
);
12971 size_t Client::_vxattrcb_layout_pool(Inode
*in
, char *val
, size_t size
)
12974 objecter
->with_osdmap([&](const OSDMap
& o
) {
12975 if (o
.have_pg_pool(in
->layout
.pool_id
))
12976 r
= snprintf(val
, size
, "%s", o
.get_pool_name(
12977 in
->layout
.pool_id
).c_str());
12979 r
= snprintf(val
, size
, "%" PRIu64
, (uint64_t)in
->layout
.pool_id
);
12983 size_t Client::_vxattrcb_layout_pool_namespace(Inode
*in
, char *val
, size_t size
)
12985 return snprintf(val
, size
, "%s", in
->layout
.pool_ns
.c_str());
12987 size_t Client::_vxattrcb_dir_entries(Inode
*in
, char *val
, size_t size
)
12989 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->dirstat
.nfiles
+ in
->dirstat
.nsubdirs
));
12991 size_t Client::_vxattrcb_dir_files(Inode
*in
, char *val
, size_t size
)
12993 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nfiles
);
12995 size_t Client::_vxattrcb_dir_subdirs(Inode
*in
, char *val
, size_t size
)
12997 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nsubdirs
);
12999 size_t Client::_vxattrcb_dir_rentries(Inode
*in
, char *val
, size_t size
)
13001 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->rstat
.rfiles
+ in
->rstat
.rsubdirs
));
13003 size_t Client::_vxattrcb_dir_rfiles(Inode
*in
, char *val
, size_t size
)
13005 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rfiles
);
13007 size_t Client::_vxattrcb_dir_rsubdirs(Inode
*in
, char *val
, size_t size
)
13009 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rsubdirs
);
13011 size_t Client::_vxattrcb_dir_rsnaps(Inode
*in
, char *val
, size_t size
)
13013 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rsnaps
);
13015 size_t Client::_vxattrcb_dir_rbytes(Inode
*in
, char *val
, size_t size
)
13017 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rbytes
);
13019 size_t Client::_vxattrcb_dir_rctime(Inode
*in
, char *val
, size_t size
)
13021 return snprintf(val
, size
, "%ld.%09ld", (long)in
->rstat
.rctime
.sec(),
13022 (long)in
->rstat
.rctime
.nsec());
13024 bool Client::_vxattrcb_dir_pin_exists(Inode
*in
)
13026 return in
->dir_pin
!= -CEPHFS_ENODATA
;
13028 size_t Client::_vxattrcb_dir_pin(Inode
*in
, char *val
, size_t size
)
13030 return snprintf(val
, size
, "%ld", (long)in
->dir_pin
);
13033 bool Client::_vxattrcb_snap_btime_exists(Inode
*in
)
13035 return !in
->snap_btime
.is_zero();
13038 size_t Client::_vxattrcb_snap_btime(Inode
*in
, char *val
, size_t size
)
13040 return snprintf(val
, size
, "%llu.%09lu",
13041 (long long unsigned)in
->snap_btime
.sec(),
13042 (long unsigned)in
->snap_btime
.nsec());
13045 size_t Client::_vxattrcb_caps(Inode
*in
, char *val
, size_t size
)
13049 in
->caps_issued(&issued
);
13050 return snprintf(val
, size
, "%s/0x%x", ccap_string(issued
).c_str(), issued
);
13053 bool Client::_vxattrcb_mirror_info_exists(Inode
*in
)
13055 // checking one of the xattrs would suffice
13056 return in
->xattrs
.count("ceph.mirror.info.cluster_id") != 0;
13059 size_t Client::_vxattrcb_mirror_info(Inode
*in
, char *val
, size_t size
)
13061 return snprintf(val
, size
, "cluster_id=%.*s fs_id=%.*s",
13062 in
->xattrs
["ceph.mirror.info.cluster_id"].length(),
13063 in
->xattrs
["ceph.mirror.info.cluster_id"].c_str(),
13064 in
->xattrs
["ceph.mirror.info.fs_id"].length(),
13065 in
->xattrs
["ceph.mirror.info.fs_id"].c_str());
13068 size_t Client::_vxattrcb_cluster_fsid(Inode
*in
, char *val
, size_t size
)
13070 return snprintf(val
, size
, "%s", monclient
->get_fsid().to_string().c_str());
13073 size_t Client::_vxattrcb_client_id(Inode
*in
, char *val
, size_t size
)
13075 auto name
= messenger
->get_myname();
13076 return snprintf(val
, size
, "%s%" PRId64
, name
.type_str(), name
.num());
13079 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
13080 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
13082 #define XATTR_NAME_CEPH(_type, _name, _flags) \
13084 name: CEPH_XATTR_NAME(_type, _name), \
13085 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
13090 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
13092 name: CEPH_XATTR_NAME2(_type, _name, _field), \
13093 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
13095 exists_cb: &Client::_vxattrcb_layout_exists, \
13098 #define XATTR_QUOTA_FIELD(_type, _name) \
13100 name: CEPH_XATTR_NAME(_type, _name), \
13101 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
13103 exists_cb: &Client::_vxattrcb_quota_exists, \
// Virtual xattrs exposed on directories.  _match_vxattr() scans this table
// linearly; the empty-name entry terminates it.
// NOTE(review): reformatted from a token-mangled extract; per-entry fields the
// extract dropped (upstream also sets readonly:/flags:) are omitted here —
// verify against the upstream file.
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    exists_cb: &Client::_vxattrcb_layout_exists,
  },
  // Delete the following dir layout field definitions for release "S"
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, files, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, subdirs, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rsnaps, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rctime, VXATTR_RSTAT),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    exists_cb: &Client::_vxattrcb_quota_exists,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  // Delete the following dir pin field definitions for release "S"
  {
    name: "ceph.dir.pin",
    getxattr_cb: &Client::_vxattrcb_dir_pin,
    exists_cb: &Client::_vxattrcb_dir_pin_exists,
  },
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
  },
  {
    name: "ceph.mirror.info",
    getxattr_cb: &Client::_vxattrcb_mirror_info,
    exists_cb: &Client::_vxattrcb_mirror_info_exists,
  },
  {
    // NOTE(review): the name: line of this entry was dropped by the extract;
    // upstream it is "ceph.caps" — verify.
    getxattr_cb: &Client::_vxattrcb_caps,
  },
  { name: "" } /* Required table terminator */
};
// Virtual xattrs exposed on regular files; same layout/terminator rules as
// _dir_vxattrs above.
// NOTE(review): reformatted from a token-mangled extract; per-entry fields the
// extract dropped (upstream also sets readonly:/flags:) are omitted — verify
// against the upstream file.
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    exists_cb: &Client::_vxattrcb_layout_exists,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
  },
  {
    // NOTE(review): the name: line of this entry was dropped by the extract;
    // upstream it is "ceph.caps" — verify.
    getxattr_cb: &Client::_vxattrcb_caps,
  },
  { name: "" } /* Required table terminator */
};
// Virtual xattrs available on every inode type; consulted by _match_vxattr()
// after the per-type (dir/file) table misses.
const Client::VXattr Client::_common_vxattrs[] = {
  {
    name: "ceph.cluster_fsid",
    getxattr_cb: &Client::_vxattrcb_cluster_fsid,
    exists_cb: nullptr,            // always present
  },
  {
    name: "ceph.client_id",
    getxattr_cb: &Client::_vxattrcb_client_id,
    exists_cb: nullptr,            // always present
  },
  { name: "" } /* Required table terminator */
};
// Select the per-type vxattr table for an inode: directories get
// _dir_vxattrs, regular files get _file_vxattrs.
// NOTE(review): reformatted from a token-mangled extract; the opening
// condition and the fall-through return were dropped by the extract and
// restored from the visible structure (dir test before the visible
// is_file() branch; NULL for other types) — verify upstream.
const Client::VXattr *Client::_get_vxattrs(Inode *in)
{
  if (in->is_dir())
    return _dir_vxattrs;
  else if (in->is_file())
    return _file_vxattrs;
  return NULL;
}
// Find the VXattr table entry matching a "ceph.*" xattr name for this inode,
// first in the per-type table, then in the common table.  Returns NULL when
// the name is not a ceph vxattr or no entry matches.
// NOTE(review): reformatted from a token-mangled extract; the match-return,
// increment, and final return NULL lines were dropped by the extract and
// restored from the visible loop structure — verify upstream.
const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
{
  if (strncmp(name, "ceph.", 5) == 0) {
    const VXattr *vxattr = _get_vxattrs(in);
    if (vxattr) {
      while (!vxattr->name.empty()) {
	if (vxattr->name == name)
	  return vxattr;
	vxattr++;
      }
    }

    // for common vxattrs
    vxattr = _common_vxattrs;
    while (!vxattr->name.empty()) {
      if (vxattr->name == name)
	return vxattr;
      vxattr++;
    }
  }

  return NULL;
}
// Low-level readlink: copy the symlink target of `in` into buf (up to
// buflen bytes).  Returns the byte count from _readlink() or a negative
// CEPHFS error; -CEPHFS_ENOTCONN if the client is not mounted/mounting.
// NOTE(review): reformatted from a token-mangled extract; the dentry-loop
// body and the final return were dropped by the extract — upstream touches
// each dentry (LRU) and returns r; verify before merging.
int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_readlink " << vino << dendl;
  tout(cct) << "ll_readlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  std::scoped_lock lock(client_lock);
  for (auto dn : in->dentries) {
    touch_dn(dn);   // presumably keeps the dentries warm in the LRU — confirm
  }

  int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
  ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
  return r;
}
// Create a device/special file `name` under `dir` via a CEPH_MDS_OP_MKNOD
// request.  On success *inp references the new inode.  Rejects over-long
// names (ENAMETOOLONG), snapshot dirs (EROFS) and exceeded file quotas
// (EDQUOT).  _posix_acl_create() may modify `mode` and produce default-ACL
// xattrs that are shipped with the request.
// NOTE(review): reformatted from a token-mangled extract; the error-path
// lines (goto fail / put_request) and local declarations were dropped by the
// extract and restored from the standard request pattern — verify upstream.
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -CEPHFS_EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  // drop/unless: let the MDS invalidate our dir caps as needed
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = "
		<< res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
// Low-level mknod wrapper: permission check (unless FUSE does default
// permissions), delegate to _mknod(), then fill `attr` and hand the new
// inode back through *out.
// NOTE(review): reformatted from a token-mangled extract; the success branch
// (r == 0 guard, _ll_get ref) and the trailing *out/return were dropped by
// the extract and restored from the pattern of the sibling ll_* calls —
// verify upstream.
int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
		     dev_t rdev, struct stat *attr, Inode **out,
		     const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknod" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  std::scoped_lock lock(client_lock);
  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());   // caller receives a referenced Inode* in *out
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mknod " << vparent << " " << name
	  << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
// statx variant of ll_mknod: same flow, but fills a ceph_statx using the
// caps mask derived from (flags, want).
// NOTE(review): reformatted from a token-mangled extract; the success branch
// and trailing *out/return were dropped by the extract and restored from the
// sibling ll_* pattern — verify upstream.
int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
		      dev_t rdev, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  unsigned caps = statx_to_mask(flags, want);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknodx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  std::scoped_lock lock(client_lock);

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_statx(in, caps, stx);
    _ll_get(in.get());   // caller receives a referenced Inode* in *out
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
	  << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
// Create-and-optionally-open: issue CEPH_MDS_OP_CREATE for `name` under
// `dir`, optionally selecting a data pool and striping parameters, then —
// if the caller passed fhp — open the new inode with the mode derived from
// the (wire-normalized) flags.  *created reports whether the MDS actually
// created the file.  Errors: ENAMETOOLONG, EROFS (snapshot dir), EDQUOT,
// EINVAL (unknown data pool), ERANGE (pool id too large for the wire field).
// NOTE(review): reformatted from a token-mangled extract; the error paths
// (goto fail / put_request), the pool-lookup failure guard, local
// declarations, and trim_cache were dropped by the extract and restored from
// the standard request pattern — verify upstream.
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms, std::string alternate_name)
{
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -CEPHFS_EDQUOT;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);

  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -CEPHFS_EINVAL;
    // the wire field for the pool is 32-bit
    if (pool_id > 0xffffffffll)
      return -CEPHFS_ERANGE;  // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_alternate_name(std::move(alternate_name));
  req->set_inode(dir);
  req->head.args.open.flags = cflags | CEPH_O_CREAT;

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if (fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
// Create a directory — or, when `dir` is the snapshot dir (CEPH_SNAPDIR), a
// snapshot (CEPH_MDS_OP_MKSNAP) carrying `metadata` in a SnapPayload.  On
// success *inp references the new inode.  Errors: ENAMETOOLONG, EROFS
// (snapshot of a snapshot), EDQUOT.
// NOTE(review): reformatted from a token-mangled extract; the error paths
// (goto fail / put_request), the is_snap_op guard around the SnapPayload,
// the bl.clear(), and trim_cache were dropped by the extract and restored
// from the standard request pattern — verify upstream.
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp, const std::map<std::string, std::string> &metadata,
		   std::string alternate_name)
{
  ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -CEPHFS_EDQUOT;
  }

  bool is_snap_op = dir->snapid == CEPH_SNAPDIR;
  MetaRequest *req = new MetaRequest(is_snap_op ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->set_alternate_name(std::move(alternate_name));

  bufferlist bl;
  int res = _posix_acl_create(dir, &mode, bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (is_snap_op) {
    SnapPayload payload;
    // clear the bufferlist that may have been populated by the call
    // to _posix_acl_create(). MDS mksnap does not make use of it.
    // So, reuse it to pass metadata payload.
    bl.clear();
    payload.metadata = metadata;
    encode(payload, bl);
  }
  if (bl.length() > 0) {
    req->set_data(bl);
  }

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec
		<< ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
// Low-level mkdir wrapper: permission check, delegate to _mkdir(), fill
// `attr`, and return the new (referenced) inode through *out.
// NOTE(review): reformatted from a token-mangled extract; the success branch
// and trailing *out/return were dropped by the extract and restored from the
// sibling ll_* pattern — verify upstream.
int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
		     struct stat *attr, Inode **out, const UserPerm& perm)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdir" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  std::scoped_lock lock(client_lock);

  if (!fuse_default_permissions) {
    int r = may_create(parent, perm);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perm, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
	  << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
// statx variant of ll_mkdir.  On failure the statx ino/mask fields are
// zeroed so callers don't read stale data.
// NOTE(review): reformatted from a token-mangled extract; the else branch
// zeroing stx and the trailing *out/return were dropped by the extract and
// restored from the sibling ll_* pattern — verify upstream.
int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdirx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  std::scoped_lock lock(client_lock);

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());
  } else {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
	  << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
// Create a symlink `name` -> `target` under `dir` via CEPH_MDS_OP_SYMLINK.
// On success *inp references the new inode.  Errors: ENAMETOOLONG, EROFS
// (snapshot dir), EDQUOT.
// NOTE(review): reformatted from a token-mangled extract; the error paths
// (goto fail / put_request), local declarations, and trim_cache were dropped
// by the extract and restored from the standard request pattern — verify
// upstream.
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, std::string alternate_name, InodeRef *inp)
{
  ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -CEPHFS_EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_alternate_name(std::move(alternate_name));
  req->set_inode(dir);
  req->set_string2(target);    // the link target rides in string2
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
// Low-level symlink wrapper: permission check, delegate to _symlink()
// (with empty alternate_name), fill `attr`, return the referenced inode in
// *out.
// NOTE(review): reformatted from a token-mangled extract; the success branch
// and trailing *out/return were dropped by the extract and restored from the
// sibling ll_* pattern — verify upstream.
int Client::ll_symlink(Inode *parent, const char *name, const char *value,
		       struct stat *attr, Inode **out, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
		<< dendl;
  tout(cct) << "ll_symlink" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  std::scoped_lock lock(client_lock);

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, "", &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_symlink " << vparent << " " << name
	  << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
// statx variant of ll_symlink.
// NOTE(review): reformatted from a token-mangled extract; the success branch
// and trailing *out/return were dropped by the extract and restored from the
// sibling ll_* pattern — verify upstream.
int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
			Inode **out, struct ceph_statx *stx, unsigned want,
			unsigned flags, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
		<< dendl;
  tout(cct) << "ll_symlinkx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  std::scoped_lock lock(client_lock);

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, "", &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
	  << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
// Unlink `name` from `dir` via CEPH_MDS_OP_UNLINK.  Looks up the victim
// inode first so its delegations can be broken and its caps dropped with the
// request.  EROFS on snapshot dirs.
// NOTE(review): reformatted from a token-mangled extract; the error paths
// (goto fail / put_request), local declarations, and trim_cache were dropped
// by the extract and restored from the standard request pattern — verify
// upstream.
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
		<< " uid " << perm.uid() << " gid " << perm.gid()
		<< ")" << dendl;

  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  InodeRef otherin;
  Inode *in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;

  in = otherin.get();
  req->set_other_inode(in);
  in->break_all_delegs();   // no delegations may survive an unlink
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
// Low-level unlink wrapper: optional permission check, then _unlink().
// NOTE(review): reformatted from a token-mangled extract; the permission
// early-return was dropped by the extract and restored from the sibling
// ll_* pattern — verify upstream.
int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
  tout(cct) << "ll_unlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);

  if (!fuse_default_permissions) {
    int r = may_delete(in, name, perm);
    if (r < 0)
      return r;
  }
  return _unlink(in, name, perm);
}
// Remove a directory — or a snapshot when `dir` is the snapdir
// (CEPH_MDS_OP_RMSNAP vs CEPH_MDS_OP_RMDIR).  For rmsnap the reply carries
// no trace dentry, so the local dentry is unlinked eagerly.
// NOTE(review): reformatted from a token-mangled extract; the error paths
// (goto fail / put_request), local declarations, the non-RMDIR dentry
// handling, and trim_cache were dropped by the extract and restored from the
// standard request pattern — verify upstream.
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
  ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
		<< perms.uid() << " gid " << perms.gid() << ")" << dendl;

  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -CEPHFS_EROFS;
  }

  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);

  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  InodeRef in;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  if (op == CEPH_MDS_OP_RMDIR)
    req->set_dentry(de);

  res = _lookup(dir, name, 0, &in, perms);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RMSNAP) {
    // renamesnap/rmsnap replies carry no trace dentry; drop ours now
    unlink(de, true, true);
  }
  req->set_other_inode(in.get());

  res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
// Low-level rmdir wrapper: optional permission check, then _rmdir().
// NOTE(review): reformatted from a token-mangled extract; the permission
// early-return was dropped by the extract and restored from the sibling
// ll_* pattern — verify upstream.
int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
  tout(cct) << "ll_rmdir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);

  if (!fuse_default_permissions) {
    int r = may_delete(in, name, perms);
    if (r < 0)
      return r;
  }
  return _rmdir(in, name, perms);
}
// Rename fromdir/fromname -> todir/toname (CEPH_MDS_OP_RENAME), or rename a
// snapshot when both dirs are the same snapdir (CEPH_MDS_OP_RENAMESNAP).
// Cross-snapshot and cross-quota-root renames are refused with EXDEV; any
// other snapshot rename with EROFS.  Delegations on the source (and an
// existing target) are broken before the request is sent.
// NOTE(review): reformatted from a token-mangled extract; the error paths
// (goto fail / put_request), the switch scaffolding around the target
// lookup, the renamesnap else-branch, and trim_cache were dropped by the
// extract and restored from the standard request pattern — verify upstream.
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm, std::string alternate_name)
{
  ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
		<< todir->ino << " " << toname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")"
		<< dendl;

  if (fromdir->snapid != todir->snapid)
    return -CEPHFS_EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -CEPHFS_EROFS;
  }
  // a rename must not cross quota roots: the MDS cannot atomically account
  // the file against two different quota trees
  if (cct->_conf.get_val<bool>("client_quota") && fromdir != todir) {
    Inode *fromdir_root =
      fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
    Inode *todir_root =
      todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
    if (fromdir_root != todir_root) {
      return -CEPHFS_EXDEV;
    }
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);
  req->set_alternate_name(std::move(alternate_name));

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    res = _lookup(fromdir, fromname, 0, &oldin, perm);
    if (res < 0)
      goto fail;

    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    // the destination name may or may not already exist
    res = _lookup(todir, toname, 0, &otherin, perm);
    switch (res) {
    case 0:
      {
	Inode *in = otherin.get();
	req->set_other_inode(in);
	in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -CEPHFS_ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // the dentries ourselves
    unlink(oldde, true, true);
    unlink(de, true, true);

    req->set_inode(todir);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
// Low-level rename wrapper: may_delete() on the source, may_delete() on the
// destination (ENOENT there is fine — the destination need not exist), then
// _rename() with no alternate_name.
// NOTE(review): reformatted from a token-mangled extract; the permission
// early-returns were dropped by the extract and restored from the sibling
// ll_* pattern — verify upstream.
int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
		      const char *newname, const UserPerm& perm)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);
  vinodeno_t vnewparent = _get_vino(newparent);

  ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
	  << vnewparent << " " << newname << dendl;
  tout(cct) << "ll_rename" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << vnewparent.ino.val << std::endl;
  tout(cct) << newname << std::endl;

  std::scoped_lock lock(client_lock);

  if (!fuse_default_permissions) {
    int r = may_delete(parent, name, perm);
    if (r < 0)
      return r;
    r = may_delete(newparent, newname, perm);
    if (r < 0 && r != -CEPHFS_ENOENT)
      return r;
  }

  return _rename(parent, name, newparent, newname, perm, "");
}
// Hard-link inode `in` into `dir` as `newname` via CEPH_MDS_OP_LINK.
// Delegations on `in` are broken first.  Errors: ENAMETOOLONG, EROFS
// (either inode is in a snapshot), EDQUOT.
// NOTE(review): reformatted from a token-mangled extract; the error paths
// (goto fail / put_request) and trim_cache were dropped by the extract and
// restored from the standard request pattern — verify upstream.
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, std::string alternate_name, InodeRef *inp)
{
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -CEPHFS_EDQUOT;
  }

  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);    // the new name…
  req->set_filepath(path);
  req->set_alternate_name(std::move(alternate_name));
  filepath existing(in->ino);          // …and the inode being linked
  req->set_filepath2(existing);

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
// Low-level link wrapper: refuse hard-linking directories (EPERM), check
// may_hardlink on the source and may_create on the destination dir, then
// delegate to _link().
// NOTE(review): reformatted from a token-mangled extract; the permission
// early-returns and the InodeRef declaration were dropped by the extract and
// restored from the sibling ll_* pattern — verify upstream.
int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
		    const UserPerm& perm)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  vinodeno_t vnewparent = _get_vino(newparent);

  ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
    newname << dendl;
  tout(cct) << "ll_link" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << vnewparent << std::endl;
  tout(cct) << newname << std::endl;

  InodeRef target;

  std::scoped_lock lock(client_lock);

  if (!fuse_default_permissions) {
    if (S_ISDIR(in->mode))
      return -CEPHFS_EPERM;

    int r = may_hardlink(in, perm);
    if (r < 0)
      return r;

    r = may_create(newparent, perm);
    if (r < 0)
      return r;
  }

  return _link(in, newparent, newname, perm, "", &target);
}
// Number of OSDs in the current OSDMap.
int Client::ll_num_osds(void)
{
  std::scoped_lock lock(client_lock);
  return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
}
// Fetch an OSD's first (IPv4) address as a host-order uint32 into *addr.
// NOTE(review): reformatted from a token-mangled extract; the lambda's
// boolean returns, the declaration of `g`, and the !exists early-return were
// dropped by the extract and restored from the visible usage (`exists` flag,
// g.in4_addr()) — verify upstream, including the exact failure return value.
int Client::ll_osdaddr(int osd, uint32_t *addr)
{
  std::scoped_lock lock(client_lock);

  entity_addr_t g;
  bool exists = objecter->with_osdmap([&](const OSDMap& o) {
      if (!o.exists(osd))
	return false;
      g = o.get_addrs(osd).front();
      return true;
    });
  if (!exists)
    return -1;
  uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;  // network byte order
  *addr = ntohl(nb_addr);
  return 0;
}
// Stripe unit (bytes) of the inode's file layout.
uint32_t Client::ll_stripe_unit(Inode *in)
{
  std::scoped_lock lock(client_lock);
  return in->layout.stripe_unit;
}
// Snapshot sequence number of the inode's snap realm.
// NOTE(review): dereferences in->snaprealm unconditionally — presumably the
// caller guarantees the inode has a realm; confirm.
uint64_t Client::ll_snap_seq(Inode *in)
{
  std::scoped_lock lock(client_lock);
  return in->snaprealm->seq;
}
// Copy the inode's file layout into *layout.  Always succeeds (returns 0).
int Client::ll_file_layout(Inode *in, file_layout_t *layout)
{
  std::scoped_lock lock(client_lock);
  *layout = in->layout;
  return 0;
}
// Fh convenience overload: forwards to the Inode* version above.
int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
{
  return ll_file_layout(fh->inode.get(), layout);
}
/* Currently we cannot take advantage of redundancy in reads, since we
   would have to go through all possible placement groups (a
   potentially quite large number determined by a hash), and use CRUSH
   to calculate the appropriate set of OSDs for each placement group,
   then index into that. An array with one entry per OSD is much more
   tractable and works for demonstration purposes. */

// Map a file block number to the primary-ish OSD serving it: compute the
// object holding the block from the layout's striping parameters, map that
// object to a PG, and return the first acting OSD.
// NOTE(review): reformatted from a token-mangled extract; the osds/primary
// declarations and the lambda's return were dropped by the extract and
// restored from the visible pg_to_acting_osds usage — verify upstream.
int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
			      file_layout_t* layout)
{
  std::scoped_lock lock(client_lock);

  inodeno_t ino = in->ino;
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint32_t stripe_count = layout->stripe_count;
  uint64_t stripes_per_object = object_size / su;
  uint64_t stripeno = 0, stripepos = 0;

  if (stripe_count) {    // stripe_count == 0 would divide by zero
    stripeno = blockno / stripe_count;    // which horizontal stripe        (Y)
    stripepos = blockno % stripe_count;   // which object in the object set (X)
  }
  uint64_t objectsetno = stripeno / stripes_per_object;        // which object set
  uint64_t objectno = objectsetno * stripe_count + stripepos;  // object id

  object_t oid = file_object_t(ino, objectno);
  return objecter->with_osdmap([&](const OSDMap& o) {
      ceph_object_layout olayout =
	o.file_to_object_layout(oid, *layout);
      pg_t pg = (pg_t)olayout.ol_pgid;

      vector<int> osds;
      int primary;
      o.pg_to_acting_osds(pg, &osds, &primary);
      return osds[0];
    });
}
/* Return the offset of the block, internal to the object */

// blockno % stripes_per_object gives the stripe-unit index inside the
// object; multiplying by the stripe unit converts it to a byte offset.
uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
{
  std::scoped_lock lock(client_lock);
  file_layout_t *layout = &(in->layout);
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint64_t stripes_per_object = object_size / su;

  return (blockno % stripes_per_object) * su;
}
// Low-level opendir: permission check, then _opendir() which allocates the
// dir_result_t returned through *dirpp.
// NOTE(review): reformatted from a token-mangled extract; the permission
// early-return and the final return were dropped by the extract and restored
// from the sibling ll_* pattern — verify upstream.
int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
		       const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_opendir " << vino << dendl;
  tout(cct) << "ll_opendir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  std::scoped_lock lock(client_lock);

  if (!fuse_default_permissions) {
    int r = may_open(in, flags, perms);
    if (r < 0)
      return r;
  }

  int r = _opendir(in, dirpp, perms);
  tout(cct) << (uintptr_t)*dirpp << std::endl;

  ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
		<< dendl;
  return r;
}
// Low-level closedir: release the dir_result_t.
// NOTE(review): reformatted from a token-mangled extract; the _closedir call
// and return were dropped by the extract and restored from the function's
// obvious contract — verify upstream.
int Client::ll_releasedir(dir_result_t *dirp)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
  tout(cct) << "ll_releasedir" << std::endl;
  tout(cct) << (uintptr_t)dirp << std::endl;

  std::scoped_lock lock(client_lock);
  _closedir(dirp);
  return 0;
}
// Low-level fsync of a directory: flush the directory inode's metadata
// (syncdataonly = false).
int Client::ll_fsyncdir(dir_result_t *dirp)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
  tout(cct) << "ll_fsyncdir" << std::endl;
  tout(cct) << (uintptr_t)dirp << std::endl;

  std::scoped_lock lock(client_lock);
  return _fsync(dirp->inode.get(), false);
}
// Low-level open of an existing inode.  O_CREAT is asserted away — creation
// goes through ll_create*/_ll_create.  On success the new Fh is tracked in
// ll_unclosed_fh_set so leaked handles can be found at unmount.
// NOTE(review): reformatted from a token-mangled extract; the goto-out
// error scaffolding and the fhptr null-guard were dropped by the extract and
// restored from the visible control flow — verify upstream.
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  ceph_assert(!(flags & O_CREAT));

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  std::scoped_lock lock(client_lock);

  int r;
  if (!fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

 out:
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (uintptr_t)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
      " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
// Shared implementation behind ll_create/ll_createx: lookup first; EEXIST
// for O_CREAT|O_EXCL on a hit; _create() on an O_CREAT miss; for an existing
// file re-check may_open and _open() if _create didn't already hand back an
// Fh.  Any Fh produced is tracked in ll_unclosed_fh_set.
// NOTE(review): reformatted from a token-mangled extract; a large amount of
// control-flow scaffolding (goto out error paths, the !created branch, the
// ino bookkeeping guards) was dropped by the extract and restored from the
// visible fragments and the sibling ll_open pattern — verify carefully
// against upstream before merging.
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -CEPHFS_EEXIST;

  if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
    if (!fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms, "");
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // existing file: the lookup hit means caps may be insufficient for the
    // requested open mode, so re-check and open explicitly
    if (!fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  ceph_assert(release_r == 0);  // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

 out:
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (uintptr_t)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
14422 int Client::ll_create(Inode
*parent
, const char *name
, mode_t mode
,
14423 int flags
, struct stat
*attr
, Inode
**outp
, Fh
**fhp
,
14424 const UserPerm
& perms
)
14426 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14427 if (!mref_reader
.is_state_satisfied())
14428 return -CEPHFS_ENOTCONN
;
14430 std::scoped_lock
lock(client_lock
);
14433 int r
= _ll_create(parent
, name
, mode
, flags
, &in
, CEPH_STAT_CAP_INODE_ALL
,
14438 // passing an Inode in outp requires an additional ref
14443 fill_stat(in
, attr
);
14451 int Client::ll_createx(Inode
*parent
, const char *name
, mode_t mode
,
14452 int oflags
, Inode
**outp
, Fh
**fhp
,
14453 struct ceph_statx
*stx
, unsigned want
, unsigned lflags
,
14454 const UserPerm
& perms
)
14456 unsigned caps
= statx_to_mask(lflags
, want
);
14457 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14458 if (!mref_reader
.is_state_satisfied())
14459 return -CEPHFS_ENOTCONN
;
14461 std::scoped_lock
lock(client_lock
);
14464 int r
= _ll_create(parent
, name
, mode
, oflags
, &in
, caps
, fhp
, perms
);
14468 // passing an Inode in outp requires an additional ref
14473 fill_statx(in
, caps
, stx
);
14482 loff_t
Client::ll_lseek(Fh
*fh
, loff_t offset
, int whence
)
14484 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14485 if (!mref_reader
.is_state_satisfied())
14486 return -CEPHFS_ENOTCONN
;
14488 tout(cct
) << "ll_lseek" << std::endl
;
14489 tout(cct
) << offset
<< std::endl
;
14490 tout(cct
) << whence
<< std::endl
;
14492 std::scoped_lock
lock(client_lock
);
14493 return _lseek(fh
, offset
, whence
);
14496 int Client::ll_read(Fh
*fh
, loff_t off
, loff_t len
, bufferlist
*bl
)
14498 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14499 if (!mref_reader
.is_state_satisfied())
14500 return -CEPHFS_ENOTCONN
;
14502 ldout(cct
, 3) << "ll_read " << fh
<< " " << fh
->inode
->ino
<< " " << " " << off
<< "~" << len
<< dendl
;
14503 tout(cct
) << "ll_read" << std::endl
;
14504 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14505 tout(cct
) << off
<< std::endl
;
14506 tout(cct
) << len
<< std::endl
;
14508 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14509 len
= std::min(len
, (loff_t
)INT_MAX
);
14510 std::scoped_lock
lock(client_lock
);
14512 int r
= _read(fh
, off
, len
, bl
);
14513 ldout(cct
, 3) << "ll_read " << fh
<< " " << off
<< "~" << len
<< " = " << r
14518 int Client::ll_read_block(Inode
*in
, uint64_t blockid
,
14522 file_layout_t
* layout
)
14524 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14525 if (!mref_reader
.is_state_satisfied())
14526 return -CEPHFS_ENOTCONN
;
14528 vinodeno_t vino
= _get_vino(in
);
14529 object_t oid
= file_object_t(vino
.ino
, blockid
);
14530 C_SaferCond onfinish
;
14533 objecter
->read(oid
,
14534 object_locator_t(layout
->pool_id
),
14539 CEPH_OSD_FLAG_READ
,
14542 int r
= onfinish
.wait();
14544 bl
.begin().copy(bl
.length(), buf
);
14551 /* It appears that the OSD doesn't return success unless the entire
14552 buffer was written, return the write length on success. */
14554 int Client::ll_write_block(Inode
*in
, uint64_t blockid
,
14555 char* buf
, uint64_t offset
,
14556 uint64_t length
, file_layout_t
* layout
,
14557 uint64_t snapseq
, uint32_t sync
)
14559 vinodeno_t vino
= ll_get_vino(in
);
14561 std::unique_ptr
<C_SaferCond
> onsafe
= nullptr;
14563 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14564 if (!mref_reader
.is_state_satisfied())
14565 return -CEPHFS_ENOTCONN
;
14568 return -CEPHFS_EINVAL
;
14570 if (true || sync
) {
14571 /* if write is stable, the epilogue is waiting on
14573 onsafe
.reset(new C_SaferCond("Client::ll_write_block flock"));
14575 object_t oid
= file_object_t(vino
.ino
, blockid
);
14576 SnapContext fakesnap
;
14577 ceph::bufferlist bl
;
14579 bl
.push_back(buffer::copy(buf
, length
));
14582 ldout(cct
, 1) << "ll_block_write for " << vino
.ino
<< "." << blockid
14585 fakesnap
.seq
= snapseq
;
14587 /* lock just in time */
14588 objecter
->write(oid
,
14589 object_locator_t(layout
->pool_id
),
14594 ceph::real_clock::now(),
14598 if (nullptr != onsafe
) {
14599 r
= onsafe
->wait();
14609 int Client::ll_commit_blocks(Inode
*in
,
14614 BarrierContext *bctx;
14615 vinodeno_t vino = _get_vino(in);
14616 uint64_t ino = vino.ino;
14618 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
14619 << offset << " to " << length << dendl;
14622 return -CEPHFS_EINVAL;
14625 std::scoped_lock lock(client_lock);
14626 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
14627 if (p != barriers.end()) {
14628 barrier_interval civ(offset, offset + length);
14629 p->second->commit_barrier(civ);
14635 int Client::ll_write(Fh
*fh
, loff_t off
, loff_t len
, const char *data
)
14637 ldout(cct
, 3) << "ll_write " << fh
<< " " << fh
->inode
->ino
<< " " << off
<<
14638 "~" << len
<< dendl
;
14639 tout(cct
) << "ll_write" << std::endl
;
14640 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14641 tout(cct
) << off
<< std::endl
;
14642 tout(cct
) << len
<< std::endl
;
14644 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14645 if (!mref_reader
.is_state_satisfied())
14646 return -CEPHFS_ENOTCONN
;
14648 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14649 len
= std::min(len
, (loff_t
)INT_MAX
);
14650 std::scoped_lock
lock(client_lock
);
14652 int r
= _write(fh
, off
, len
, data
, NULL
, 0);
14653 ldout(cct
, 3) << "ll_write " << fh
<< " " << off
<< "~" << len
<< " = " << r
14658 int64_t Client::ll_writev(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
14660 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14661 if (!mref_reader
.is_state_satisfied())
14662 return -CEPHFS_ENOTCONN
;
14664 std::scoped_lock
cl(client_lock
);
14665 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, true, false);
14668 int64_t Client::ll_readv(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
14670 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14671 if (!mref_reader
.is_state_satisfied())
14672 return -CEPHFS_ENOTCONN
;
14674 std::scoped_lock
cl(client_lock
);
14675 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, false, false);
14678 int Client::ll_flush(Fh
*fh
)
14680 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14681 if (!mref_reader
.is_state_satisfied())
14682 return -CEPHFS_ENOTCONN
;
14684 ldout(cct
, 3) << "ll_flush " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
14685 tout(cct
) << "ll_flush" << std::endl
;
14686 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14688 std::scoped_lock
lock(client_lock
);
14692 int Client::ll_fsync(Fh
*fh
, bool syncdataonly
)
14694 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14695 if (!mref_reader
.is_state_satisfied())
14696 return -CEPHFS_ENOTCONN
;
14698 ldout(cct
, 3) << "ll_fsync " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
14699 tout(cct
) << "ll_fsync" << std::endl
;
14700 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14702 std::scoped_lock
lock(client_lock
);
14703 int r
= _fsync(fh
, syncdataonly
);
14705 // If we're returning an error, clear it from the FH
14706 fh
->take_async_err();
14711 int Client::ll_sync_inode(Inode
*in
, bool syncdataonly
)
14713 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14714 if (!mref_reader
.is_state_satisfied())
14715 return -CEPHFS_ENOTCONN
;
14717 ldout(cct
, 3) << "ll_sync_inode " << *in
<< " " << dendl
;
14718 tout(cct
) << "ll_sync_inode" << std::endl
;
14719 tout(cct
) << (uintptr_t)in
<< std::endl
;
14721 std::scoped_lock
lock(client_lock
);
14722 return _fsync(in
, syncdataonly
);
14725 int Client::_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
14727 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
14729 if (offset
< 0 || length
<= 0)
14730 return -CEPHFS_EINVAL
;
14732 if (mode
& ~(FALLOC_FL_KEEP_SIZE
| FALLOC_FL_PUNCH_HOLE
))
14733 return -CEPHFS_EOPNOTSUPP
;
14735 if ((mode
& FALLOC_FL_PUNCH_HOLE
) && !(mode
& FALLOC_FL_KEEP_SIZE
))
14736 return -CEPHFS_EOPNOTSUPP
;
14738 Inode
*in
= fh
->inode
.get();
14740 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
) &&
14741 !(mode
& FALLOC_FL_PUNCH_HOLE
)) {
14742 return -CEPHFS_ENOSPC
;
14745 if (in
->snapid
!= CEPH_NOSNAP
)
14746 return -CEPHFS_EROFS
;
14748 if ((fh
->mode
& CEPH_FILE_MODE_WR
) == 0)
14749 return -CEPHFS_EBADF
;
14751 uint64_t size
= offset
+ length
;
14752 if (!(mode
& (FALLOC_FL_PUNCH_HOLE
| FALLOC_FL_KEEP_SIZE
)) &&
14754 is_quota_bytes_exceeded(in
, size
- in
->size
, fh
->actor_perms
)) {
14755 return -CEPHFS_EDQUOT
;
14759 int r
= get_caps(fh
, CEPH_CAP_FILE_WR
, CEPH_CAP_FILE_BUFFER
, &have
, -1);
14763 std::unique_ptr
<C_SaferCond
> onuninline
= nullptr;
14764 if (mode
& FALLOC_FL_PUNCH_HOLE
) {
14765 if (in
->inline_version
< CEPH_INLINE_NONE
&&
14766 (have
& CEPH_CAP_FILE_BUFFER
)) {
14768 auto inline_iter
= in
->inline_data
.cbegin();
14769 int len
= in
->inline_data
.length();
14770 if (offset
< len
) {
14772 inline_iter
.copy(offset
, bl
);
14774 if (offset
+ size
> len
)
14775 size
= len
- offset
;
14777 bl
.append_zero(size
);
14778 if (offset
+ size
< len
) {
14779 inline_iter
+= size
;
14780 inline_iter
.copy(len
- offset
- size
, bl
);
14782 in
->inline_data
= bl
;
14783 in
->inline_version
++;
14785 in
->mtime
= in
->ctime
= ceph_clock_now();
14787 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
14789 if (in
->inline_version
< CEPH_INLINE_NONE
) {
14790 onuninline
.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
14791 uninline_data(in
, onuninline
.get());
14794 C_SaferCond
onfinish("Client::_punch_hole flock");
14796 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
14798 _invalidate_inode_cache(in
, offset
, length
);
14799 filer
->zero(in
->ino
, &in
->layout
,
14800 in
->snaprealm
->get_snap_context(),
14802 ceph::real_clock::now(),
14803 0, true, &onfinish
);
14804 in
->mtime
= in
->ctime
= ceph_clock_now();
14806 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
14808 client_lock
.unlock();
14810 client_lock
.lock();
14811 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
14813 } else if (!(mode
& FALLOC_FL_KEEP_SIZE
)) {
14814 uint64_t size
= offset
+ length
;
14815 if (size
> in
->size
) {
14817 in
->mtime
= in
->ctime
= ceph_clock_now();
14819 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
14821 if (is_quota_bytes_approaching(in
, fh
->actor_perms
)) {
14822 check_caps(in
, CHECK_CAPS_NODELAY
);
14823 } else if (is_max_size_approaching(in
)) {
14829 if (nullptr != onuninline
) {
14830 client_lock
.unlock();
14831 int ret
= onuninline
->wait();
14832 client_lock
.lock();
14834 if (ret
>= 0 || ret
== -CEPHFS_ECANCELED
) {
14835 in
->inline_data
.clear();
14836 in
->inline_version
= CEPH_INLINE_NONE
;
14837 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
14843 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
14847 int Client::ll_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
14849 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14850 if (!mref_reader
.is_state_satisfied())
14851 return -CEPHFS_ENOTCONN
;
14853 ldout(cct
, 3) << __func__
<< " " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
14854 tout(cct
) << __func__
<< " " << mode
<< " " << offset
<< " " << length
<< std::endl
;
14855 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14857 std::scoped_lock
lock(client_lock
);
14858 return _fallocate(fh
, mode
, offset
, length
);
14861 int Client::fallocate(int fd
, int mode
, loff_t offset
, loff_t length
)
14863 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14864 if (!mref_reader
.is_state_satisfied())
14865 return -CEPHFS_ENOTCONN
;
14867 tout(cct
) << __func__
<< " " << " " << fd
<< mode
<< " " << offset
<< " " << length
<< std::endl
;
14869 std::scoped_lock
lock(client_lock
);
14870 Fh
*fh
= get_filehandle(fd
);
14872 return -CEPHFS_EBADF
;
14873 #if defined(__linux__) && defined(O_PATH)
14874 if (fh
->flags
& O_PATH
)
14875 return -CEPHFS_EBADF
;
14877 return _fallocate(fh
, mode
, offset
, length
);
14880 int Client::ll_release(Fh
*fh
)
14882 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14883 if (!mref_reader
.is_state_satisfied())
14884 return -CEPHFS_ENOTCONN
;
14886 ldout(cct
, 3) << __func__
<< " (fh)" << fh
<< " " << fh
->inode
->ino
<< " " <<
14888 tout(cct
) << __func__
<< " (fh)" << std::endl
;
14889 tout(cct
) << (uintptr_t)fh
<< std::endl
;
14891 std::scoped_lock
lock(client_lock
);
14893 if (ll_unclosed_fh_set
.count(fh
))
14894 ll_unclosed_fh_set
.erase(fh
);
14895 return _release_fh(fh
);
14898 int Client::ll_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
14900 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14901 if (!mref_reader
.is_state_satisfied())
14902 return -CEPHFS_ENOTCONN
;
14904 ldout(cct
, 3) << "ll_getlk (fh)" << fh
<< " " << fh
->inode
->ino
<< dendl
;
14905 tout(cct
) << "ll_getk (fh)" << (uintptr_t)fh
<< std::endl
;
14907 std::scoped_lock
lock(client_lock
);
14908 return _getlk(fh
, fl
, owner
);
14911 int Client::ll_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
14913 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14914 if (!mref_reader
.is_state_satisfied())
14915 return -CEPHFS_ENOTCONN
;
14917 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
14918 tout(cct
) << __func__
<< " (fh)" << (uintptr_t)fh
<< std::endl
;
14920 std::scoped_lock
lock(client_lock
);
14921 return _setlk(fh
, fl
, owner
, sleep
);
14924 int Client::ll_flock(Fh
*fh
, int cmd
, uint64_t owner
)
14926 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14927 if (!mref_reader
.is_state_satisfied())
14928 return -CEPHFS_ENOTCONN
;
14930 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
14931 tout(cct
) << __func__
<< " (fh)" << (uintptr_t)fh
<< std::endl
;
14933 std::scoped_lock
lock(client_lock
);
14934 return _flock(fh
, cmd
, owner
);
14937 int Client::set_deleg_timeout(uint32_t timeout
)
14939 std::scoped_lock
lock(client_lock
);
14942 * The whole point is to prevent blocklisting so we must time out the
14943 * delegation before the session autoclose timeout kicks in.
14945 if (timeout
>= mdsmap
->get_session_autoclose())
14946 return -CEPHFS_EINVAL
;
14948 deleg_timeout
= timeout
;
14952 int Client::ll_delegation(Fh
*fh
, unsigned cmd
, ceph_deleg_cb_t cb
, void *priv
)
14954 int ret
= -CEPHFS_EINVAL
;
14956 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14957 if (!mref_reader
.is_state_satisfied())
14958 return -CEPHFS_ENOTCONN
;
14960 std::scoped_lock
lock(client_lock
);
14962 Inode
*inode
= fh
->inode
.get();
14965 case CEPH_DELEGATION_NONE
:
14966 inode
->unset_deleg(fh
);
14971 ret
= inode
->set_deleg(fh
, cmd
, cb
, priv
);
14972 } catch (std::bad_alloc
&) {
14973 ret
= -CEPHFS_ENOMEM
;
14980 class C_Client_RequestInterrupt
: public Context
{
14985 C_Client_RequestInterrupt(Client
*c
, MetaRequest
*r
) : client(c
), req(r
) {
14988 void finish(int r
) override
{
14989 std::scoped_lock
l(client
->client_lock
);
14990 ceph_assert(req
->head
.op
== CEPH_MDS_OP_SETFILELOCK
);
14991 client
->_interrupt_filelock(req
);
14992 client
->put_request(req
);
14996 void Client::ll_interrupt(void *d
)
14998 MetaRequest
*req
= static_cast<MetaRequest
*>(d
);
14999 ldout(cct
, 3) << __func__
<< " tid " << req
->get_tid() << dendl
;
15000 tout(cct
) << __func__
<< " tid " << req
->get_tid() << std::endl
;
15001 interrupt_finisher
.queue(new C_Client_RequestInterrupt(this, req
));
15004 // =========================================
15007 // expose file layouts
15009 int Client::describe_layout(const char *relpath
, file_layout_t
*lp
,
15010 const UserPerm
& perms
)
15012 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15013 if (!mref_reader
.is_state_satisfied())
15014 return -CEPHFS_ENOTCONN
;
15016 std::scoped_lock
lock(client_lock
);
15018 filepath
path(relpath
);
15020 int r
= path_walk(path
, &in
, perms
);
15026 ldout(cct
, 3) << __func__
<< "(" << relpath
<< ") = 0" << dendl
;
15030 int Client::fdescribe_layout(int fd
, file_layout_t
*lp
)
15032 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15033 if (!mref_reader
.is_state_satisfied())
15034 return -CEPHFS_ENOTCONN
;
15036 std::scoped_lock
lock(client_lock
);
15038 Fh
*f
= get_filehandle(fd
);
15040 return -CEPHFS_EBADF
;
15041 Inode
*in
= f
->inode
.get();
15045 ldout(cct
, 3) << __func__
<< "(" << fd
<< ") = 0" << dendl
;
15049 int64_t Client::get_default_pool_id()
15051 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15052 if (!mref_reader
.is_state_satisfied())
15053 return -CEPHFS_ENOTCONN
;
15055 std::scoped_lock
lock(client_lock
);
15057 /* first data pool is the default */
15058 return mdsmap
->get_first_data_pool();
15063 int64_t Client::get_pool_id(const char *pool_name
)
15065 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15066 if (!mref_reader
.is_state_satisfied())
15067 return -CEPHFS_ENOTCONN
;
15069 std::scoped_lock
lock(client_lock
);
15071 return objecter
->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name
),
15075 string
Client::get_pool_name(int64_t pool
)
15077 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15078 if (!mref_reader
.is_state_satisfied())
15081 std::scoped_lock
lock(client_lock
);
15083 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
15084 return o
.have_pg_pool(pool
) ? o
.get_pool_name(pool
) : string();
15088 int Client::get_pool_replication(int64_t pool
)
15090 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15091 if (!mref_reader
.is_state_satisfied())
15092 return -CEPHFS_ENOTCONN
;
15094 std::scoped_lock
lock(client_lock
);
15096 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
15097 return o
.have_pg_pool(pool
) ? o
.get_pg_pool(pool
)->get_size() : -CEPHFS_ENOENT
;
15101 int Client::get_file_extent_osds(int fd
, loff_t off
, loff_t
*len
, vector
<int>& osds
)
15103 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15104 if (!mref_reader
.is_state_satisfied())
15105 return -CEPHFS_ENOTCONN
;
15107 std::scoped_lock
lock(client_lock
);
15109 Fh
*f
= get_filehandle(fd
);
15111 return -CEPHFS_EBADF
;
15112 Inode
*in
= f
->inode
.get();
15114 vector
<ObjectExtent
> extents
;
15115 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, 1, in
->truncate_size
, extents
);
15116 ceph_assert(extents
.size() == 1);
15118 objecter
->with_osdmap([&](const OSDMap
& o
) {
15119 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
15120 o
.pg_to_acting_osds(pg
, osds
);
15124 return -CEPHFS_EINVAL
;
15127 * Return the remainder of the extent (stripe unit)
15129 * If length = 1 is passed to Striper::file_to_extents we get a single
15130 * extent back, but its length is one so we still need to compute the length
15131 * to the end of the stripe unit.
15133 * If length = su then we may get 1 or 2 objects back in the extents vector
15134 * which would have to be examined. Even then, the offsets are local to the
15135 * object, so matching up to the file offset is extra work.
15137 * It seems simpler to stick with length = 1 and manually compute the
15141 uint64_t su
= in
->layout
.stripe_unit
;
15142 *len
= su
- (off
% su
);
15148 int Client::get_osd_crush_location(int id
, vector
<pair
<string
, string
> >& path
)
15150 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15151 if (!mref_reader
.is_state_satisfied())
15152 return -CEPHFS_ENOTCONN
;
15154 std::scoped_lock
lock(client_lock
);
15157 return -CEPHFS_EINVAL
;
15158 return objecter
->with_osdmap([&](const OSDMap
& o
) {
15159 return o
.crush
->get_full_location_ordered(id
, path
);
15163 int Client::get_file_stripe_address(int fd
, loff_t offset
,
15164 vector
<entity_addr_t
>& address
)
15166 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15167 if (!mref_reader
.is_state_satisfied())
15168 return -CEPHFS_ENOTCONN
;
15170 std::scoped_lock
lock(client_lock
);
15172 Fh
*f
= get_filehandle(fd
);
15174 return -CEPHFS_EBADF
;
15175 Inode
*in
= f
->inode
.get();
15178 vector
<ObjectExtent
> extents
;
15179 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, 1,
15180 in
->truncate_size
, extents
);
15181 ceph_assert(extents
.size() == 1);
15183 // now we have the object and its 'layout'
15184 return objecter
->with_osdmap([&](const OSDMap
& o
) {
15185 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
15187 o
.pg_to_acting_osds(pg
, osds
);
15189 return -CEPHFS_EINVAL
;
15190 for (unsigned i
= 0; i
< osds
.size(); i
++) {
15191 entity_addr_t addr
= o
.get_addrs(osds
[i
]).front();
15192 address
.push_back(addr
);
15198 int Client::get_osd_addr(int osd
, entity_addr_t
& addr
)
15200 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15201 if (!mref_reader
.is_state_satisfied())
15202 return -CEPHFS_ENOTCONN
;
15204 std::scoped_lock
lock(client_lock
);
15206 return objecter
->with_osdmap([&](const OSDMap
& o
) {
15207 if (!o
.exists(osd
))
15208 return -CEPHFS_ENOENT
;
15210 addr
= o
.get_addrs(osd
).front();
15215 int Client::enumerate_layout(int fd
, vector
<ObjectExtent
>& result
,
15216 loff_t length
, loff_t offset
)
15218 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15219 if (!mref_reader
.is_state_satisfied())
15220 return -CEPHFS_ENOTCONN
;
15222 std::scoped_lock
lock(client_lock
);
15224 Fh
*f
= get_filehandle(fd
);
15226 return -CEPHFS_EBADF
;
15227 Inode
*in
= f
->inode
.get();
15229 // map to a list of extents
15230 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, length
, in
->truncate_size
, result
);
15232 ldout(cct
, 3) << __func__
<< "(" << fd
<< ", " << length
<< ", " << offset
<< ") = 0" << dendl
;
15237 /* find an osd with the same ip. -CEPHFS_ENXIO if none. */
15238 int Client::get_local_osd()
15240 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15241 if (!mref_reader
.is_state_satisfied())
15242 return -CEPHFS_ENOTCONN
;
15244 std::scoped_lock
lock(client_lock
);
15246 objecter
->with_osdmap([this](const OSDMap
& o
) {
15247 if (o
.get_epoch() != local_osd_epoch
) {
15248 local_osd
= o
.find_osd_on_ip(messenger
->get_myaddrs().front());
15249 local_osd_epoch
= o
.get_epoch();
15260 // ===============================
15262 void Client::ms_handle_connect(Connection
*con
)
15264 ldout(cct
, 10) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
15267 bool Client::ms_handle_reset(Connection
*con
)
15269 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
15273 void Client::ms_handle_remote_reset(Connection
*con
)
15275 std::scoped_lock
lock(client_lock
);
15276 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
15277 switch (con
->get_peer_type()) {
15278 case CEPH_ENTITY_TYPE_MDS
:
15280 // kludge to figure out which mds this is; fixme with a Connection* state
15281 mds_rank_t mds
= MDS_RANK_NONE
;
15282 MetaSessionRef s
= NULL
;
15283 for (auto &p
: mds_sessions
) {
15284 if (mdsmap
->have_inst(p
.first
) && mdsmap
->get_addrs(p
.first
) == con
->get_peer_addrs()) {
15290 ceph_assert(s
!= NULL
);
15291 switch (s
->state
) {
15292 case MetaSession::STATE_CLOSING
:
15293 ldout(cct
, 1) << "reset from mds we were closing; we'll call that closed" << dendl
;
15294 _closed_mds_session(s
.get());
15297 case MetaSession::STATE_OPENING
:
15299 ldout(cct
, 1) << "reset from mds we were opening; retrying" << dendl
;
15300 list
<Context
*> waiters
;
15301 waiters
.swap(s
->waiting_for_open
);
15302 _closed_mds_session(s
.get());
15303 auto news
= _get_or_open_mds_session(mds
);
15304 news
->waiting_for_open
.swap(waiters
);
15308 case MetaSession::STATE_OPEN
:
15310 objecter
->maybe_request_map(); /* to check if we are blocklisted */
15311 if (cct
->_conf
.get_val
<bool>("client_reconnect_stale")) {
15312 ldout(cct
, 1) << "reset from mds we were open; close mds session for reconnect" << dendl
;
15313 _closed_mds_session(s
.get());
15315 ldout(cct
, 1) << "reset from mds we were open; mark session as stale" << dendl
;
15316 s
->state
= MetaSession::STATE_STALE
;
15321 case MetaSession::STATE_NEW
:
15322 case MetaSession::STATE_CLOSED
:
15332 bool Client::ms_handle_refused(Connection
*con
)
15334 ldout(cct
, 1) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
15338 Inode
*Client::get_quota_root(Inode
*in
, const UserPerm
& perms
)
15340 Inode
*quota_in
= root_ancestor
;
15341 SnapRealm
*realm
= in
->snaprealm
;
15343 if (!cct
->_conf
.get_val
<bool>("client_quota"))
15347 ldout(cct
, 10) << __func__
<< " realm " << realm
->ino
<< dendl
;
15348 if (realm
->ino
!= in
->ino
) {
15349 auto p
= inode_map
.find(vinodeno_t(realm
->ino
, CEPH_NOSNAP
));
15350 if (p
== inode_map
.end())
15353 if (p
->second
->quota
.is_enable()) {
15354 quota_in
= p
->second
;
15358 realm
= realm
->pparent
;
15360 ldout(cct
, 10) << __func__
<< " " << in
->vino() << " -> " << quota_in
->vino() << dendl
;
15365 * Traverse quota ancestors of the Inode, return true
15366 * if any of them passes the passed function
15368 bool Client::check_quota_condition(Inode
*in
, const UserPerm
& perms
,
15369 std::function
<bool (const Inode
&in
)> test
)
15371 if (!cct
->_conf
.get_val
<bool>("client_quota"))
15375 ceph_assert(in
!= NULL
);
15380 if (in
== root_ancestor
) {
15381 // We're done traversing, drop out
15384 // Continue up the tree
15385 in
= get_quota_root(in
, perms
);
15392 bool Client::is_quota_files_exceeded(Inode
*in
, const UserPerm
& perms
)
15394 return check_quota_condition(in
, perms
,
15395 [](const Inode
&in
) {
15396 return in
.quota
.max_files
&& in
.rstat
.rsize() >= in
.quota
.max_files
;
15400 bool Client::is_quota_bytes_exceeded(Inode
*in
, int64_t new_bytes
,
15401 const UserPerm
& perms
)
15403 return check_quota_condition(in
, perms
,
15404 [&new_bytes
](const Inode
&in
) {
15405 return in
.quota
.max_bytes
&& (in
.rstat
.rbytes
+ new_bytes
)
15406 > in
.quota
.max_bytes
;
15410 bool Client::is_quota_bytes_approaching(Inode
*in
, const UserPerm
& perms
)
15412 ceph_assert(in
->size
>= in
->reported_size
);
15413 const uint64_t size
= in
->size
- in
->reported_size
;
15414 return check_quota_condition(in
, perms
,
15415 [&size
](const Inode
&in
) {
15416 if (in
.quota
.max_bytes
) {
15417 if (in
.rstat
.rbytes
>= in
.quota
.max_bytes
) {
15421 const uint64_t space
= in
.quota
.max_bytes
- in
.rstat
.rbytes
;
15422 return (space
>> 4) < size
;
15436 int Client::check_pool_perm(Inode
*in
, int need
)
15438 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
15440 if (!cct
->_conf
->client_check_pool_perm
)
15443 /* Only need to do this for regular files */
15444 if (!in
->is_file())
15447 int64_t pool_id
= in
->layout
.pool_id
;
15448 std::string pool_ns
= in
->layout
.pool_ns
;
15449 std::pair
<int64_t, std::string
> perm_key(pool_id
, pool_ns
);
15452 auto it
= pool_perms
.find(perm_key
);
15453 if (it
== pool_perms
.end())
15455 if (it
->second
== POOL_CHECKING
) {
15456 // avoid concurrent checkings
15457 wait_on_list(waiting_for_pool_perm
);
15460 ceph_assert(have
& POOL_CHECKED
);
15466 if (in
->snapid
!= CEPH_NOSNAP
) {
15467 // pool permission check needs to write to the first object. But for snapshot,
15468 // head of the first object may have already been deleted. To avoid creating
15469 // orphan object, skip the check for now.
15473 pool_perms
[perm_key
] = POOL_CHECKING
;
15476 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (unsigned long long)in
->ino
);
15477 object_t oid
= oid_buf
;
15479 SnapContext nullsnapc
;
15481 C_SaferCond rd_cond
;
15482 ObjectOperation rd_op
;
15483 rd_op
.stat(nullptr, nullptr, nullptr);
15485 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), rd_op
,
15486 nullsnapc
, ceph::real_clock::now(), 0, &rd_cond
);
15488 C_SaferCond wr_cond
;
15489 ObjectOperation wr_op
;
15490 wr_op
.create(true);
15492 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), wr_op
,
15493 nullsnapc
, ceph::real_clock::now(), 0, &wr_cond
);
15495 client_lock
.unlock();
15496 int rd_ret
= rd_cond
.wait();
15497 int wr_ret
= wr_cond
.wait();
15498 client_lock
.lock();
15500 bool errored
= false;
15502 if (rd_ret
== 0 || rd_ret
== -CEPHFS_ENOENT
)
15504 else if (rd_ret
!= -CEPHFS_EPERM
) {
15505 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
15506 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
15510 if (wr_ret
== 0 || wr_ret
== -CEPHFS_EEXIST
)
15511 have
|= POOL_WRITE
;
15512 else if (wr_ret
!= -CEPHFS_EPERM
) {
15513 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
15514 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
15519 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
15520 // Raise EIO because actual error code might be misleading for
15521 // userspace filesystem user.
15522 pool_perms
.erase(perm_key
);
15523 signal_cond_list(waiting_for_pool_perm
);
15524 return -CEPHFS_EIO
;
15527 pool_perms
[perm_key
] = have
| POOL_CHECKED
;
15528 signal_cond_list(waiting_for_pool_perm
);
15531 if ((need
& CEPH_CAP_FILE_RD
) && !(have
& POOL_READ
)) {
15532 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
15533 << " need " << ccap_string(need
) << ", but no read perm" << dendl
;
15534 return -CEPHFS_EPERM
;
15536 if ((need
& CEPH_CAP_FILE_WR
) && !(have
& POOL_WRITE
)) {
15537 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
15538 << " need " << ccap_string(need
) << ", but no write perm" << dendl
;
15539 return -CEPHFS_EPERM
;
15545 int Client::_posix_acl_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
15547 if (acl_type
== POSIX_ACL
) {
15548 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
15549 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
15551 return posix_acl_permits(access_acl
, in
->uid
, in
->gid
, perms
, want
);
15554 return -CEPHFS_EAGAIN
;
15557 int Client::_posix_acl_chmod(Inode
*in
, mode_t mode
, const UserPerm
& perms
)
15559 if (acl_type
== NO_ACL
)
15562 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
15566 if (acl_type
== POSIX_ACL
) {
15567 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
15568 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
15569 bufferptr
acl(access_acl
.c_str(), access_acl
.length());
15570 r
= posix_acl_access_chmod(acl
, mode
);
15573 r
= _do_setxattr(in
, ACL_EA_ACCESS
, acl
.c_str(), acl
.length(), 0, perms
);
15579 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< " result=" << r
<< dendl
;
15583 int Client::_posix_acl_create(Inode
*dir
, mode_t
*mode
, bufferlist
& xattrs_bl
,
15584 const UserPerm
& perms
)
15586 if (acl_type
== NO_ACL
)
15589 if (S_ISLNK(*mode
))
15592 int r
= _getattr(dir
, CEPH_STAT_CAP_XATTR
, perms
, dir
->xattr_version
== 0);
15596 if (acl_type
== POSIX_ACL
) {
15597 if (dir
->xattrs
.count(ACL_EA_DEFAULT
)) {
15598 map
<string
, bufferptr
> xattrs
;
15600 const bufferptr
& default_acl
= dir
->xattrs
[ACL_EA_DEFAULT
];
15601 bufferptr
acl(default_acl
.c_str(), default_acl
.length());
15602 r
= posix_acl_inherit_mode(acl
, mode
);
15607 r
= posix_acl_equiv_mode(acl
.c_str(), acl
.length(), mode
);
15611 xattrs
[ACL_EA_ACCESS
] = acl
;
15614 if (S_ISDIR(*mode
))
15615 xattrs
[ACL_EA_DEFAULT
] = dir
->xattrs
[ACL_EA_DEFAULT
];
15619 encode(xattrs
, xattrs_bl
);
15622 *mode
&= ~umask_cb(callback_handle
);
15627 ldout(cct
, 10) << __func__
<< " dir ino " << dir
->ino
<< " result=" << r
<< dendl
;
15631 void Client::set_filer_flags(int flags
)
15633 std::scoped_lock
l(client_lock
);
15634 ceph_assert(flags
== 0 ||
15635 flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
15636 objecter
->add_global_op_flags(flags
);
15639 void Client::clear_filer_flags(int flags
)
15641 std::scoped_lock
l(client_lock
);
15642 ceph_assert(flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
15643 objecter
->clear_global_op_flag(flags
);
15646 // called before mount
15647 void Client::set_uuid(const std::string
& uuid
)
15649 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
15650 ceph_assert(iref_reader
.is_state_satisfied());
15652 std::scoped_lock
l(client_lock
);
15653 ceph_assert(!uuid
.empty());
15655 metadata
["uuid"] = uuid
;
15659 // called before mount. 0 means infinite
15660 void Client::set_session_timeout(unsigned timeout
)
15662 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
15663 ceph_assert(iref_reader
.is_state_satisfied());
15665 std::scoped_lock
l(client_lock
);
15667 metadata
["timeout"] = stringify(timeout
);
15670 // called before mount
15671 int Client::start_reclaim(const std::string
& uuid
, unsigned flags
,
15672 const std::string
& fs_name
)
15674 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
15675 if (!iref_reader
.is_state_satisfied())
15676 return -CEPHFS_ENOTCONN
;
15679 return -CEPHFS_EINVAL
;
15681 std::unique_lock
l(client_lock
);
15683 auto it
= metadata
.find("uuid");
15684 if (it
!= metadata
.end() && it
->second
== uuid
)
15685 return -CEPHFS_EINVAL
;
15688 int r
= subscribe_mdsmap(fs_name
);
15690 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
15694 if (metadata
.empty())
15695 populate_metadata("");
15697 while (mdsmap
->get_epoch() == 0)
15698 wait_on_list(waiting_for_mdsmap
);
15701 for (unsigned mds
= 0; mds
< mdsmap
->get_num_in_mds(); ) {
15702 if (!mdsmap
->is_up(mds
)) {
15703 ldout(cct
, 10) << "mds." << mds
<< " not active, waiting for new mdsmap" << dendl
;
15704 wait_on_list(waiting_for_mdsmap
);
15708 MetaSessionRef session
;
15709 if (!have_open_session(mds
)) {
15710 session
= _get_or_open_mds_session(mds
);
15711 if (session
->state
== MetaSession::STATE_REJECTED
)
15712 return -CEPHFS_EPERM
;
15713 if (session
->state
!= MetaSession::STATE_OPENING
) {
15715 return -CEPHFS_EINVAL
;
15717 ldout(cct
, 10) << "waiting for session to mds." << mds
<< " to open" << dendl
;
15718 wait_on_context_list(session
->waiting_for_open
);
15722 session
= mds_sessions
.at(mds
);
15723 if (!session
->mds_features
.test(CEPHFS_FEATURE_RECLAIM_CLIENT
))
15724 return -CEPHFS_EOPNOTSUPP
;
15726 if (session
->reclaim_state
== MetaSession::RECLAIM_NULL
||
15727 session
->reclaim_state
== MetaSession::RECLAIMING
) {
15728 session
->reclaim_state
= MetaSession::RECLAIMING
;
15729 auto m
= make_message
<MClientReclaim
>(uuid
, flags
);
15730 session
->con
->send_message2(std::move(m
));
15731 wait_on_list(waiting_for_reclaim
);
15732 } else if (session
->reclaim_state
== MetaSession::RECLAIM_FAIL
) {
15733 return reclaim_errno
? : -CEPHFS_ENOTRECOVERABLE
;
15739 // didn't find target session in any mds
15740 if (reclaim_target_addrs
.empty()) {
15741 if (flags
& CEPH_RECLAIM_RESET
)
15742 return -CEPHFS_ENOENT
;
15743 return -CEPHFS_ENOTRECOVERABLE
;
15746 if (flags
& CEPH_RECLAIM_RESET
)
15749 // use blocklist to check if target session was killed
15750 // (config option mds_session_blocklist_on_evict needs to be true)
15751 ldout(cct
, 10) << __func__
<< ": waiting for OSD epoch " << reclaim_osd_epoch
<< dendl
;
15754 objecter
->wait_for_map(reclaim_osd_epoch
, ca::use_blocked
[ec
]);
15758 return ceph::from_error_code(ec
);
15760 bool blocklisted
= objecter
->with_osdmap(
15761 [this](const OSDMap
&osd_map
) -> bool {
15762 return osd_map
.is_blocklisted(reclaim_target_addrs
);
15765 return -CEPHFS_ENOTRECOVERABLE
;
15767 metadata
["reclaiming_uuid"] = uuid
;
15771 void Client::finish_reclaim()
15773 auto it
= metadata
.find("reclaiming_uuid");
15774 if (it
== metadata
.end()) {
15775 for (auto &p
: mds_sessions
)
15776 p
.second
->reclaim_state
= MetaSession::RECLAIM_NULL
;
15780 for (auto &p
: mds_sessions
) {
15781 p
.second
->reclaim_state
= MetaSession::RECLAIM_NULL
;
15782 auto m
= make_message
<MClientReclaim
>("", MClientReclaim::FLAG_FINISH
);
15783 p
.second
->con
->send_message2(std::move(m
));
15786 metadata
["uuid"] = it
->second
;
15787 metadata
.erase(it
);
15790 void Client::handle_client_reclaim_reply(const MConstRef
<MClientReclaimReply
>& reply
)
15792 mds_rank_t from
= mds_rank_t(reply
->get_source().num());
15793 ldout(cct
, 10) << __func__
<< " " << *reply
<< " from mds." << from
<< dendl
;
15795 std::scoped_lock
cl(client_lock
);
15796 auto session
= _get_mds_session(from
, reply
->get_connection().get());
15798 ldout(cct
, 10) << " discarding reclaim reply from sessionless mds." << from
<< dendl
;
15802 if (reply
->get_result() >= 0) {
15803 session
->reclaim_state
= MetaSession::RECLAIM_OK
;
15804 if (reply
->get_epoch() > reclaim_osd_epoch
)
15805 reclaim_osd_epoch
= reply
->get_epoch();
15806 if (!reply
->get_addrs().empty())
15807 reclaim_target_addrs
= reply
->get_addrs();
15809 session
->reclaim_state
= MetaSession::RECLAIM_FAIL
;
15810 reclaim_errno
= reply
->get_result();
15813 signal_cond_list(waiting_for_reclaim
);
15817 * This is included in cap release messages, to cause
15818 * the MDS to wait until this OSD map epoch. It is necessary
15819 * in corner cases where we cancel RADOS ops, so that
15820 * nobody else tries to do IO to the same objects in
15821 * the same epoch as the cancelled ops.
15823 void Client::set_cap_epoch_barrier(epoch_t e
)
15825 ldout(cct
, 5) << __func__
<< " epoch = " << e
<< dendl
;
15826 cap_epoch_barrier
= e
;
15829 const char** Client::get_tracked_conf_keys() const
15831 static const char* keys
[] = {
15832 "client_cache_size",
15833 "client_cache_mid",
15835 "client_deleg_timeout",
15836 "client_deleg_break_on_open",
15838 "client_oc_max_objects",
15839 "client_oc_max_dirty",
15840 "client_oc_target_dirty",
15841 "client_oc_max_dirty_age",
15842 "client_caps_release_delay",
15843 "client_mount_timeout",
15849 void Client::handle_conf_change(const ConfigProxy
& conf
,
15850 const std::set
<std::string
> &changed
)
15852 std::scoped_lock
lock(client_lock
);
15854 if (changed
.count("client_cache_mid")) {
15855 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
15857 if (changed
.count("client_acl_type")) {
15859 if (cct
->_conf
->client_acl_type
== "posix_acl")
15860 acl_type
= POSIX_ACL
;
15862 if (changed
.count("client_oc_size")) {
15863 objectcacher
->set_max_size(cct
->_conf
->client_oc_size
);
15865 if (changed
.count("client_oc_max_objects")) {
15866 objectcacher
->set_max_objects(cct
->_conf
->client_oc_max_objects
);
15868 if (changed
.count("client_oc_max_dirty")) {
15869 objectcacher
->set_max_dirty(cct
->_conf
->client_oc_max_dirty
);
15871 if (changed
.count("client_oc_target_dirty")) {
15872 objectcacher
->set_target_dirty(cct
->_conf
->client_oc_target_dirty
);
15874 if (changed
.count("client_oc_max_dirty_age")) {
15875 objectcacher
->set_max_dirty_age(cct
->_conf
->client_oc_max_dirty_age
);
15877 if (changed
.count("client_collect_and_send_global_metrics")) {
15878 _collect_and_send_global_metrics
= cct
->_conf
.get_val
<bool>(
15879 "client_collect_and_send_global_metrics");
15881 if (changed
.count("client_caps_release_delay")) {
15882 caps_release_delay
= cct
->_conf
.get_val
<std::chrono::seconds
>(
15883 "client_caps_release_delay");
15885 if (changed
.count("client_mount_timeout")) {
15886 mount_timeout
= cct
->_conf
.get_val
<std::chrono::seconds
>(
15887 "client_mount_timeout");
15891 void intrusive_ptr_add_ref(Inode
*in
)
15896 void intrusive_ptr_release(Inode
*in
)
15898 in
->client
->put_inode(in
);
15901 mds_rank_t
Client::_get_random_up_mds() const
15903 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
15905 std::set
<mds_rank_t
> up
;
15906 mdsmap
->get_up_mds_set(up
);
15909 return MDS_RANK_NONE
;
15910 std::set
<mds_rank_t
>::const_iterator p
= up
.begin();
15911 for (int n
= rand() % up
.size(); n
; n
--)
15917 StandaloneClient::StandaloneClient(Messenger
*m
, MonClient
*mc
,
15918 boost::asio::io_context
& ictx
)
15919 : Client(m
, mc
, new Objecter(m
->cct
, m
, mc
, ictx
))
15921 monclient
->set_messenger(m
);
15922 objecter
->set_client_incarnation(0);
15925 StandaloneClient::~StandaloneClient()
15928 objecter
= nullptr;
15931 int StandaloneClient::init()
15933 RWRef_t
iref_writer(initialize_state
, CLIENT_INITIALIZING
, false);
15934 ceph_assert(iref_writer
.is_first_writer());
15939 client_lock
.lock();
15941 messenger
->add_dispatcher_tail(objecter
);
15942 messenger
->add_dispatcher_tail(this);
15944 monclient
->set_want_keys(CEPH_ENTITY_TYPE_MDS
| CEPH_ENTITY_TYPE_OSD
);
15945 int r
= monclient
->init();
15947 // need to do cleanup because we're in an intermediate init state
15949 std::scoped_lock
l(timer_lock
);
15953 client_lock
.unlock();
15954 objecter
->shutdown();
15955 objectcacher
->stop();
15956 monclient
->shutdown();
15961 client_lock
.unlock();
15963 iref_writer
.update_state(CLIENT_INITIALIZED
);
15968 void StandaloneClient::shutdown()
15970 Client::shutdown();
15971 objecter
->shutdown();
15972 monclient
->shutdown();