1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
18 #include <sys/types.h>
23 #include <sys/param.h>
27 #include <sys/utsname.h>
31 #include <boost/lexical_cast.hpp>
32 #include <boost/fusion/include/std_pair.hpp>
34 #include "common/async/waiter.h"
36 #if defined(__FreeBSD__)
37 #define XATTR_CREATE 0x1
38 #define XATTR_REPLACE 0x2
39 #elif !defined(_WIN32)
40 #include <sys/xattr.h>
43 #if defined(__linux__)
44 #include <linux/falloc.h>
47 #include <sys/statvfs.h>
49 #include "common/config.h"
50 #include "common/version.h"
51 #include "common/async/blocked_completion.h"
53 #include "mon/MonClient.h"
55 #include "messages/MClientCaps.h"
56 #include "messages/MClientLease.h"
57 #include "messages/MClientQuota.h"
58 #include "messages/MClientReclaim.h"
59 #include "messages/MClientReclaimReply.h"
60 #include "messages/MClientReconnect.h"
61 #include "messages/MClientReply.h"
62 #include "messages/MClientRequest.h"
63 #include "messages/MClientRequestForward.h"
64 #include "messages/MClientSession.h"
65 #include "messages/MClientSnap.h"
66 #include "messages/MClientMetrics.h"
67 #include "messages/MCommandReply.h"
68 #include "messages/MFSMap.h"
69 #include "messages/MFSMapUser.h"
70 #include "messages/MMDSMap.h"
71 #include "messages/MOSDMap.h"
73 #include "mds/flock.h"
74 #include "mds/cephfs_features.h"
76 #include "osd/OSDMap.h"
77 #include "osdc/Filer.h"
79 #include "common/Cond.h"
80 #include "common/perf_counters.h"
81 #include "common/admin_socket.h"
82 #include "common/errno.h"
83 #include "include/str_list.h"
85 #define dout_subsys ceph_subsys_client
87 #include "include/lru.h"
88 #include "include/compat.h"
89 #include "include/stringify.h"
90 #include "include/random.h"
95 #include "Delegation.h"
97 #include "ClientSnapRealm.h"
99 #include "MetaSession.h"
100 #include "MetaRequest.h"
101 #include "ObjecterWriteback.h"
102 #include "posix_acl.h"
104 #include "include/ceph_assert.h"
105 #include "include/stat.h"
107 #include "include/cephfs/ceph_ll_client.h"
109 #if HAVE_GETGROUPLIST
116 #define dout_prefix *_dout << "client." << whoami << " "
118 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
120 // FreeBSD fails to define this
124 // Darwin fails to define this
133 // Windows doesn't define those values. While the Posix compatibility layer
134 // doesn't support those values, the Windows native functions do provide
135 // similar flags. Special care should be taken if we're going to use those
136 // flags in ceph-dokan. The current values are no-ops, while propagating
137 // them to the rest of the code might cause the Windows functions to reject
140 #define O_NOFOLLOW 0x0
147 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
150 #define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH)
161 using namespace TOPNSPC::common
;
163 namespace bs
= boost::system
;
164 namespace ca
= ceph::async
;
166 void client_flush_set_callback(void *p
, ObjectCacher::ObjectSet
*oset
)
168 Client
*client
= static_cast<Client
*>(p
);
169 client
->flush_set_callback(oset
);
// Guard against client access to MDS-internal (reserved) inode numbers.
// NOTE(review): this extract is missing lines from the body; the visible
// part logs when MDS_IS_PRIVATE_INO(vino.ino) holds — presumably the
// function then reports the vino as reserved (true) and otherwise returns
// false. Confirm against the full source.
172 bool Client::is_reserved_vino(vinodeno_t
&vino
) {
173 if (MDS_IS_PRIVATE_INO(vino
.ino
)) {
// log at high priority (-1): an attempt to touch a reserved ino is a
// caller bug worth surfacing.
174 ldout(cct
, -1) << __func__
<< " attempt to access reserved inode number " << vino
<< dendl
;
180 // running average and standard deviation -- presented in
181 // Donald Knuth's TAoCP, Volume II.
// Running (incremental) mean, following the classic recurrence presented
// in Donald Knuth's TAoCP Vol. II (cf. Welford's online algorithm):
//   mean_n = mean_{n-1} + (x_n - mean_{n-1}) / n
//
// @param old_avg  mean of the previous (count - 1) samples
// @param value    the newly observed sample
// @param count    number of samples including `value`; must be >= 1
// @return the mean over all `count` samples
double calc_average(double old_avg, double value, uint64_t count) {
  double new_avg;
  if (count == 1) {
    // First sample: the mean is the sample itself. Handling this
    // explicitly keeps the result exact and independent of whatever
    // stale old_avg the caller passed in.
    new_avg = value;
  } else {
    new_avg = old_avg + ((value - old_avg) / count);
  }
  return new_avg;
}
// Running sum of squared deviations (the "M2" term of Welford's online
// variance algorithm, per Knuth TAoCP Vol. II):
//   M2_n = M2_{n-1} + (x_n - mean_{n-1}) * (x_n - mean_n)
// Variance is then M2 / (count - 1) (sample) or M2 / count (population).
//
// @param old_sq_sum  M2 accumulated over the previous (count - 1) samples
// @param old_mean    mean before incorporating `value`
// @param new_mean    mean after incorporating `value`
// @param value       the newly observed sample
// @param count       number of samples including `value`; must be >= 1
// @return the updated sum of squared deviations
double calc_sq_sum(double old_sq_sum, double old_mean, double new_mean,
                   double value, uint64_t count) {
  double new_sq_sum;
  if (count == 1) {
    // A single sample has no spread; start the accumulator at zero.
    new_sq_sum = 0.0;
  } else {
    new_sq_sum = old_sq_sum + (value - old_mean)*(value - new_mean);
  }
  return new_sq_sum;
}
207 Client::CommandHook::CommandHook(Client
*client
) :
212 int Client::CommandHook::call(
213 std::string_view command
,
214 const cmdmap_t
& cmdmap
,
220 f
->open_object_section("result");
222 std::scoped_lock l
{m_client
->client_lock
};
223 if (command
== "mds_requests")
224 m_client
->dump_mds_requests(f
);
225 else if (command
== "mds_sessions") {
226 bool cap_dump
= false;
227 cmd_getval(cmdmap
, "cap_dump", cap_dump
);
228 m_client
->dump_mds_sessions(f
, cap_dump
);
229 } else if (command
== "dump_cache")
230 m_client
->dump_cache(f
);
231 else if (command
== "kick_stale_sessions")
232 m_client
->_kick_stale_sessions();
233 else if (command
== "status")
234 m_client
->dump_status(f
);
236 ceph_abort_msg("bad command registered");
245 int Client::get_fd_inode(int fd
, InodeRef
*in
) {
247 if (fd
== CEPHFS_AT_FDCWD
) {
250 Fh
*f
= get_filehandle(fd
);
260 dir_result_t::dir_result_t(Inode
*in
, const UserPerm
& perms
)
261 : inode(in
), offset(0), next_offset(2),
262 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
// Re-initialize the fake-inode allocator: mark the whole usable range
// free and reset the allocation cursors.
// NOTE(review): lines are missing from this extract — the declaration of
// `start` (presumably the first allocatable fake ino, 1024 per the
// comments near _assign_faked_ino) and the platform #ifdef around the
// Windows branch are not visible; confirm against the full source.
266 void Client::_reset_faked_inos()
269 free_faked_inos
.clear();
// free range is [start, 2^32): (uint32_t)-1 - start + 1 entries
270 free_faked_inos
.insert(start
, (uint32_t)-1 - start
+ 1);
271 last_used_faked_ino
= 0;
272 last_used_faked_root
= 0;
274 // On Windows, sizeof(ino_t) is just 2. Despite that, most "native"
275 // Windows structures, including Dokan ones, are using 64B identifiers.
276 _use_faked_inos
= false;
// Non-Windows: fake inos when the native ino_t is too narrow to hold a
// 64-bit Ceph ino, or when explicitly enabled via configuration.
278 _use_faked_inos
= sizeof(ino_t
) < 8 || cct
->_conf
->client_use_faked_inos
;
282 void Client::_assign_faked_ino(Inode
*in
)
284 if (0 == last_used_faked_ino
)
285 last_used_faked_ino
= last_used_faked_ino
+ 2048; // start(1024)~2048 reserved for _assign_faked_root
286 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
287 if (it
== free_faked_inos
.end() && last_used_faked_ino
> 0) {
288 last_used_faked_ino
= 2048;
289 it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
291 ceph_assert(it
!= free_faked_inos
.end());
292 if (last_used_faked_ino
< it
.get_start()) {
293 ceph_assert(it
.get_len() > 0);
294 last_used_faked_ino
= it
.get_start();
296 ++last_used_faked_ino
;
297 ceph_assert(it
.get_start() + it
.get_len() > last_used_faked_ino
);
299 in
->faked_ino
= last_used_faked_ino
;
300 free_faked_inos
.erase(in
->faked_ino
);
301 faked_ino_map
[in
->faked_ino
] = in
->vino();
305 * In the faked mode, if you export multiple subdirectories,
306 * you will see that the inode numbers of the exported subdirectories
307 * are the same. so we distinguish the mount point by reserving
308 * the "fake ids" between "1024~2048" and combining the last
309 * 10bits(0x3ff) of the "root inodes".
311 void Client::_assign_faked_root(Inode
*in
)
313 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
314 if (it
== free_faked_inos
.end() && last_used_faked_root
> 0) {
315 last_used_faked_root
= 0;
316 it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
318 ceph_assert(it
!= free_faked_inos
.end());
319 vinodeno_t inode_info
= in
->vino();
320 uint64_t inode_num
= (uint64_t)inode_info
.ino
;
321 ldout(cct
, 10) << "inode_num " << inode_num
<< "inode_num & 0x3ff=" << (inode_num
& 0x3ff)<< dendl
;
322 last_used_faked_root
= it
.get_start() + (inode_num
& 0x3ff); // 0x3ff mask and get_start will not exceed 2048
323 ceph_assert(it
.get_start() + it
.get_len() > last_used_faked_root
);
325 in
->faked_ino
= last_used_faked_root
;
326 free_faked_inos
.erase(in
->faked_ino
);
327 faked_ino_map
[in
->faked_ino
] = in
->vino();
330 void Client::_release_faked_ino(Inode
*in
)
332 free_faked_inos
.insert(in
->faked_ino
);
333 faked_ino_map
.erase(in
->faked_ino
);
336 vinodeno_t
Client::_map_faked_ino(ino_t ino
)
341 else if (faked_ino_map
.count(ino
))
342 vino
= faked_ino_map
[ino
];
344 vino
= vinodeno_t(0, CEPH_NOSNAP
);
345 ldout(cct
, 10) << __func__
<< " " << ino
<< " -> " << vino
<< dendl
;
349 vinodeno_t
Client::map_faked_ino(ino_t ino
)
351 std::scoped_lock
lock(client_lock
);
352 return _map_faked_ino(ino
);
357 Client::Client(Messenger
*m
, MonClient
*mc
, Objecter
*objecter_
)
358 : Dispatcher(m
->cct
->get()),
359 timer(m
->cct
, timer_lock
, false),
363 whoami(mc
->get_global_id()),
364 mount_state(CLIENT_UNMOUNTED
, "Client::mountstate_lock"),
365 initialize_state(CLIENT_NEW
, "Client::initstate_lock"),
366 cct_deleter
{m
->cct
, [](CephContext
*p
) {p
->put();}},
367 async_ino_invalidator(m
->cct
),
368 async_dentry_invalidator(m
->cct
),
369 interrupt_finisher(m
->cct
),
370 remount_finisher(m
->cct
),
371 async_ino_releasor(m
->cct
),
372 objecter_finisher(m
->cct
),
373 m_command_hook(this),
378 user_id
= cct
->_conf
->client_mount_uid
;
379 group_id
= cct
->_conf
->client_mount_gid
;
380 fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
381 "fuse_default_permissions");
383 _collect_and_send_global_metrics
= cct
->_conf
.get_val
<bool>(
384 "client_collect_and_send_global_metrics");
386 mount_timeout
= cct
->_conf
.get_val
<std::chrono::seconds
>(
387 "client_mount_timeout");
389 caps_release_delay
= cct
->_conf
.get_val
<std::chrono::seconds
>(
390 "client_caps_release_delay");
392 if (cct
->_conf
->client_acl_type
== "posix_acl")
393 acl_type
= POSIX_ACL
;
395 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
398 free_fd_set
.insert(10, 1<<30);
400 mdsmap
.reset(new MDSMap
);
403 writeback_handler
.reset(new ObjecterWriteback(objecter
, &objecter_finisher
,
405 objectcacher
.reset(new ObjectCacher(cct
, "libcephfs", *writeback_handler
, client_lock
,
406 client_flush_set_callback
, // all commit callback
408 cct
->_conf
->client_oc_size
,
409 cct
->_conf
->client_oc_max_objects
,
410 cct
->_conf
->client_oc_max_dirty
,
411 cct
->_conf
->client_oc_target_dirty
,
412 cct
->_conf
->client_oc_max_dirty_age
,
419 ceph_assert(ceph_mutex_is_not_locked(client_lock
));
421 // If the task is crashed or aborted and doesn't
422 // get any chance to run the umount and shutdow.
424 std::scoped_lock l
{client_lock
};
425 tick_thread_stopped
= true;
426 upkeep_cond
.notify_one();
429 if (upkeeper
.joinable())
432 // It is necessary to hold client_lock, because any inode destruction
433 // may call into ObjectCacher, which asserts that it's lock (which is
434 // client_lock) is held.
435 std::scoped_lock l
{client_lock
};
439 void Client::tear_down_cache()
442 for (auto &[fd
, fh
] : fd_map
) {
443 ldout(cct
, 1) << __func__
<< " forcing close of fh " << fd
<< " ino " << fh
->inode
->ino
<< dendl
;
448 while (!opened_dirs
.empty()) {
449 dir_result_t
*dirp
= *opened_dirs
.begin();
450 ldout(cct
, 1) << __func__
<< " forcing close of dir " << dirp
<< " ino " << dirp
->inode
->ino
<< dendl
;
459 ceph_assert(lru
.lru_get_size() == 0);
462 ceph_assert(inode_map
.size() <= 1 + root_parents
.size());
463 if (root
&& inode_map
.size() == 1 + root_parents
.size()) {
467 ceph_assert(inode_map
.empty());
// Return the ino of the mount root, as seen by the caller: the fake ino
// when faked-ino mode is active.
// NOTE(review): the non-faked return path is missing from this extract —
// presumably `return root->ino;` — confirm against the full source.
470 inodeno_t
Client::get_root_ino()
472 std::scoped_lock
l(client_lock
);
473 if (use_faked_inos())
474 return root
->faked_ino
;
479 Inode
*Client::get_root()
481 std::scoped_lock
l(client_lock
);
489 void Client::dump_inode(Formatter
*f
, Inode
*in
, set
<Inode
*>& did
, bool disconnected
)
492 in
->make_long_path(path
);
493 ldout(cct
, 1) << "dump_inode: "
494 << (disconnected
? "DISCONNECTED ":"")
495 << "inode " << in
->ino
497 << " ref " << in
->get_nref()
498 << " " << *in
<< dendl
;
501 f
->open_object_section("inode");
502 f
->dump_stream("path") << path
;
504 f
->dump_int("disconnected", 1);
511 ldout(cct
, 1) << " dir " << in
->dir
<< " size " << in
->dir
->dentries
.size() << dendl
;
512 for (ceph::unordered_map
<string
, Dentry
*>::iterator it
= in
->dir
->dentries
.begin();
513 it
!= in
->dir
->dentries
.end();
515 ldout(cct
, 1) << " " << in
->ino
<< " dn " << it
->first
<< " " << it
->second
<< " ref " << it
->second
->ref
<< dendl
;
517 f
->open_object_section("dentry");
521 if (it
->second
->inode
)
522 dump_inode(f
, it
->second
->inode
.get(), did
, false);
527 void Client::dump_cache(Formatter
*f
)
531 ldout(cct
, 1) << __func__
<< dendl
;
534 f
->open_array_section("cache");
537 dump_inode(f
, root
.get(), did
, true);
539 // make a second pass to catch anything disconnected
540 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
541 it
!= inode_map
.end();
543 if (did
.count(it
->second
))
545 dump_inode(f
, it
->second
, did
, true);
552 void Client::dump_status(Formatter
*f
)
554 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
556 ldout(cct
, 1) << __func__
<< dendl
;
558 const epoch_t osd_epoch
559 = objecter
->with_osdmap(std::mem_fn(&OSDMap::get_epoch
));
562 f
->open_object_section("metadata");
563 for (const auto& kv
: metadata
)
564 f
->dump_string(kv
.first
.c_str(), kv
.second
);
567 f
->dump_int("dentry_count", lru
.lru_get_size());
568 f
->dump_int("dentry_pinned_count", lru
.lru_get_num_pinned());
569 f
->dump_int("id", get_nodeid().v
);
570 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr_legacy());
571 f
->dump_object("inst", inst
);
572 f
->dump_object("addr", inst
.addr
);
573 f
->dump_stream("inst_str") << inst
.name
<< " " << inst
.addr
.get_legacy_str();
574 f
->dump_string("addr_str", inst
.addr
.get_legacy_str());
575 f
->dump_int("inode_count", inode_map
.size());
576 f
->dump_int("mds_epoch", mdsmap
->get_epoch());
577 f
->dump_int("osd_epoch", osd_epoch
);
578 f
->dump_int("osd_epoch_barrier", cap_epoch_barrier
);
579 f
->dump_bool("blocklisted", blocklisted
);
580 f
->dump_string("fs_name", mdsmap
->get_fs_name());
584 void Client::_pre_init()
588 objecter_finisher
.start();
589 filer
.reset(new Filer(objecter
, &objecter_finisher
));
591 objectcacher
->start();
596 RWRef_t
iref_writer(initialize_state
, CLIENT_INITIALIZING
, false);
597 ceph_assert(iref_writer
.is_first_writer());
601 std::scoped_lock l
{client_lock
};
602 messenger
->add_dispatcher_tail(this);
605 iref_writer
.update_state(CLIENT_INITIALIZED
);
609 void Client::_finish_init()
612 std::scoped_lock l
{client_lock
};
614 PerfCountersBuilder
plb(cct
, "client", l_c_first
, l_c_last
);
615 plb
.add_time_avg(l_c_reply
, "reply", "Latency of receiving a reply on metadata request");
616 plb
.add_time_avg(l_c_lat
, "lat", "Latency of processing a metadata request");
617 plb
.add_time_avg(l_c_wrlat
, "wrlat", "Latency of a file data write operation");
618 plb
.add_time_avg(l_c_read
, "rdlat", "Latency of a file data read operation");
619 plb
.add_time_avg(l_c_fsync
, "fsync", "Latency of a file sync operation");
620 // average, standard deviation mds/r/w/ latencies
621 plb
.add_time(l_c_md_avg
, "mdavg", "Average latency for processing metadata requests");
622 plb
.add_u64(l_c_md_sqsum
, "mdsqsum", "Sum of squares (to calculate variability/stdev) for metadata requests");
623 plb
.add_u64(l_c_md_ops
, "mdops", "Total metadata IO operations");
624 plb
.add_time(l_c_rd_avg
, "readavg", "Average latency for processing read requests");
625 plb
.add_u64(l_c_rd_sqsum
, "readsqsum", "Sum of squares ((to calculate variability/stdev) for read requests");
626 plb
.add_u64(l_c_rd_ops
, "rdops", "Total read IO operations");
627 plb
.add_time(l_c_wr_avg
, "writeavg", "Average latency for processing write requests");
628 plb
.add_u64(l_c_wr_sqsum
, "writesqsum", "Sum of squares ((to calculate variability/stdev) for write requests");
629 plb
.add_u64(l_c_wr_ops
, "rdops", "Total write IO operations");
630 logger
.reset(plb
.create_perf_counters());
631 cct
->get_perfcounters_collection()->add(logger
.get());
634 cct
->_conf
.add_observer(this);
636 AdminSocket
* admin_socket
= cct
->get_admin_socket();
637 int ret
= admin_socket
->register_command("mds_requests",
639 "show in-progress mds requests");
641 lderr(cct
) << "error registering admin socket command: "
642 << cpp_strerror(-ret
) << dendl
;
644 ret
= admin_socket
->register_command("mds_sessions "
645 "name=cap_dump,type=CephBool,req=false",
647 "show mds session state");
649 lderr(cct
) << "error registering admin socket command: "
650 << cpp_strerror(-ret
) << dendl
;
652 ret
= admin_socket
->register_command("dump_cache",
654 "show in-memory metadata cache contents");
656 lderr(cct
) << "error registering admin socket command: "
657 << cpp_strerror(-ret
) << dendl
;
659 ret
= admin_socket
->register_command("kick_stale_sessions",
661 "kick sessions that were remote reset");
663 lderr(cct
) << "error registering admin socket command: "
664 << cpp_strerror(-ret
) << dendl
;
666 ret
= admin_socket
->register_command("status",
668 "show overall client status");
670 lderr(cct
) << "error registering admin socket command: "
671 << cpp_strerror(-ret
) << dendl
;
675 void Client::shutdown()
677 ldout(cct
, 1) << __func__
<< dendl
;
679 // If we were not mounted, but were being used for sending
680 // MDS commands, we may have sessions that need closing.
682 std::scoped_lock l
{client_lock
};
684 // To make sure the tick thread will be stopped before
685 // destructing the Client, just in case the _mount()
686 // failed but didn't get a chance to stop the tick
688 tick_thread_stopped
= true;
689 upkeep_cond
.notify_one();
693 cct
->_conf
.remove_observer(this);
695 cct
->get_admin_socket()->unregister_commands(&m_command_hook
);
697 if (ino_invalidate_cb
) {
698 ldout(cct
, 10) << "shutdown stopping cache invalidator finisher" << dendl
;
699 async_ino_invalidator
.wait_for_empty();
700 async_ino_invalidator
.stop();
703 if (dentry_invalidate_cb
) {
704 ldout(cct
, 10) << "shutdown stopping dentry invalidator finisher" << dendl
;
705 async_dentry_invalidator
.wait_for_empty();
706 async_dentry_invalidator
.stop();
709 if (switch_interrupt_cb
) {
710 ldout(cct
, 10) << "shutdown stopping interrupt finisher" << dendl
;
711 interrupt_finisher
.wait_for_empty();
712 interrupt_finisher
.stop();
716 ldout(cct
, 10) << "shutdown stopping remount finisher" << dendl
;
717 remount_finisher
.wait_for_empty();
718 remount_finisher
.stop();
721 if (ino_release_cb
) {
722 ldout(cct
, 10) << "shutdown stopping inode release finisher" << dendl
;
723 async_ino_releasor
.wait_for_empty();
724 async_ino_releasor
.stop();
727 objectcacher
->stop(); // outside of client_lock! this does a join.
730 * We are shutting down the client.
732 * Just declare the state to CLIENT_NEW to block and fail any
733 * newly coming "reader" and then try to wait for all the in-flight
734 * "readers" to finish.
736 RWRef_t
iref_writer(initialize_state
, CLIENT_NEW
, false);
737 if (!iref_writer
.is_first_writer())
739 iref_writer
.wait_readers_done();
742 std::scoped_lock
l(timer_lock
);
746 objecter_finisher
.wait_for_empty();
747 objecter_finisher
.stop();
750 cct
->get_perfcounters_collection()->remove(logger
.get());
755 void Client::update_io_stat_metadata(utime_t latency
) {
756 auto lat_nsec
= latency
.to_nsec();
757 // old values are used to compute new ones
758 auto o_avg
= logger
->tget(l_c_md_avg
).to_nsec();
759 auto o_sqsum
= logger
->get(l_c_md_sqsum
);
761 auto n_avg
= calc_average(o_avg
, lat_nsec
, nr_metadata_request
);
762 auto n_sqsum
= calc_sq_sum(o_sqsum
, o_avg
, n_avg
, lat_nsec
,
763 nr_metadata_request
);
765 logger
->tinc(l_c_lat
, latency
);
766 logger
->tinc(l_c_reply
, latency
);
769 avg
.set_from_double(n_avg
/ 1000000000);
770 logger
->tset(l_c_md_avg
, avg
);
771 logger
->set(l_c_md_sqsum
, n_sqsum
);
772 logger
->set(l_c_md_ops
, nr_metadata_request
);
775 void Client::update_io_stat_read(utime_t latency
) {
776 auto lat_nsec
= latency
.to_nsec();
777 // old values are used to compute new ones
778 auto o_avg
= logger
->tget(l_c_rd_avg
).to_nsec();
779 auto o_sqsum
= logger
->get(l_c_rd_sqsum
);
781 auto n_avg
= calc_average(o_avg
, lat_nsec
, nr_read_request
);
782 auto n_sqsum
= calc_sq_sum(o_sqsum
, o_avg
, n_avg
, lat_nsec
,
785 logger
->tinc(l_c_read
, latency
);
788 avg
.set_from_double(n_avg
/ 1000000000);
789 logger
->tset(l_c_rd_avg
, avg
);
790 logger
->set(l_c_rd_sqsum
, n_sqsum
);
791 logger
->set(l_c_rd_ops
, nr_read_request
);
794 void Client::update_io_stat_write(utime_t latency
) {
795 auto lat_nsec
= latency
.to_nsec();
796 // old values are used to compute new ones
797 auto o_avg
= logger
->tget(l_c_wr_avg
).to_nsec();
798 auto o_sqsum
= logger
->get(l_c_wr_sqsum
);
800 auto n_avg
= calc_average(o_avg
, lat_nsec
, nr_write_request
);
801 auto n_sqsum
= calc_sq_sum(o_sqsum
, o_avg
, n_avg
, lat_nsec
,
804 logger
->tinc(l_c_wrlat
, latency
);
807 avg
.set_from_double(n_avg
/ 1000000000);
808 logger
->tset(l_c_wr_avg
, avg
);
809 logger
->set(l_c_wr_sqsum
, n_sqsum
);
810 logger
->set(l_c_wr_ops
, nr_write_request
);
813 // ===================
814 // metadata cache stuff
816 void Client::trim_cache(bool trim_kernel_dcache
)
818 uint64_t max
= cct
->_conf
->client_cache_size
;
819 ldout(cct
, 20) << "trim_cache size " << lru
.lru_get_size() << " max " << max
<< dendl
;
821 while (lru
.lru_get_size() != last
) {
822 last
= lru
.lru_get_size();
824 if (!is_unmounting() && lru
.lru_get_size() <= max
) break;
827 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_get_next_expire());
834 if (trim_kernel_dcache
&& lru
.lru_get_size() > max
)
835 _invalidate_kernel_dcache();
838 if (lru
.lru_get_size() == 0 && root
&& root
->get_nref() == 1 && inode_map
.size() == 1 + root_parents
.size()) {
839 ldout(cct
, 15) << "trim_cache trimmed root " << root
<< dendl
;
844 void Client::trim_cache_for_reconnect(MetaSession
*s
)
846 mds_rank_t mds
= s
->mds_num
;
847 ldout(cct
, 20) << __func__
<< " mds." << mds
<< dendl
;
850 list
<Dentry
*> skipped
;
851 while (lru
.lru_get_size() > 0) {
852 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_expire());
856 if ((dn
->inode
&& dn
->inode
->caps
.count(mds
)) ||
857 dn
->dir
->parent_inode
->caps
.count(mds
)) {
861 skipped
.push_back(dn
);
864 for(list
<Dentry
*>::iterator p
= skipped
.begin(); p
!= skipped
.end(); ++p
)
865 lru
.lru_insert_mid(*p
);
867 ldout(cct
, 20) << __func__
<< " mds." << mds
868 << " trimmed " << trimmed
<< " dentries" << dendl
;
870 if (s
->caps
.size() > 0)
871 _invalidate_kernel_dcache();
874 void Client::trim_dentry(Dentry
*dn
)
876 ldout(cct
, 15) << "trim_dentry unlinking dn " << dn
->name
878 << std::hex
<< dn
->dir
->parent_inode
->ino
<< std::dec
881 Inode
*diri
= dn
->dir
->parent_inode
;
882 clear_dir_complete_and_ordered(diri
, true);
884 unlink(dn
, false, false); // drop dir, drop dentry
888 void Client::update_inode_file_size(Inode
*in
, int issued
, uint64_t size
,
889 uint64_t truncate_seq
, uint64_t truncate_size
)
891 uint64_t prior_size
= in
->size
;
893 if (truncate_seq
> in
->truncate_seq
||
894 (truncate_seq
== in
->truncate_seq
&& size
> in
->size
)) {
895 ldout(cct
, 10) << "size " << in
->size
<< " -> " << size
<< dendl
;
897 in
->reported_size
= size
;
898 if (truncate_seq
!= in
->truncate_seq
) {
899 ldout(cct
, 10) << "truncate_seq " << in
->truncate_seq
<< " -> "
900 << truncate_seq
<< dendl
;
901 in
->truncate_seq
= truncate_seq
;
902 in
->oset
.truncate_seq
= truncate_seq
;
904 // truncate cached file data
905 if (prior_size
> size
) {
906 _invalidate_inode_cache(in
, size
, prior_size
- size
);
910 // truncate inline data
911 if (in
->inline_version
< CEPH_INLINE_NONE
) {
912 uint32_t len
= in
->inline_data
.length();
914 in
->inline_data
.splice(size
, len
- size
);
917 if (truncate_seq
>= in
->truncate_seq
&&
918 in
->truncate_size
!= truncate_size
) {
920 ldout(cct
, 10) << "truncate_size " << in
->truncate_size
<< " -> "
921 << truncate_size
<< dendl
;
922 in
->truncate_size
= truncate_size
;
923 in
->oset
.truncate_size
= truncate_size
;
925 ldout(cct
, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl
;
930 void Client::update_inode_file_time(Inode
*in
, int issued
, uint64_t time_warp_seq
,
931 utime_t ctime
, utime_t mtime
, utime_t atime
)
933 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << ccap_string(issued
)
934 << " ctime " << ctime
<< " mtime " << mtime
<< dendl
;
936 if (time_warp_seq
> in
->time_warp_seq
)
937 ldout(cct
, 10) << " mds time_warp_seq " << time_warp_seq
938 << " is higher than local time_warp_seq "
939 << in
->time_warp_seq
<< dendl
;
942 // be careful with size, mtime, atime
943 if (issued
& (CEPH_CAP_FILE_EXCL
|
945 CEPH_CAP_FILE_BUFFER
|
947 CEPH_CAP_XATTR_EXCL
)) {
948 ldout(cct
, 30) << "Yay have enough caps to look at our times" << dendl
;
949 if (ctime
> in
->ctime
)
951 if (time_warp_seq
> in
->time_warp_seq
) {
952 //the mds updated times, so take those!
955 in
->time_warp_seq
= time_warp_seq
;
956 } else if (time_warp_seq
== in
->time_warp_seq
) {
958 if (mtime
> in
->mtime
)
960 if (atime
> in
->atime
)
962 } else if (issued
& CEPH_CAP_FILE_EXCL
) {
963 //ignore mds values as we have a higher seq
966 ldout(cct
, 30) << "Don't have enough caps, just taking mds' time values" << dendl
;
967 if (time_warp_seq
>= in
->time_warp_seq
) {
971 in
->time_warp_seq
= time_warp_seq
;
975 ldout(cct
, 0) << "WARNING: " << *in
<< " mds time_warp_seq "
976 << time_warp_seq
<< " is lower than local time_warp_seq "
982 void Client::_fragmap_remove_non_leaves(Inode
*in
)
984 for (map
<frag_t
,int>::iterator p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
985 if (!in
->dirfragtree
.is_leaf(p
->first
))
986 in
->fragmap
.erase(p
++);
991 void Client::_fragmap_remove_stopped_mds(Inode
*in
, mds_rank_t mds
)
993 for (auto p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
994 if (p
->second
== mds
)
995 in
->fragmap
.erase(p
++);
1000 Inode
* Client::add_update_inode(InodeStat
*st
, utime_t from
,
1001 MetaSession
*session
,
1002 const UserPerm
& request_perms
)
1005 bool was_new
= false;
1006 if (inode_map
.count(st
->vino
)) {
1007 in
= inode_map
[st
->vino
];
1008 ldout(cct
, 12) << __func__
<< " had " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
1010 in
= new Inode(this, st
->vino
, &st
->layout
);
1011 inode_map
[st
->vino
] = in
;
1013 if (use_faked_inos())
1014 _assign_faked_ino(in
);
1018 if (use_faked_inos())
1019 _assign_faked_root(root
.get());
1022 } else if (is_mounting()) {
1023 root_parents
[root_ancestor
] = in
;
1028 in
->ino
= st
->vino
.ino
;
1029 in
->snapid
= st
->vino
.snapid
;
1030 in
->mode
= st
->mode
& S_IFMT
;
1034 in
->rdev
= st
->rdev
;
1035 if (in
->is_symlink())
1036 in
->symlink
= st
->symlink
;
1038 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
1039 bool new_version
= false;
1040 if (in
->version
== 0 ||
1041 ((st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
) &&
1042 (in
->version
& ~1) < st
->version
))
1046 in
->caps_issued(&issued
);
1047 issued
|= in
->caps_dirty();
1048 int new_issued
= ~issued
& (int)st
->cap
.caps
;
1050 bool need_snapdir_attr_refresh
= false;
1051 if ((new_version
|| (new_issued
& CEPH_CAP_AUTH_SHARED
)) &&
1052 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
1053 in
->mode
= st
->mode
;
1056 in
->btime
= st
->btime
;
1057 in
->snap_btime
= st
->snap_btime
;
1058 in
->snap_metadata
= st
->snap_metadata
;
1059 in
->fscrypt_auth
= st
->fscrypt_auth
;
1060 need_snapdir_attr_refresh
= true;
1063 if ((new_version
|| (new_issued
& CEPH_CAP_LINK_SHARED
)) &&
1064 !(issued
& CEPH_CAP_LINK_EXCL
)) {
1065 in
->nlink
= st
->nlink
;
1068 if (new_version
|| (new_issued
& CEPH_CAP_ANY_RD
)) {
1069 need_snapdir_attr_refresh
= true;
1070 update_inode_file_time(in
, issued
, st
->time_warp_seq
,
1071 st
->ctime
, st
->mtime
, st
->atime
);
1075 (new_issued
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
))) {
1076 in
->layout
= st
->layout
;
1077 in
->fscrypt_file
= st
->fscrypt_file
;
1078 update_inode_file_size(in
, issued
, st
->size
, st
->truncate_seq
, st
->truncate_size
);
1082 if (new_version
|| (new_issued
& CEPH_CAP_FILE_SHARED
)) {
1083 in
->dirstat
= st
->dirstat
;
1085 // dir_layout/rstat/quota are not tracked by capability, update them only if
1086 // the inode stat is from auth mds
1087 if (new_version
|| (st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
)) {
1088 in
->dir_layout
= st
->dir_layout
;
1089 ldout(cct
, 20) << " dir hash is " << (int)in
->dir_layout
.dl_dir_hash
<< dendl
;
1090 in
->rstat
= st
->rstat
;
1091 in
->quota
= st
->quota
;
1092 in
->dir_pin
= st
->dir_pin
;
1094 // move me if/when version reflects fragtree changes.
1095 if (in
->dirfragtree
!= st
->dirfragtree
) {
1096 in
->dirfragtree
= st
->dirfragtree
;
1097 _fragmap_remove_non_leaves(in
);
1101 if ((in
->xattr_version
== 0 || !(issued
& CEPH_CAP_XATTR_EXCL
)) &&
1102 st
->xattrbl
.length() &&
1103 st
->xattr_version
> in
->xattr_version
) {
1104 auto p
= st
->xattrbl
.cbegin();
1105 decode(in
->xattrs
, p
);
1106 in
->xattr_version
= st
->xattr_version
;
1107 need_snapdir_attr_refresh
= true;
1110 if (st
->inline_version
> in
->inline_version
) {
1111 in
->inline_data
= st
->inline_data
;
1112 in
->inline_version
= st
->inline_version
;
1115 /* always take a newer change attr */
1116 ldout(cct
, 12) << __func__
<< " client inode change_attr: " << in
->change_attr
<< " , mds inodestat change_attr: " << st
->change_attr
<< dendl
;
1117 if (st
->change_attr
> in
->change_attr
)
1118 in
->change_attr
= st
->change_attr
;
1120 if (st
->version
> in
->version
)
1121 in
->version
= st
->version
;
1124 ldout(cct
, 12) << __func__
<< " adding " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
1127 return in
; // as with readdir returning indoes in different snaprealms (no caps!)
1129 if (in
->snapid
== CEPH_NOSNAP
) {
1130 add_update_cap(in
, session
, st
->cap
.cap_id
, st
->cap
.caps
, st
->cap
.wanted
,
1131 st
->cap
.seq
, st
->cap
.mseq
, inodeno_t(st
->cap
.realm
),
1132 st
->cap
.flags
, request_perms
);
1133 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
1134 in
->max_size
= st
->max_size
;
1135 in
->rstat
= st
->rstat
;
1138 // setting I_COMPLETE needs to happen after adding the cap
1140 (st
->cap
.caps
& CEPH_CAP_FILE_SHARED
) &&
1141 (issued
& CEPH_CAP_FILE_EXCL
) == 0 &&
1142 in
->dirstat
.nfiles
== 0 &&
1143 in
->dirstat
.nsubdirs
== 0) {
1144 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in
<< dendl
;
1145 in
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
1147 ldout(cct
, 10) << " dir is open on empty dir " << in
->ino
<< " with "
1148 << in
->dir
->dentries
.size() << " entries, marking all dentries null" << dendl
;
1149 in
->dir
->readdir_cache
.clear();
1150 for (const auto& p
: in
->dir
->dentries
) {
1151 unlink(p
.second
, true, true); // keep dir, keep dentry
1153 if (in
->dir
->dentries
.empty())
1158 in
->snap_caps
|= st
->cap
.caps
;
1161 if (need_snapdir_attr_refresh
&& in
->is_dir() && in
->snapid
== CEPH_NOSNAP
) {
1162 vinodeno_t
vino(in
->ino
, CEPH_SNAPDIR
);
1163 if (inode_map
.count(vino
)) {
1164 refresh_snapdir_attrs(inode_map
[vino
], in
);
1173 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
1175 Dentry
*Client::insert_dentry_inode(Dir
*dir
, const string
& dname
, LeaseStat
*dlease
,
1176 Inode
*in
, utime_t from
, MetaSession
*session
,
1180 if (dir
->dentries
.count(dname
))
1181 dn
= dir
->dentries
[dname
];
1183 ldout(cct
, 12) << __func__
<< " '" << dname
<< "' vino " << in
->vino()
1184 << " in dir " << dir
->parent_inode
->vino() << " dn " << dn
1187 if (dn
&& dn
->inode
) {
1188 if (dn
->inode
->vino() == in
->vino()) {
1190 ldout(cct
, 12) << " had dentry " << dname
1191 << " with correct vino " << dn
->inode
->vino()
1194 ldout(cct
, 12) << " had dentry " << dname
1195 << " with WRONG vino " << dn
->inode
->vino()
1197 unlink(dn
, true, true); // keep dir, keep dentry
1201 if (!dn
|| !dn
->inode
) {
1202 InodeRef
tmp_ref(in
);
1204 if (old_dentry
->dir
!= dir
) {
1205 Inode
*old_diri
= old_dentry
->dir
->parent_inode
;
1206 clear_dir_complete_and_ordered(old_diri
, false);
1208 unlink(old_dentry
, dir
== old_dentry
->dir
, false); // drop dentry, keep dir open if its the same dir
1210 Inode
*diri
= dir
->parent_inode
;
1211 clear_dir_complete_and_ordered(diri
, false);
1212 dn
= link(dir
, dname
, in
, dn
);
1215 dn
->is_renaming
= false;
1216 signal_cond_list(waiting_for_rename
);
1220 update_dentry_lease(dn
, dlease
, from
, session
);
1224 void Client::update_dentry_lease(Dentry
*dn
, LeaseStat
*dlease
, utime_t from
, MetaSession
*session
)
1226 utime_t dttl
= from
;
1227 dttl
+= (float)dlease
->duration_ms
/ 1000.0;
1229 ldout(cct
, 15) << __func__
<< " " << *dn
<< " " << *dlease
<< " from " << from
<< dendl
;
1233 if (dlease
->mask
& CEPH_LEASE_VALID
) {
1234 if (dttl
> dn
->lease_ttl
) {
1235 ldout(cct
, 10) << "got dentry lease on " << dn
->name
1236 << " dur " << dlease
->duration_ms
<< "ms ttl " << dttl
<< dendl
;
1237 dn
->lease_ttl
= dttl
;
1238 dn
->lease_mds
= session
->mds_num
;
1239 dn
->lease_seq
= dlease
->seq
;
1240 dn
->lease_gen
= session
->cap_gen
;
1243 dn
->cap_shared_gen
= dn
->dir
->parent_inode
->shared_gen
;
1244 if (dlease
->mask
& CEPH_LEASE_PRIMARY_LINK
)
1246 dn
->alternate_name
= std::move(dlease
->alternate_name
);
1251 * update MDS location cache for a single inode
1253 void Client::update_dir_dist(Inode
*in
, DirStat
*dst
, mds_rank_t from
)
1256 ldout(cct
, 20) << "got dirfrag map for " << in
->ino
<< " frag " << dst
->frag
<< " to mds " << dst
->auth
<< dendl
;
1257 if (dst
->auth
>= 0) {
1258 in
->fragmap
[dst
->frag
] = dst
->auth
;
1260 in
->fragmap
.erase(dst
->frag
);
1262 if (!in
->dirfragtree
.is_leaf(dst
->frag
)) {
1263 in
->dirfragtree
.force_to_leaf(cct
, dst
->frag
);
1264 _fragmap_remove_non_leaves(in
);
1267 // replicated, only update from auth mds reply
1268 if (from
== dst
->auth
) {
1269 in
->dir_replicated
= !dst
->dist
.empty();
1270 if (!dst
->dist
.empty())
1271 in
->frag_repmap
[dst
->frag
].assign(dst
->dist
.begin(), dst
->dist
.end()) ;
1273 in
->frag_repmap
.erase(dst
->frag
);
1277 void Client::clear_dir_complete_and_ordered(Inode
*diri
, bool complete
)
1280 diri
->dir_release_count
++;
1282 diri
->dir_ordered_count
++;
1283 if (diri
->flags
& I_COMPLETE
) {
1285 ldout(cct
, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
1286 diri
->flags
&= ~(I_COMPLETE
| I_DIR_ORDERED
);
1288 if (diri
->flags
& I_DIR_ORDERED
) {
1289 ldout(cct
, 10) << " clearing I_DIR_ORDERED on " << *diri
<< dendl
;
1290 diri
->flags
&= ~I_DIR_ORDERED
;
1294 diri
->dir
->readdir_cache
.clear();
1299 * insert results from readdir or lssnap into the metadata cache.
1301 void Client::insert_readdir_results(MetaRequest
*request
, MetaSession
*session
,
1302 Inode
*diri
, Inode
*diri_other
) {
1304 auto& reply
= request
->reply
;
1305 ConnectionRef con
= request
->reply
->get_connection();
1307 if(session
->mds_features
.test(CEPHFS_FEATURE_REPLY_ENCODING
)) {
1308 features
= (uint64_t)-1;
1311 features
= con
->get_features();
1314 dir_result_t
*dirp
= request
->dirp
;
1317 // the extra buffer list is only set for readdir, lssnap and
1318 // readdir_snapdiff replies
1319 auto p
= reply
->get_extra_bl().cbegin();
1322 if (request
->head
.op
== CEPH_MDS_OP_LSSNAP
) {
1324 diri
= open_snapdir(diri
);
1326 bool snapdiff_req
= request
->head
.op
== CEPH_MDS_OP_READDIR_SNAPDIFF
;
1328 unsigned offset_hash
;
1330 fg
= (unsigned)request
->head
.args
.snapdiff
.frag
;
1331 offset_hash
= (unsigned)request
->head
.args
.snapdiff
.offset_hash
;
1333 fg
= (unsigned)request
->head
.args
.readdir
.frag
;
1334 offset_hash
= (unsigned)request
->head
.args
.readdir
.offset_hash
;
1337 // only open dir if we're actually adding stuff to it!
1338 Dir
*dir
= diri
->open_dir();
1340 //open opponent dir for snapdiff if any
1341 Dir
*dir_other
= nullptr;
1343 ceph_assert(diri_other
);
1344 dir_other
= diri_other
->open_dir();
1345 ceph_assert(dir_other
);
1349 DirStat
dst(p
, features
);
1355 bool end
= ((unsigned)flags
& CEPH_READDIR_FRAG_END
);
1356 bool hash_order
= ((unsigned)flags
& CEPH_READDIR_HASH_ORDER
);
1358 unsigned readdir_offset
= dirp
->next_offset
;
1359 string readdir_start
= dirp
->last_name
;
1360 ceph_assert(!readdir_start
.empty() || readdir_offset
== 2);
1362 unsigned last_hash
= 0;
1364 if (!readdir_start
.empty()) {
1365 last_hash
= ceph_frag_value(diri
->hash_dentry_name(readdir_start
));
1366 } else if (flags
& CEPH_READDIR_OFFSET_HASH
) {
1367 /* mds understands offset_hash */
1368 last_hash
= offset_hash
;
1372 if (fg
!= dst
.frag
) {
1373 ldout(cct
, 10) << "insert_trace got new frag " << fg
<< " -> " << dst
.frag
<< dendl
;
1377 readdir_start
.clear();
1378 dirp
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
, false);
1382 ldout(cct
, 10) << __func__
<< " " << numdn
<< " readdir items, end=" << end
1383 << ", hash_order=" << hash_order
1384 << ", readdir_start " << readdir_start
1385 << ", last_hash " << last_hash
1386 << ", next_offset " << readdir_offset
<< dendl
;
1388 if (diri
->snapid
!= CEPH_SNAPDIR
&&
1389 fg
.is_leftmost() && readdir_offset
== 2 &&
1390 !(hash_order
&& last_hash
)) {
1391 dirp
->release_count
= diri
->dir_release_count
;
1392 dirp
->ordered_count
= diri
->dir_ordered_count
;
1393 dirp
->start_shared_gen
= diri
->shared_gen
;
1394 dirp
->cache_index
= 0;
1397 dirp
->buffer_frag
= fg
;
1399 _readdir_drop_dirp_buffer(dirp
);
1400 dirp
->buffer
.reserve(numdn
);
1404 for (unsigned i
=0; i
<numdn
; i
++) {
1406 dlease
.decode(p
, features
);
1407 InodeStat
ist(p
, features
);
1409 ldout(cct
, 15) << "" << i
<< ": '" << dname
<< "'" << dendl
;
1411 Inode
*in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1413 auto *effective_dir
= dir
;
1414 auto *effective_diri
= diri
;
1416 if (snapdiff_req
&& in
->snapid
!= diri
->snapid
) {
1417 ceph_assert(diri_other
);
1418 ceph_assert(dir_other
);
1419 effective_diri
= diri_other
;
1420 effective_dir
= dir_other
;
1423 if (effective_dir
->dentries
.count(dname
)) {
1424 Dentry
*olddn
= effective_dir
->dentries
[dname
];
1425 if (olddn
->inode
!= in
) {
1426 // replace incorrect dentry
1427 unlink(olddn
, true, true); // keep dir, dentry
1428 dn
= link(effective_dir
, dname
, in
, olddn
);
1429 ceph_assert(dn
== olddn
);
1437 dn
= link(effective_dir
, dname
, in
, NULL
);
1439 dn
->alternate_name
= std::move(dlease
.alternate_name
);
1441 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
1443 unsigned hash
= ceph_frag_value(effective_diri
->hash_dentry_name(dname
));
1444 if (hash
!= last_hash
)
1447 dn
->offset
= dir_result_t::make_fpos(hash
, readdir_offset
++, true);
1449 dn
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
++, false);
1451 // add to readdir cache
1452 if (!snapdiff_req
&&
1453 dirp
->release_count
== effective_diri
->dir_release_count
&&
1454 dirp
->ordered_count
== effective_diri
->dir_ordered_count
&&
1455 dirp
->start_shared_gen
== effective_diri
->shared_gen
) {
1456 if (dirp
->cache_index
== effective_dir
->readdir_cache
.size()) {
1458 ceph_assert(!dirp
->inode
->is_complete_and_ordered());
1459 dir
->readdir_cache
.reserve(dirp
->cache_index
+ numdn
);
1461 effective_dir
->readdir_cache
.push_back(dn
);
1462 } else if (dirp
->cache_index
< effective_dir
->readdir_cache
.size()) {
1463 if (dirp
->inode
->is_complete_and_ordered())
1464 ceph_assert(effective_dir
->readdir_cache
[dirp
->cache_index
] == dn
);
1466 effective_dir
->readdir_cache
[dirp
->cache_index
] = dn
;
1468 ceph_abort_msg("unexpected readdir buffer idx");
1470 dirp
->cache_index
++;
1472 // add to cached result list
1473 dirp
->buffer
.push_back(dir_result_t::dentry(dn
->offset
, dname
, dn
->alternate_name
, in
));
1474 ldout(cct
, 15) << __func__
<< " " << hex
<< dn
->offset
<< dec
<< ": '" << dname
<< "' -> " << in
->ino
<< dendl
;
1478 dirp
->last_name
= dname
;
1480 dirp
->next_offset
= 2;
1482 dirp
->next_offset
= readdir_offset
;
1484 if (dir
->is_empty())
1486 if (dir_other
&& dir_other
->is_empty())
1487 close_dir(dir_other
);
1493 * insert a trace from a MDS reply into the cache.
1495 Inode
* Client::insert_trace(MetaRequest
*request
, MetaSession
*session
)
1497 auto& reply
= request
->reply
;
1498 int op
= request
->get_op();
1500 ldout(cct
, 10) << "insert_trace from " << request
->sent_stamp
<< " mds." << session
->mds_num
1501 << " is_target=" << (int)reply
->head
.is_target
1502 << " is_dentry=" << (int)reply
->head
.is_dentry
1505 auto p
= reply
->get_trace_bl().cbegin();
1506 if (request
->got_unsafe
) {
1507 ldout(cct
, 10) << "insert_trace -- already got unsafe; ignoring" << dendl
;
1508 ceph_assert(p
.end());
1513 ldout(cct
, 10) << "insert_trace -- no trace" << dendl
;
1515 Dentry
*d
= request
->dentry();
1517 Inode
*diri
= d
->dir
->parent_inode
;
1518 clear_dir_complete_and_ordered(diri
, true);
1521 if (d
&& reply
->get_result() == 0) {
1522 if (op
== CEPH_MDS_OP_RENAME
) {
1524 Dentry
*od
= request
->old_dentry();
1525 ldout(cct
, 10) << " unlinking rename src dn " << od
<< " for traceless reply" << dendl
;
1527 unlink(od
, true, true); // keep dir, dentry
1528 } else if (op
== CEPH_MDS_OP_RMDIR
||
1529 op
== CEPH_MDS_OP_UNLINK
) {
1531 ldout(cct
, 10) << " unlinking unlink/rmdir dn " << d
<< " for traceless reply" << dendl
;
1532 unlink(d
, true, true); // keep dir, dentry
1538 ConnectionRef con
= request
->reply
->get_connection();
1540 if (session
->mds_features
.test(CEPHFS_FEATURE_REPLY_ENCODING
)) {
1541 features
= (uint64_t)-1;
1544 features
= con
->get_features();
1546 ldout(cct
, 10) << " features 0x" << hex
<< features
<< dec
<< dendl
;
1549 SnapRealm
*realm
= NULL
;
1550 if (reply
->snapbl
.length())
1551 update_snap_trace(session
, reply
->snapbl
, &realm
);
1553 ldout(cct
, 10) << " hrm "
1554 << " is_target=" << (int)reply
->head
.is_target
1555 << " is_dentry=" << (int)reply
->head
.is_dentry
1564 if (reply
->head
.is_dentry
) {
1565 dirst
.decode(p
, features
);
1566 dst
.decode(p
, features
);
1568 dlease
.decode(p
, features
);
1572 if (reply
->head
.is_target
) {
1573 ist
.decode(p
, features
);
1574 if (cct
->_conf
->client_debug_getattr_caps
) {
1575 unsigned wanted
= 0;
1576 if (op
== CEPH_MDS_OP_GETATTR
|| op
== CEPH_MDS_OP_LOOKUP
)
1577 wanted
= request
->head
.args
.getattr
.mask
;
1578 else if (op
== CEPH_MDS_OP_OPEN
|| op
== CEPH_MDS_OP_CREATE
)
1579 wanted
= request
->head
.args
.open
.mask
;
1581 if ((wanted
& CEPH_CAP_XATTR_SHARED
) &&
1582 !(ist
.xattr_version
> 0 && ist
.xattrbl
.length() > 0))
1583 ceph_abort_msg("MDS reply does not contain xattrs");
1586 in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1591 if (reply
->head
.is_dentry
) {
1592 diri
= add_update_inode(&dirst
, request
->sent_stamp
, session
,
1594 mds_rank_t from_mds
= mds_rank_t(reply
->get_source().num());
1595 update_dir_dist(diri
, &dst
, from_mds
); // dir stat info is attached to ..
1598 Dir
*dir
= diri
->open_dir();
1599 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
,
1600 (op
== CEPH_MDS_OP_RENAME
) ? request
->old_dentry() : NULL
);
1603 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1604 dn
= diri
->dir
->dentries
[dname
];
1606 clear_dir_complete_and_ordered(diri
, false);
1607 unlink(dn
, true, true); // keep dir, dentry
1610 if (dlease
.duration_ms
> 0) {
1612 Dir
*dir
= diri
->open_dir();
1613 dn
= link(dir
, dname
, NULL
, NULL
);
1615 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
1618 } else if (op
== CEPH_MDS_OP_LOOKUPSNAP
||
1619 op
== CEPH_MDS_OP_MKSNAP
) {
1620 ldout(cct
, 10) << " faking snap lookup weirdness" << dendl
;
1621 // fake it for snap lookup
1622 vinodeno_t vino
= ist
.vino
;
1623 vino
.snapid
= CEPH_SNAPDIR
;
1624 ceph_assert(inode_map
.count(vino
));
1625 diri
= inode_map
[vino
];
1627 string dname
= request
->path
.last_dentry();
1630 dlease
.duration_ms
= 0;
1633 Dir
*dir
= diri
->open_dir();
1634 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
);
1636 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1637 Dentry
*dn
= diri
->dir
->dentries
[dname
];
1639 unlink(dn
, true, true); // keep dir, dentry
1645 if (op
== CEPH_MDS_OP_READDIR
||
1646 op
== CEPH_MDS_OP_LSSNAP
) {
1647 insert_readdir_results(request
,
1651 } else if (op
== CEPH_MDS_OP_LOOKUPNAME
) {
1652 // hack: return parent inode instead
1654 } else if (op
== CEPH_MDS_OP_READDIR_SNAPDIFF
) {
1655 // provide both request's inode (aka snapA) and traced one (snapB)
1656 // to properly match snapdiff results
1657 insert_readdir_results(request
,
1663 if (request
->dentry() == NULL
&& in
!= request
->inode()) {
1664 // pin the target inode if its parent dentry is not pinned
1665 request
->set_other_inode(in
);
1670 put_snap_realm(realm
);
1672 request
->target
= in
;
1678 mds_rank_t
Client::choose_target_mds(MetaRequest
*req
, Inode
** phash_diri
)
1680 mds_rank_t mds
= MDS_RANK_NONE
;
1682 bool is_hash
= false;
1688 if (req
->resend_mds
>= 0) {
1689 mds
= req
->resend_mds
;
1690 req
->resend_mds
= -1;
1691 ldout(cct
, 10) << __func__
<< " resend_mds specified as mds." << mds
<< dendl
;
1695 if (cct
->_conf
->client_use_random_mds
)
1701 ldout(cct
, 20) << __func__
<< " starting with req->inode " << *in
<< dendl
;
1702 if (req
->path
.depth()) {
1703 hash
= in
->hash_dentry_name(req
->path
[0]);
1704 ldout(cct
, 20) << __func__
<< " inode dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1705 << " on " << req
->path
[0]
1706 << " => " << hash
<< dendl
;
1711 in
= de
->inode
.get();
1712 ldout(cct
, 20) << __func__
<< " starting with req->dentry inode " << *in
<< dendl
;
1714 in
= de
->dir
->parent_inode
;
1715 hash
= in
->hash_dentry_name(de
->name
);
1716 ldout(cct
, 20) << __func__
<< " dentry dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1717 << " on " << de
->name
1718 << " => " << hash
<< dendl
;
1723 if (in
->snapid
!= CEPH_NOSNAP
) {
1724 ldout(cct
, 10) << __func__
<< " " << *in
<< " is snapped, using nonsnap parent" << dendl
;
1725 while (in
->snapid
!= CEPH_NOSNAP
) {
1726 if (in
->snapid
== CEPH_SNAPDIR
)
1727 in
= in
->snapdir_parent
.get();
1728 else if (!in
->dentries
.empty())
1729 /* In most cases there will only be one dentry, so getting it
1730 * will be the correct action. If there are multiple hard links,
1731 * I think the MDS should be able to redirect as needed*/
1732 in
= in
->get_first_parent()->dir
->parent_inode
;
1734 ldout(cct
, 10) << __func__
<< "got unlinked inode, can't look at parent" << dendl
;
1741 ldout(cct
, 20) << __func__
<< " " << *in
<< " is_hash=" << is_hash
1742 << " hash=" << hash
<< dendl
;
1744 if (req
->get_op() == CEPH_MDS_OP_GETATTR
)
1745 issued
= req
->inode()->caps_issued();
1747 if (is_hash
&& S_ISDIR(in
->mode
) && (!in
->fragmap
.empty() || !in
->frag_repmap
.empty())) {
1748 frag_t fg
= in
->dirfragtree
[hash
];
1749 if (!req
->auth_is_best(issued
)) {
1750 auto repmapit
= in
->frag_repmap
.find(fg
);
1751 if (repmapit
!= in
->frag_repmap
.end()) {
1752 auto& repmap
= repmapit
->second
;
1753 auto r
= ceph::util::generate_random_number
<uint64_t>(0, repmap
.size()-1);
1756 } else if (in
->fragmap
.count(fg
)) {
1757 mds
= in
->fragmap
[fg
];
1760 } else if (in
->auth_cap
) {
1761 req
->send_to_auth
= true;
1762 mds
= in
->auth_cap
->session
->mds_num
;
1765 ldout(cct
, 10) << __func__
<< " from dirfragtree hash" << dendl
;
1770 if (in
->auth_cap
&& req
->auth_is_best(issued
)) {
1771 mds
= in
->auth_cap
->session
->mds_num
;
1772 } else if (!in
->caps
.empty()) {
1773 mds
= in
->caps
.begin()->second
.session
->mds_num
;
1777 ldout(cct
, 10) << __func__
<< " from caps on inode " << *in
<< dendl
;
1784 mds
= _get_random_up_mds();
1785 ldout(cct
, 10) << "did not get mds through better means, so chose random mds " << mds
<< dendl
;
1789 ldout(cct
, 20) << "mds is " << mds
<< dendl
;
1793 void Client::connect_mds_targets(mds_rank_t mds
)
1795 ldout(cct
, 10) << __func__
<< " for mds." << mds
<< dendl
;
1796 ceph_assert(mds_sessions
.count(mds
));
1797 const MDSMap::mds_info_t
& info
= mdsmap
->get_mds_info(mds
);
1798 for (const auto &rank
: info
.export_targets
) {
1799 if (mds_sessions
.count(rank
) == 0 &&
1800 mdsmap
->is_clientreplay_or_active_or_stopping(rank
)) {
1801 ldout(cct
, 10) << "check_mds_sessions opening mds." << mds
1802 << " export target mds." << rank
<< dendl
;
1804 auto session
= _get_or_open_mds_session(rank
);
1805 if (session
->state
== MetaSession::STATE_OPENING
||
1806 session
->state
== MetaSession::STATE_OPEN
)
1809 _open_mds_session(rank
);
1814 void Client::dump_mds_sessions(Formatter
*f
, bool cap_dump
)
1816 f
->dump_int("id", get_nodeid().v
);
1817 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr_legacy());
1818 f
->dump_object("inst", inst
);
1819 f
->dump_stream("inst_str") << inst
;
1820 f
->dump_stream("addr_str") << inst
.addr
;
1821 f
->open_array_section("sessions");
1822 for (const auto &p
: mds_sessions
) {
1823 f
->open_object_section("session");
1824 p
.second
->dump(f
, cap_dump
);
1828 f
->dump_int("mdsmap_epoch", mdsmap
->get_epoch());
1831 void Client::dump_mds_requests(Formatter
*f
)
1833 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
1834 p
!= mds_requests
.end();
1836 f
->open_object_section("request");
1842 int Client::verify_reply_trace(int r
, MetaSession
*session
,
1843 MetaRequest
*request
, const MConstRef
<MClientReply
>& reply
,
1844 InodeRef
*ptarget
, bool *pcreated
,
1845 const UserPerm
& perms
)
1847 // check whether this request actually did the create, and set created flag
1848 bufferlist extra_bl
;
1849 inodeno_t created_ino
;
1850 bool got_created_ino
= false;
1851 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
;
1853 extra_bl
= reply
->get_extra_bl();
1854 if (extra_bl
.length() >= 8) {
1855 if (session
->mds_features
.test(CEPHFS_FEATURE_DELEG_INO
)) {
1856 struct openc_response_t ocres
;
1858 decode(ocres
, extra_bl
);
1859 created_ino
= ocres
.created_ino
;
1861 * The userland cephfs client doesn't have a way to do an async create
1862 * (yet), so just discard delegated_inos for now. Eventually we should
1863 * store them and use them in create calls, even if they are synchronous,
1864 * if only for testing purposes.
1866 ldout(cct
, 10) << "delegated_inos: " << ocres
.delegated_inos
<< dendl
;
1868 // u64 containing number of created ino
1869 decode(created_ino
, extra_bl
);
1871 ldout(cct
, 10) << "make_request created ino " << created_ino
<< dendl
;
1872 got_created_ino
= true;
1876 *pcreated
= got_created_ino
;
1878 if (request
->target
) {
1879 *ptarget
= request
->target
;
1880 ldout(cct
, 20) << "make_request target is " << *ptarget
->get() << dendl
;
1882 if (got_created_ino
&& (p
= inode_map
.find(vinodeno_t(created_ino
, CEPH_NOSNAP
))) != inode_map
.end()) {
1883 (*ptarget
) = p
->second
;
1884 ldout(cct
, 20) << "make_request created, target is " << *ptarget
->get() << dendl
;
1886 // we got a traceless reply, and need to look up what we just
1887 // created. for now, do this by name. someday, do this by the
1888 // ino... which we know! FIXME.
1890 Dentry
*d
= request
->dentry();
1893 ldout(cct
, 10) << "make_request got traceless reply, looking up #"
1894 << d
->dir
->parent_inode
->ino
<< "/" << d
->name
1895 << " got_ino " << got_created_ino
1896 << " ino " << created_ino
1898 r
= _do_lookup(d
->dir
->parent_inode
, d
->name
, request
->regetattr_mask
,
1901 // if the dentry is not linked, just do our best. see #5021.
1902 ceph_abort_msg("how did this happen? i want logs!");
1905 Inode
*in
= request
->inode();
1906 ldout(cct
, 10) << "make_request got traceless reply, forcing getattr on #"
1907 << in
->ino
<< dendl
;
1908 r
= _getattr(in
, request
->regetattr_mask
, perms
, true);
1912 // verify ino returned in reply and trace_dist are the same
1913 if (got_created_ino
&&
1914 created_ino
.val
!= target
->ino
.val
) {
1915 ldout(cct
, 5) << "create got ino " << created_ino
<< " but then failed on lookup; EINTR?" << dendl
;
1919 ptarget
->swap(target
);
1931 * Blocking helper to make an MDS request.
1933 * If the ptarget flag is set, behavior changes slightly: the caller
1934 * expects to get a pointer to the inode we are creating or operating
1935 * on. As a result, we will follow up any traceless mutation reply
1936 * with a getattr or lookup to transparently handle a traceless reply
1937 * from the MDS (as when the MDS restarts and the client has to replay
1940 * @param request the MetaRequest to execute
1941 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1942 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1943 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1944 * @param use_mds [optional] prefer a specific mds (-1 for default)
1945 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1947 int Client::make_request(MetaRequest
*request
,
1948 const UserPerm
& perms
,
1949 InodeRef
*ptarget
, bool *pcreated
,
1952 size_t feature_needed
)
1956 // assign a unique tid
1957 ceph_tid_t tid
= ++last_tid
;
1958 request
->set_tid(tid
);
1961 request
->op_stamp
= ceph_clock_now();
1962 request
->created
= ceph::coarse_mono_clock::now();
1965 mds_requests
[tid
] = request
->get();
1966 if (oldest_tid
== 0 && request
->get_op() != CEPH_MDS_OP_SETFILELOCK
)
1969 request
->set_caller_perms(perms
);
1971 if (cct
->_conf
->client_inject_fixed_oldest_tid
) {
1972 ldout(cct
, 20) << __func__
<< " injecting fixed oldest_client_tid(1)" << dendl
;
1973 request
->set_oldest_client_tid(1);
1975 request
->set_oldest_client_tid(oldest_tid
);
1980 request
->resend_mds
= use_mds
;
1982 MetaSessionRef session
= NULL
;
1984 if (request
->aborted())
1988 request
->abort(-CEPHFS_EBLOCKLISTED
);
1993 ceph::condition_variable caller_cond
;
1994 request
->caller_cond
= &caller_cond
;
1997 Inode
*hash_diri
= NULL
;
1998 mds_rank_t mds
= choose_target_mds(request
, &hash_diri
);
1999 int mds_state
= (mds
== MDS_RANK_NONE
) ? MDSMap::STATE_NULL
: mdsmap
->get_state(mds
);
2000 if (mds_state
!= MDSMap::STATE_ACTIVE
&& mds_state
!= MDSMap::STATE_STOPPING
) {
2001 if (mds_state
== MDSMap::STATE_NULL
&& mds
>= mdsmap
->get_max_mds()) {
2003 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, remove it from fragmap" << dendl
;
2004 _fragmap_remove_stopped_mds(hash_diri
, mds
);
2006 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, trying a random mds" << dendl
;
2007 request
->resend_mds
= _get_random_up_mds();
2010 ldout(cct
, 10) << " target mds." << mds
<< " not active, waiting for new mdsmap" << dendl
;
2011 wait_on_list(waiting_for_mdsmap
);
2017 if (!have_open_session(mds
)) {
2018 session
= _get_or_open_mds_session(mds
);
2019 if (session
->state
== MetaSession::STATE_REJECTED
) {
2020 request
->abort(-CEPHFS_EPERM
);
2024 if (session
->state
== MetaSession::STATE_OPENING
) {
2025 ldout(cct
, 10) << "waiting for session to mds." << mds
<< " to open" << dendl
;
2026 wait_on_context_list(session
->waiting_for_open
);
2030 if (!have_open_session(mds
))
2033 session
= mds_sessions
.at(mds
);
2036 if (feature_needed
!= ULONG_MAX
&& !session
->mds_features
.test(feature_needed
)) {
2037 request
->abort(-CEPHFS_EOPNOTSUPP
);
2042 send_request(request
, session
.get());
2045 ldout(cct
, 20) << "awaiting reply|forward|kick on " << &caller_cond
<< dendl
;
2046 request
->kick
= false;
2047 std::unique_lock l
{client_lock
, std::adopt_lock
};
2048 caller_cond
.wait(l
, [request
] {
2049 return (request
->reply
|| // reply
2050 request
->resend_mds
>= 0 || // forward
2054 request
->caller_cond
= nullptr;
2056 // did we get a reply?
2061 if (!request
->reply
) {
2062 ceph_assert(request
->aborted());
2063 ceph_assert(!request
->got_unsafe
);
2064 r
= request
->get_abort_code();
2065 request
->item
.remove_myself();
2066 unregister_request(request
);
2067 put_request(request
);
2072 auto reply
= std::move(request
->reply
);
2073 r
= reply
->get_result();
2075 request
->success
= true;
2077 // kick dispatcher (we've got it!)
2078 ceph_assert(request
->dispatch_cond
);
2079 request
->dispatch_cond
->notify_all();
2080 ldout(cct
, 20) << "sendrecv kickback on tid " << tid
<< " " << request
->dispatch_cond
<< dendl
;
2081 request
->dispatch_cond
= 0;
2083 if (r
>= 0 && ptarget
)
2084 r
= verify_reply_trace(r
, session
.get(), request
, reply
, ptarget
, pcreated
, perms
);
2087 *pdirbl
= reply
->get_extra_bl();
2090 utime_t lat
= ceph_clock_now();
2091 lat
-= request
->sent_stamp
;
2092 ldout(cct
, 20) << "lat " << lat
<< dendl
;
2094 ++nr_metadata_request
;
2095 update_io_stat_metadata(lat
);
2097 put_request(request
);
2101 void Client::unregister_request(MetaRequest
*req
)
2103 mds_requests
.erase(req
->tid
);
2104 if (req
->tid
== oldest_tid
) {
2105 map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.upper_bound(oldest_tid
);
2107 if (p
== mds_requests
.end()) {
2111 if (p
->second
->get_op() != CEPH_MDS_OP_SETFILELOCK
) {
2112 oldest_tid
= p
->first
;
2121 void Client::put_request(MetaRequest
*request
)
2123 if (request
->_put()) {
2125 if (request
->success
)
2126 op
= request
->get_op();
2128 request
->take_other_inode(&other_in
);
2132 (op
== CEPH_MDS_OP_RMDIR
||
2133 op
== CEPH_MDS_OP_RENAME
||
2134 op
== CEPH_MDS_OP_RMSNAP
)) {
2135 _try_to_trim_inode(other_in
.get(), false);
2140 int Client::encode_inode_release(Inode
*in
, MetaRequest
*req
,
2141 mds_rank_t mds
, int drop
,
2142 int unless
, int force
)
2144 ldout(cct
, 20) << __func__
<< " enter(in:" << *in
<< ", req:" << req
2145 << " mds:" << mds
<< ", drop:" << ccap_string(drop
) << ", unless:" << ccap_string(unless
)
2146 << ", force:" << force
<< ")" << dendl
;
2148 auto it
= in
->caps
.find(mds
);
2149 if (it
!= in
->caps
.end()) {
2150 Cap
&cap
= it
->second
;
2151 drop
&= ~(in
->dirty_caps
| get_caps_used(in
));
2152 if ((drop
& cap
.issued
) &&
2153 !(unless
& cap
.issued
)) {
2154 ldout(cct
, 25) << "dropping caps " << ccap_string(drop
) << dendl
;
2155 cap
.issued
&= ~drop
;
2156 cap
.implemented
&= ~drop
;
2162 cap
.wanted
= in
->caps_wanted();
2163 if (&cap
== in
->auth_cap
&&
2164 !(cap
.wanted
& CEPH_CAP_ANY_FILE_WR
)) {
2165 in
->requested_max_size
= 0;
2166 ldout(cct
, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl
;
2168 ceph_mds_request_release rel
;
2170 rel
.cap_id
= cap
.cap_id
;
2172 rel
.issue_seq
= cap
.issue_seq
;
2173 rel
.mseq
= cap
.mseq
;
2174 rel
.caps
= cap
.implemented
;
2175 rel
.wanted
= cap
.wanted
;
2178 req
->cap_releases
.push_back(MClientRequest::Release(rel
,""));
2181 ldout(cct
, 25) << __func__
<< " exit(in:" << *in
<< ") released:"
2182 << released
<< dendl
;
2186 void Client::encode_dentry_release(Dentry
*dn
, MetaRequest
*req
,
2187 mds_rank_t mds
, int drop
, int unless
)
2189 ldout(cct
, 20) << __func__
<< " enter(dn:"
2190 << dn
<< ")" << dendl
;
2193 released
= encode_inode_release(dn
->dir
->parent_inode
, req
,
2194 mds
, drop
, unless
, 1);
2195 if (released
&& dn
->lease_mds
== mds
) {
2196 ldout(cct
, 25) << "preemptively releasing dn to mds" << dendl
;
2197 auto& rel
= req
->cap_releases
.back();
2198 rel
.item
.dname_len
= dn
->name
.length();
2199 rel
.item
.dname_seq
= dn
->lease_seq
;
2200 rel
.dname
= dn
->name
;
2203 ldout(cct
, 25) << __func__
<< " exit(dn:"
2204 << dn
<< ")" << dendl
;
2209 * This requires the MClientRequest *request member to be set.
2210 * It will error out horribly without one.
2211 * Additionally, if you set any *drop member, you'd better have
2212 * set the corresponding dentry!
2214 void Client::encode_cap_releases(MetaRequest
*req
, mds_rank_t mds
)
2216 ldout(cct
, 20) << __func__
<< " enter (req: "
2217 << req
<< ", mds: " << mds
<< ")" << dendl
;
2218 if (req
->inode_drop
&& req
->inode())
2219 encode_inode_release(req
->inode(), req
,
2220 mds
, req
->inode_drop
,
2223 if (req
->old_inode_drop
&& req
->old_inode())
2224 encode_inode_release(req
->old_inode(), req
,
2225 mds
, req
->old_inode_drop
,
2226 req
->old_inode_unless
);
2227 if (req
->other_inode_drop
&& req
->other_inode())
2228 encode_inode_release(req
->other_inode(), req
,
2229 mds
, req
->other_inode_drop
,
2230 req
->other_inode_unless
);
2232 if (req
->dentry_drop
&& req
->dentry())
2233 encode_dentry_release(req
->dentry(), req
,
2234 mds
, req
->dentry_drop
,
2235 req
->dentry_unless
);
2237 if (req
->old_dentry_drop
&& req
->old_dentry())
2238 encode_dentry_release(req
->old_dentry(), req
,
2239 mds
, req
->old_dentry_drop
,
2240 req
->old_dentry_unless
);
2241 ldout(cct
, 25) << __func__
<< " exit (req: "
2242 << req
<< ", mds " << mds
<<dendl
;
2245 bool Client::have_open_session(mds_rank_t mds
)
2247 const auto &it
= mds_sessions
.find(mds
);
2248 return it
!= mds_sessions
.end() &&
2249 (it
->second
->state
== MetaSession::STATE_OPEN
||
2250 it
->second
->state
== MetaSession::STATE_STALE
);
2253 MetaSessionRef
Client::_get_mds_session(mds_rank_t mds
, Connection
*con
)
2255 const auto &it
= mds_sessions
.find(mds
);
2256 if (it
== mds_sessions
.end() || it
->second
->con
!= con
) {
2263 MetaSessionRef
Client::_get_or_open_mds_session(mds_rank_t mds
)
2265 auto it
= mds_sessions
.find(mds
);
2266 return it
== mds_sessions
.end() ? _open_mds_session(mds
) : it
->second
;
2270 * Populate a map of strings with client-identifying metadata,
2271 * such as the hostname. Call this once at initialization.
2273 void Client::populate_metadata(const std::string
&mount_root
)
2277 // TODO: move this to compat.h
2279 DWORD hostname_sz
= 64;
2280 GetComputerNameA(hostname
, &hostname_sz
);
2281 metadata
["hostname"] = hostname
;
2286 metadata
["hostname"] = u
.nodename
;
2287 ldout(cct
, 20) << __func__
<< " read hostname '" << u
.nodename
<< "'" << dendl
;
2289 ldout(cct
, 1) << __func__
<< " failed to read hostname (" << cpp_strerror(r
) << ")" << dendl
;
2293 metadata
["pid"] = stringify(getpid());
2295 // Ceph entity id (the '0' in "client.0")
2296 metadata
["entity_id"] = cct
->_conf
->name
.get_id();
2298 // Our mount position
2299 if (!mount_root
.empty()) {
2300 metadata
["root"] = mount_root
;
2304 metadata
["ceph_version"] = pretty_version_to_str();
2305 metadata
["ceph_sha1"] = git_version_to_str();
2307 // Apply any metadata from the user's configured overrides
2308 std::vector
<std::string
> tokens
;
2309 get_str_vec(cct
->_conf
->client_metadata
, ",", tokens
);
2310 for (const auto &i
: tokens
) {
2311 auto eqpos
= i
.find("=");
2312 // Throw out anything that isn't of the form "<str>=<str>"
2313 if (eqpos
== 0 || eqpos
== std::string::npos
|| eqpos
== i
.size()) {
2314 lderr(cct
) << "Invalid metadata keyval pair: '" << i
<< "'" << dendl
;
2317 metadata
[i
.substr(0, eqpos
)] = i
.substr(eqpos
+ 1);
2322 * Optionally add or override client metadata fields.
2324 void Client::update_metadata(std::string
const &k
, std::string
const &v
)
2326 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
2327 ceph_assert(iref_reader
.is_state_satisfied());
2329 std::scoped_lock
l(client_lock
);
2331 auto it
= metadata
.find(k
);
2332 if (it
!= metadata
.end()) {
2333 ldout(cct
, 1) << __func__
<< " warning, overriding metadata field '" << k
2334 << "' from '" << it
->second
<< "' to '" << v
<< "'" << dendl
;
2340 MetaSessionRef
Client::_open_mds_session(mds_rank_t mds
)
2342 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
2343 auto addrs
= mdsmap
->get_addrs(mds
);
2344 auto em
= mds_sessions
.emplace(std::piecewise_construct
,
2345 std::forward_as_tuple(mds
),
2346 std::forward_as_tuple(new MetaSession(mds
, messenger
->connect_to_mds(addrs
), addrs
)));
2347 ceph_assert(em
.second
); /* not already present */
2348 auto session
= em
.first
->second
;
2350 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REQUEST_OPEN
);
2351 m
->metadata
= metadata
;
2352 m
->supported_features
= feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED
);
2353 m
->metric_spec
= feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL
);
2354 session
->con
->send_message2(std::move(m
));
2358 void Client::_close_mds_session(MetaSession
*s
)
2360 ldout(cct
, 2) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2361 s
->state
= MetaSession::STATE_CLOSING
;
2362 s
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
2365 void Client::_closed_mds_session(MetaSession
*s
, int err
, bool rejected
)
2367 ldout(cct
, 5) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2368 if (rejected
&& s
->state
!= MetaSession::STATE_CLOSING
)
2369 s
->state
= MetaSession::STATE_REJECTED
;
2371 s
->state
= MetaSession::STATE_CLOSED
;
2372 s
->con
->mark_down();
2373 signal_context_list(s
->waiting_for_open
);
2374 mount_cond
.notify_all();
2375 remove_session_caps(s
, err
);
2376 kick_requests_closed(s
);
2377 mds_ranks_closing
.erase(s
->mds_num
);
2378 if (s
->state
== MetaSession::STATE_CLOSED
)
2379 mds_sessions
.erase(s
->mds_num
);
2382 static void reinit_mds_features(MetaSession
*session
,
2383 const MConstRef
<MClientSession
>& m
) {
2384 session
->mds_features
= std::move(m
->supported_features
);
2385 session
->mds_metric_flags
= std::move(m
->metric_spec
.metric_flags
);
2388 void Client::handle_client_session(const MConstRef
<MClientSession
>& m
)
2390 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2391 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << from
<< dendl
;
2393 std::scoped_lock
cl(client_lock
);
2394 auto session
= _get_mds_session(from
, m
->get_connection().get());
2396 ldout(cct
, 10) << " discarding session message from sessionless mds " << m
->get_source_inst() << dendl
;
2400 switch (m
->get_op()) {
2401 case CEPH_SESSION_OPEN
:
2403 if (session
->state
== MetaSession::STATE_OPEN
) {
2404 ldout(cct
, 10) << "mds." << from
<< " already opened, ignore it"
2406 // The MDS could send a client_session(open) message even when
2407 // the session state is STATE_OPEN. Normally, its fine to
2408 // ignore this message, but, if the MDS sent this message just
2409 // after it got upgraded, the MDS feature bits could differ
2410 // than the one before the upgrade - so, refresh the feature
2411 // bits the client holds.
2412 reinit_mds_features(session
.get(), m
);
2416 * The connection maybe broken and the session in client side
2417 * has been reinitialized, need to update the seq anyway.
2419 if (!session
->seq
&& m
->get_seq())
2420 session
->seq
= m
->get_seq();
2422 reinit_mds_features(session
.get(), m
);
2424 renew_caps(session
.get());
2425 session
->state
= MetaSession::STATE_OPEN
;
2426 if (is_unmounting())
2427 mount_cond
.notify_all();
2429 connect_mds_targets(from
);
2430 signal_context_list(session
->waiting_for_open
);
2434 case CEPH_SESSION_CLOSE
:
2435 _closed_mds_session(session
.get());
2438 case CEPH_SESSION_RENEWCAPS
:
2439 if (session
->cap_renew_seq
== m
->get_seq()) {
2440 bool was_stale
= ceph_clock_now() >= session
->cap_ttl
;
2442 session
->last_cap_renew_request
+ mdsmap
->get_session_timeout();
2444 wake_up_session_caps(session
.get(), false);
2448 case CEPH_SESSION_STALE
:
2449 // invalidate session caps/leases
2451 session
->cap_ttl
= ceph_clock_now();
2452 session
->cap_ttl
-= 1;
2453 renew_caps(session
.get());
2456 case CEPH_SESSION_RECALL_STATE
:
2458 * Call the renew caps and flush cap releases just before
2459 * triming the caps in case the tick() won't get a chance
2460 * to run them, which could cause the client to be blocklisted
2461 * and MDS daemons trying to recall the caps again and
2464 * In most cases it will do nothing, and the new cap releases
2465 * added by trim_caps() followed will be deferred flushing
2468 renew_and_flush_cap_releases();
2469 trim_caps(session
.get(), m
->get_max_caps());
2472 case CEPH_SESSION_FLUSHMSG
:
2473 /* flush cap release */
2474 if (auto& m
= session
->release
; m
) {
2475 session
->con
->send_message2(std::move(m
));
2477 session
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_FLUSHMSG_ACK
, m
->get_seq()));
2480 case CEPH_SESSION_FORCE_RO
:
2481 force_session_readonly(session
.get());
2484 case CEPH_SESSION_REJECT
:
2486 std::string_view error_str
;
2487 auto it
= m
->metadata
.find("error_string");
2488 if (it
!= m
->metadata
.end())
2489 error_str
= it
->second
;
2491 error_str
= "unknown error";
2492 lderr(cct
) << "mds." << from
<< " rejected us (" << error_str
<< ")" << dendl
;
2494 _closed_mds_session(session
.get(), -CEPHFS_EPERM
, true);
2503 bool Client::_any_stale_sessions() const
2505 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
2507 for (const auto &p
: mds_sessions
) {
2508 if (p
.second
->state
== MetaSession::STATE_STALE
) {
2516 void Client::_kick_stale_sessions()
2518 ldout(cct
, 1) << __func__
<< dendl
;
2520 for (auto it
= mds_sessions
.begin(); it
!= mds_sessions
.end(); ) {
2521 auto s
= it
->second
;
2522 if (s
->state
== MetaSession::STATE_REJECTED
) {
2523 mds_sessions
.erase(it
->first
);
2526 if (s
->state
== MetaSession::STATE_STALE
)
2527 _closed_mds_session(s
.get());
2531 void Client::send_request(MetaRequest
*request
, MetaSession
*session
,
2532 bool drop_cap_releases
)
2535 mds_rank_t mds
= session
->mds_num
;
2536 ldout(cct
, 10) << __func__
<< " rebuilding request " << request
->get_tid()
2537 << " for mds." << mds
<< dendl
;
2538 auto r
= build_client_request(request
, mds
);
2542 if (request
->dentry()) {
2543 r
->set_dentry_wanted();
2545 if (request
->got_unsafe
) {
2546 r
->set_replayed_op();
2547 if (request
->target
)
2548 r
->head
.ino
= request
->target
->ino
;
2550 encode_cap_releases(request
, mds
);
2551 if (drop_cap_releases
) // we haven't send cap reconnect yet, drop cap releases
2552 request
->cap_releases
.clear();
2554 r
->releases
.swap(request
->cap_releases
);
2556 r
->set_mdsmap_epoch(mdsmap
->get_epoch());
2557 if (r
->head
.op
== CEPH_MDS_OP_SETXATTR
) {
2558 objecter
->with_osdmap([r
](const OSDMap
& o
) {
2559 r
->set_osdmap_epoch(o
.get_epoch());
2563 if (request
->mds
== -1) {
2564 request
->sent_stamp
= ceph_clock_now();
2565 ldout(cct
, 20) << __func__
<< " set sent_stamp to " << request
->sent_stamp
<< dendl
;
2569 Inode
*in
= request
->inode();
2571 auto it
= in
->caps
.find(mds
);
2572 if (it
!= in
->caps
.end()) {
2573 request
->sent_on_mseq
= it
->second
.mseq
;
2577 session
->requests
.push_back(&request
->item
);
2579 ldout(cct
, 10) << __func__
<< " " << *r
<< " to mds." << mds
<< dendl
;
2580 session
->con
->send_message2(std::move(r
));
2583 ref_t
<MClientRequest
> Client::build_client_request(MetaRequest
*request
, mds_rank_t mds
)
2585 auto session
= mds_sessions
.at(mds
);
2586 bool old_version
= !session
->mds_features
.test(CEPHFS_FEATURE_32BITS_RETRY_FWD
);
2589 * Avoid inifinite retrying after overflow.
2591 * The client will increase the retry count and if the MDS is
2592 * old version, so we limit to retry at most 256 times.
2594 if (request
->retry_attempt
) {
2595 int old_max_retry
= sizeof(((struct ceph_mds_request_head
*)0)->num_retry
);
2596 old_max_retry
= 1 << (old_max_retry
* CHAR_BIT
);
2597 if ((old_version
&& request
->retry_attempt
>= old_max_retry
) ||
2598 (uint32_t)request
->retry_attempt
>= UINT32_MAX
) {
2599 request
->abort(-CEPHFS_EMULTIHOP
);
2600 request
->caller_cond
->notify_all();
2601 ldout(cct
, 1) << __func__
<< " request tid " << request
->tid
2602 << " retry seq overflow" << ", abort it" << dendl
;
2607 auto req
= make_message
<MClientRequest
>(request
->get_op(), session
->mds_features
);
2608 req
->set_tid(request
->tid
);
2609 req
->set_stamp(request
->op_stamp
);
2610 memcpy(&req
->head
, &request
->head
, sizeof(ceph_mds_request_head
));
2612 // if the filepath's haven't been set, set them!
2613 if (request
->path
.empty()) {
2614 Inode
*in
= request
->inode();
2615 Dentry
*de
= request
->dentry();
2617 in
->make_nosnap_relative_path(request
->path
);
2620 de
->inode
->make_nosnap_relative_path(request
->path
);
2622 de
->dir
->parent_inode
->make_nosnap_relative_path(request
->path
);
2623 request
->path
.push_dentry(de
->name
);
2625 else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2626 << " No path, inode, or appropriately-endowed dentry given!"
2628 } else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2629 << " No path, inode, or dentry given!"
2632 req
->set_filepath(request
->get_filepath());
2633 req
->set_filepath2(request
->get_filepath2());
2634 req
->set_alternate_name(request
->alternate_name
);
2635 req
->set_data(request
->data
);
2636 req
->fscrypt_auth
= request
->fscrypt_auth
;
2637 req
->fscrypt_file
= request
->fscrypt_file
;
2638 req
->set_retry_attempt(request
->retry_attempt
++);
2639 req
->head
.ext_num_fwd
= request
->num_fwd
;
2641 int gid_count
= request
->perms
.get_gids(&_gids
);
2642 req
->set_gid_list(gid_count
, _gids
);
2648 void Client::handle_client_request_forward(const MConstRef
<MClientRequestForward
>& fwd
)
2650 mds_rank_t mds
= mds_rank_t(fwd
->get_source().num());
2652 std::scoped_lock
cl(client_lock
);
2653 auto session
= _get_mds_session(mds
, fwd
->get_connection().get());
2657 ceph_tid_t tid
= fwd
->get_tid();
2659 if (mds_requests
.count(tid
) == 0) {
2660 ldout(cct
, 10) << __func__
<< " no pending request on tid " << tid
<< dendl
;
2664 MetaRequest
*request
= mds_requests
[tid
];
2665 ceph_assert(request
);
2668 * Avoid inifinite retrying after overflow.
2670 * The MDS will increase the fwd count and in client side
2671 * if the num_fwd is less than the one saved in request
2672 * that means the MDS is an old version and overflowed of
2675 auto num_fwd
= fwd
->get_num_fwd();
2676 if (num_fwd
<= request
->num_fwd
|| (uint32_t)num_fwd
>= UINT32_MAX
) {
2677 request
->abort(-CEPHFS_EMULTIHOP
);
2678 request
->caller_cond
->notify_all();
2679 ldout(cct
, 0) << __func__
<< " request tid " << tid
<< " new num_fwd "
2680 << num_fwd
<< " old num_fwd " << request
->num_fwd
<< ", fwd seq overflow"
2681 << ", abort it" << dendl
;
2685 // reset retry counter
2686 request
->retry_attempt
= 0;
2688 // request not forwarded, or dest mds has no session.
2690 ldout(cct
, 10) << __func__
<< " tid " << tid
2691 << " fwd " << fwd
->get_num_fwd()
2692 << " to mds." << fwd
->get_dest_mds()
2693 << ", resending to " << fwd
->get_dest_mds()
2697 request
->item
.remove_myself();
2698 request
->num_fwd
= num_fwd
;
2699 request
->resend_mds
= fwd
->get_dest_mds();
2700 request
->caller_cond
->notify_all();
2703 bool Client::is_dir_operation(MetaRequest
*req
)
2705 int op
= req
->get_op();
2706 if (op
== CEPH_MDS_OP_MKNOD
|| op
== CEPH_MDS_OP_LINK
||
2707 op
== CEPH_MDS_OP_UNLINK
|| op
== CEPH_MDS_OP_RENAME
||
2708 op
== CEPH_MDS_OP_MKDIR
|| op
== CEPH_MDS_OP_RMDIR
||
2709 op
== CEPH_MDS_OP_SYMLINK
|| op
== CEPH_MDS_OP_CREATE
)
2714 void Client::handle_client_reply(const MConstRef
<MClientReply
>& reply
)
2716 mds_rank_t mds_num
= mds_rank_t(reply
->get_source().num());
2718 std::scoped_lock
cl(client_lock
);
2719 auto session
= _get_mds_session(mds_num
, reply
->get_connection().get());
2724 ceph_tid_t tid
= reply
->get_tid();
2725 bool is_safe
= reply
->is_safe();
2727 if (mds_requests
.count(tid
) == 0) {
2728 lderr(cct
) << __func__
<< " no pending request on tid " << tid
2729 << " safe is:" << is_safe
<< dendl
;
2732 MetaRequest
*request
= mds_requests
.at(tid
);
2734 ldout(cct
, 20) << __func__
<< " got a reply. Safe:" << is_safe
2735 << " tid " << tid
<< dendl
;
2737 // correct sessions ?
2738 if (request
->mds
!= mds_num
) {
2739 ldout(cct
, 0) << "got a stale reply from mds." << mds_num
2740 << " instead of mds." << request
->mds
<< dendl
;
2744 if (request
->got_unsafe
&& !is_safe
) {
2745 //duplicate response
2746 ldout(cct
, 0) << "got a duplicate reply on tid " << tid
<< " from mds "
2747 << mds_num
<< " safe:" << is_safe
<< dendl
;
2751 ceph_assert(!request
->reply
);
2752 request
->reply
= reply
;
2753 insert_trace(request
, session
.get());
2755 // Handle unsafe reply
2757 request
->got_unsafe
= true;
2758 session
->unsafe_requests
.push_back(&request
->unsafe_item
);
2759 if (is_dir_operation(request
)) {
2760 Inode
*dir
= request
->inode();
2762 dir
->unsafe_ops
.push_back(&request
->unsafe_dir_item
);
2764 if (request
->target
) {
2765 InodeRef
&in
= request
->target
;
2766 in
->unsafe_ops
.push_back(&request
->unsafe_target_item
);
2770 // Only signal the caller once (on the first reply):
2771 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2772 if (!is_safe
|| !request
->got_unsafe
) {
2773 ceph::condition_variable cond
;
2774 request
->dispatch_cond
= &cond
;
2777 ldout(cct
, 20) << __func__
<< " signalling caller " << (void*)request
->caller_cond
<< dendl
;
2778 request
->caller_cond
->notify_all();
2780 // wake for kick back
2781 std::unique_lock l
{client_lock
, std::adopt_lock
};
2782 cond
.wait(l
, [tid
, request
, &cond
, this] {
2783 if (request
->dispatch_cond
) {
2784 ldout(cct
, 20) << "handle_client_reply awaiting kickback on tid "
2785 << tid
<< " " << &cond
<< dendl
;
2787 return !request
->dispatch_cond
;
2793 // the filesystem change is committed to disk
2794 // we're done, clean up
2795 if (request
->got_unsafe
) {
2796 request
->unsafe_item
.remove_myself();
2797 request
->unsafe_dir_item
.remove_myself();
2798 request
->unsafe_target_item
.remove_myself();
2799 signal_cond_list(request
->waitfor_safe
);
2801 request
->item
.remove_myself();
2802 unregister_request(request
);
2804 if (is_unmounting())
2805 mount_cond
.notify_all();
2808 void Client::_handle_full_flag(int64_t pool
)
2810 ldout(cct
, 1) << __func__
<< ": FULL: cancelling outstanding operations "
2811 << "on " << pool
<< dendl
;
2812 // Cancel all outstanding ops in this pool with -CEPHFS_ENOSPC: it is necessary
2813 // to do this rather than blocking, because otherwise when we fill up we
2814 // potentially lock caps forever on files with dirty pages, and we need
2815 // to be able to release those caps to the MDS so that it can delete files
2816 // and free up space.
2817 epoch_t cancelled_epoch
= objecter
->op_cancel_writes(-CEPHFS_ENOSPC
, pool
);
2819 // For all inodes with layouts in this pool and a pending flush write op
2820 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2821 // from ObjectCacher so that it doesn't re-issue the write in response to
2822 // the ENOSPC error.
2823 // Fortunately since we're cancelling everything in a given pool, we don't
2824 // need to know which ops belong to which ObjectSet, we can just blow all
2825 // the un-flushed cached data away and mark any dirty inodes' async_err
2826 // field with -CEPHFS_ENOSPC as long as we're sure all the ops we cancelled were
2827 // affecting this pool, and all the objectsets we're purging were also
2829 for (unordered_map
<vinodeno_t
,Inode
*>::iterator i
= inode_map
.begin();
2830 i
!= inode_map
.end(); ++i
)
2832 Inode
*inode
= i
->second
;
2833 if (inode
->oset
.dirty_or_tx
2834 && (pool
== -1 || inode
->layout
.pool_id
== pool
)) {
2835 ldout(cct
, 4) << __func__
<< ": FULL: inode 0x" << std::hex
<< i
->first
<< std::dec
2836 << " has dirty objects, purging and setting ENOSPC" << dendl
;
2837 objectcacher
->purge_set(&inode
->oset
);
2838 inode
->set_async_err(-CEPHFS_ENOSPC
);
2842 if (cancelled_epoch
!= (epoch_t
)-1) {
2843 set_cap_epoch_barrier(cancelled_epoch
);
2847 void Client::handle_osd_map(const MConstRef
<MOSDMap
>& m
)
2849 std::scoped_lock
cl(client_lock
);
2851 const auto myaddrs
= messenger
->get_myaddrs();
2852 bool new_blocklist
= objecter
->with_osdmap(
2853 [&](const OSDMap
& o
) {
2854 return o
.is_blocklisted(myaddrs
);
2857 if (new_blocklist
&& !blocklisted
) {
2858 auto epoch
= objecter
->with_osdmap([](const OSDMap
&o
){
2859 return o
.get_epoch();
2861 lderr(cct
) << "I was blocklisted at osd epoch " << epoch
<< dendl
;
2864 _abort_mds_sessions(-CEPHFS_EBLOCKLISTED
);
2866 // Since we know all our OSD ops will fail, cancel them all preemtively,
2867 // so that on an unhealthy cluster we can umount promptly even if e.g.
2868 // some PGs were inaccessible.
2869 objecter
->op_cancel_writes(-CEPHFS_EBLOCKLISTED
);
2874 // Handle case where we were blocklisted but no longer are
2875 blocklisted
= objecter
->with_osdmap([myaddrs
](const OSDMap
&o
){
2876 return o
.is_blocklisted(myaddrs
);});
2879 // Always subscribe to next osdmap for blocklisted client
2880 // until this client is not blocklisted.
2882 objecter
->maybe_request_map();
2885 if (objecter
->osdmap_full_flag()) {
2886 _handle_full_flag(-1);
2888 // Accumulate local list of full pools so that I can drop
2889 // the objecter lock before re-entering objecter in
2891 std::vector
<int64_t> full_pools
;
2893 objecter
->with_osdmap([&full_pools
](const OSDMap
&o
) {
2894 for (const auto& kv
: o
.get_pools()) {
2895 if (kv
.second
.has_flag(pg_pool_t::FLAG_FULL
)) {
2896 full_pools
.push_back(kv
.first
);
2901 for (auto p
: full_pools
)
2902 _handle_full_flag(p
);
2904 // Subscribe to subsequent maps to watch for the full flag going
2905 // away. For the global full flag objecter does this for us, but
2906 // it pays no attention to the per-pool full flag so in this branch
2907 // we do it ourselves.
2908 if (!full_pools
.empty()) {
2909 objecter
->maybe_request_map();
2915 // ------------------------
2916 // incoming messages
2919 bool Client::ms_dispatch2(const MessageRef
&m
)
2921 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
2922 if (!iref_reader
.is_state_satisfied()) {
2923 ldout(cct
, 10) << "inactive, discarding " << *m
<< dendl
;
2927 switch (m
->get_type()) {
2928 // mounting and mds sessions
2929 case CEPH_MSG_MDS_MAP
:
2930 handle_mds_map(ref_cast
<MMDSMap
>(m
));
2932 case CEPH_MSG_FS_MAP
:
2933 handle_fs_map(ref_cast
<MFSMap
>(m
));
2935 case CEPH_MSG_FS_MAP_USER
:
2936 handle_fs_map_user(ref_cast
<MFSMapUser
>(m
));
2938 case CEPH_MSG_CLIENT_SESSION
:
2939 handle_client_session(ref_cast
<MClientSession
>(m
));
2942 case CEPH_MSG_OSD_MAP
:
2943 handle_osd_map(ref_cast
<MOSDMap
>(m
));
2947 case CEPH_MSG_CLIENT_REQUEST_FORWARD
:
2948 handle_client_request_forward(ref_cast
<MClientRequestForward
>(m
));
2950 case CEPH_MSG_CLIENT_REPLY
:
2951 handle_client_reply(ref_cast
<MClientReply
>(m
));
2955 case CEPH_MSG_CLIENT_RECLAIM_REPLY
:
2956 handle_client_reclaim_reply(ref_cast
<MClientReclaimReply
>(m
));
2959 case CEPH_MSG_CLIENT_SNAP
:
2960 handle_snap(ref_cast
<MClientSnap
>(m
));
2962 case CEPH_MSG_CLIENT_CAPS
:
2963 handle_caps(ref_cast
<MClientCaps
>(m
));
2965 case CEPH_MSG_CLIENT_LEASE
:
2966 handle_lease(ref_cast
<MClientLease
>(m
));
2968 case MSG_COMMAND_REPLY
:
2969 if (m
->get_source().type() == CEPH_ENTITY_TYPE_MDS
) {
2970 handle_command_reply(ref_cast
<MCommandReply
>(m
));
2975 case CEPH_MSG_CLIENT_QUOTA
:
2976 handle_quota(ref_cast
<MClientQuota
>(m
));
2984 std::scoped_lock
cl(client_lock
);
2985 if (is_unmounting()) {
2986 ldout(cct
, 10) << "unmounting: trim pass, size was " << lru
.lru_get_size()
2987 << "+" << inode_map
.size() << dendl
;
2988 uint64_t size
= lru
.lru_get_size() + inode_map
.size();
2990 if (size
> lru
.lru_get_size() + inode_map
.size()) {
2991 ldout(cct
, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl
;
2992 mount_cond
.notify_all();
2994 ldout(cct
, 10) << "unmounting: trim pass, size still " << lru
.lru_get_size()
2995 << "+" << inode_map
.size() << dendl
;
3002 void Client::handle_fs_map(const MConstRef
<MFSMap
>& m
)
3004 std::scoped_lock
cl(client_lock
);
3005 fsmap
.reset(new FSMap(m
->get_fsmap()));
3007 signal_cond_list(waiting_for_fsmap
);
3009 monclient
->sub_got("fsmap", fsmap
->get_epoch());
3012 void Client::handle_fs_map_user(const MConstRef
<MFSMapUser
>& m
)
3014 std::scoped_lock
cl(client_lock
);
3015 fsmap_user
.reset(new FSMapUser
);
3016 *fsmap_user
= m
->get_fsmap();
3018 monclient
->sub_got("fsmap.user", fsmap_user
->get_epoch());
3019 signal_cond_list(waiting_for_fsmap
);
3022 // Cancel all the commands for missing or laggy GIDs
3023 void Client::cancel_commands(const MDSMap
& newmap
)
3025 std::vector
<ceph_tid_t
> cancel_ops
;
3027 std::scoped_lock
cmd_lock(command_lock
);
3028 auto &commands
= command_table
.get_commands();
3029 for (const auto &[tid
, op
] : commands
) {
3030 const mds_gid_t op_mds_gid
= op
.mds_gid
;
3031 if (newmap
.is_dne_gid(op_mds_gid
) || newmap
.is_laggy_gid(op_mds_gid
)) {
3032 ldout(cct
, 1) << __func__
<< ": cancelling command op " << tid
<< dendl
;
3033 cancel_ops
.push_back(tid
);
3035 std::ostringstream ss
;
3036 ss
<< "MDS " << op_mds_gid
<< " went away";
3037 *(op
.outs
) = ss
.str();
3040 * No need to make the con->mark_down under
3041 * client_lock here, because the con will
3044 op
.con
->mark_down();
3046 op
.on_finish
->complete(-CEPHFS_ETIMEDOUT
);
3050 for (const auto &tid
: cancel_ops
)
3051 command_table
.erase(tid
);
3054 void Client::handle_mds_map(const MConstRef
<MMDSMap
>& m
)
3056 std::unique_lock
cl(client_lock
);
3057 if (m
->get_epoch() <= mdsmap
->get_epoch()) {
3058 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch()
3059 << " is identical to or older than our "
3060 << mdsmap
->get_epoch() << dendl
;
3065 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch() << dendl
;
3066 std::unique_ptr
<MDSMap
> _mdsmap(new MDSMap
);
3067 _mdsmap
->decode(m
->get_encoded());
3068 cancel_commands(*_mdsmap
.get());
3071 _mdsmap
.swap(mdsmap
);
3074 for (auto p
= mds_sessions
.begin(); p
!= mds_sessions
.end(); ) {
3075 mds_rank_t mds
= p
->first
;
3076 MetaSessionRef session
= p
->second
;
3079 int oldstate
= _mdsmap
->get_state(mds
);
3080 int newstate
= mdsmap
->get_state(mds
);
3081 if (!mdsmap
->is_up(mds
)) {
3082 session
->con
->mark_down();
3083 } else if (mdsmap
->get_addrs(mds
) != session
->addrs
) {
3084 auto old_inc
= _mdsmap
->get_incarnation(mds
);
3085 auto new_inc
= mdsmap
->get_incarnation(mds
);
3086 if (old_inc
!= new_inc
) {
3087 ldout(cct
, 1) << "mds incarnation changed from "
3088 << old_inc
<< " to " << new_inc
<< dendl
;
3089 oldstate
= MDSMap::STATE_NULL
;
3091 session
->con
->mark_down();
3092 session
->addrs
= mdsmap
->get_addrs(mds
);
3093 // When new MDS starts to take over, notify kernel to trim unused entries
3094 // in its dcache/icache. Hopefully, the kernel will release some unused
3095 // inodes before the new MDS enters reconnect state.
3096 trim_cache_for_reconnect(session
.get());
3097 } else if (oldstate
== newstate
)
3098 continue; // no change
3100 session
->mds_state
= newstate
;
3101 if (newstate
== MDSMap::STATE_RECONNECT
) {
3102 session
->con
= messenger
->connect_to_mds(session
->addrs
);
3103 send_reconnect(session
.get());
3104 } else if (newstate
> MDSMap::STATE_RECONNECT
) {
3105 if (oldstate
< MDSMap::STATE_RECONNECT
) {
3106 ldout(cct
, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl
;
3107 _closed_mds_session(session
.get());
3110 if (newstate
>= MDSMap::STATE_ACTIVE
) {
3111 if (oldstate
< MDSMap::STATE_ACTIVE
) {
3112 // kick new requests
3113 kick_requests(session
.get());
3114 kick_flushing_caps(session
.get());
3115 signal_context_list(session
->waiting_for_open
);
3116 wake_up_session_caps(session
.get(), true);
3118 connect_mds_targets(mds
);
3120 } else if (newstate
== MDSMap::STATE_NULL
&&
3121 mds
>= mdsmap
->get_max_mds()) {
3122 _closed_mds_session(session
.get());
3126 // kick any waiting threads
3127 signal_cond_list(waiting_for_mdsmap
);
3129 monclient
->sub_got("mdsmap", mdsmap
->get_epoch());
3132 void Client::send_reconnect(MetaSession
*session
)
3134 mds_rank_t mds
= session
->mds_num
;
3135 ldout(cct
, 10) << __func__
<< " to mds." << mds
<< dendl
;
3137 // trim unused caps to reduce MDS's cache rejoin time
3138 trim_cache_for_reconnect(session
);
3140 session
->readonly
= false;
3142 session
->release
.reset();
3144 // reset my cap seq number
3146 //connect to the mds' offload targets
3147 connect_mds_targets(mds
);
3148 //make sure unsafe requests get saved
3149 resend_unsafe_requests(session
);
3151 early_kick_flushing_caps(session
);
3153 auto m
= make_message
<MClientReconnect
>();
3154 bool allow_multi
= session
->mds_features
.test(CEPHFS_FEATURE_MULTI_RECONNECT
);
3156 // i have an open session.
3157 ceph::unordered_set
<inodeno_t
> did_snaprealm
;
3158 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
= inode_map
.begin();
3159 p
!= inode_map
.end();
3161 Inode
*in
= p
->second
;
3162 auto it
= in
->caps
.find(mds
);
3163 if (it
!= in
->caps
.end()) {
3165 m
->get_approx_size() >=
3166 static_cast<size_t>((std::numeric_limits
<int>::max() >> 1))) {
3168 session
->con
->send_message2(std::move(m
));
3170 m
= make_message
<MClientReconnect
>();
3173 Cap
&cap
= it
->second
;
3174 ldout(cct
, 10) << " caps on " << p
->first
3175 << " " << ccap_string(cap
.issued
)
3176 << " wants " << ccap_string(in
->caps_wanted())
3179 in
->make_short_path(path
);
3180 ldout(cct
, 10) << " path " << path
<< dendl
;
3183 _encode_filelocks(in
, flockbl
);
3185 cap
.seq
= 0; // reset seq.
3186 cap
.issue_seq
= 0; // reset seq.
3187 cap
.mseq
= 0; // reset seq.
3188 // cap gen should catch up with session cap_gen
3189 if (cap
.gen
< session
->cap_gen
) {
3190 cap
.gen
= session
->cap_gen
;
3191 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
3193 cap
.issued
= cap
.implemented
;
3195 snapid_t snap_follows
= 0;
3196 if (!in
->cap_snaps
.empty())
3197 snap_follows
= in
->cap_snaps
.begin()->first
;
3199 m
->add_cap(p
->first
.ino
,
3201 path
.get_ino(), path
.get_path(), // ino
3202 in
->caps_wanted(), // wanted
3203 cap
.issued
, // issued
3208 if (did_snaprealm
.count(in
->snaprealm
->ino
) == 0) {
3209 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
3210 m
->add_snaprealm(in
->snaprealm
->ino
, in
->snaprealm
->seq
, in
->snaprealm
->parent
);
3211 did_snaprealm
.insert(in
->snaprealm
->ino
);
3217 m
->set_encoding_version(0); // use connection features to choose encoding
3218 session
->con
->send_message2(std::move(m
));
3220 mount_cond
.notify_all();
3222 if (session
->reclaim_state
== MetaSession::RECLAIMING
)
3223 signal_cond_list(waiting_for_reclaim
);
3227 void Client::kick_requests(MetaSession
*session
)
3229 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
3230 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
3231 p
!= mds_requests
.end();
3233 MetaRequest
*req
= p
->second
;
3234 if (req
->got_unsafe
)
3236 if (req
->aborted()) {
3237 if (req
->caller_cond
) {
3239 req
->caller_cond
->notify_all();
3243 if (req
->retry_attempt
> 0)
3244 continue; // new requests only
3245 if (req
->mds
== session
->mds_num
) {
3246 send_request(p
->second
, session
);
3251 void Client::resend_unsafe_requests(MetaSession
*session
)
3253 for (xlist
<MetaRequest
*>::iterator iter
= session
->unsafe_requests
.begin();
3256 send_request(*iter
, session
);
3258 // also re-send old requests when MDS enters reconnect stage. So that MDS can
3259 // process completed requests in clientreplay stage.
3260 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
3261 p
!= mds_requests
.end();
3263 MetaRequest
*req
= p
->second
;
3264 if (req
->got_unsafe
)
3268 if (req
->retry_attempt
== 0)
3269 continue; // old requests only
3270 if (req
->mds
== session
->mds_num
)
3271 send_request(req
, session
, true);
3275 void Client::wait_unsafe_requests()
3277 list
<MetaRequest
*> last_unsafe_reqs
;
3278 for (const auto &p
: mds_sessions
) {
3279 const auto s
= p
.second
;
3280 if (!s
->unsafe_requests
.empty()) {
3281 MetaRequest
*req
= s
->unsafe_requests
.back();
3283 last_unsafe_reqs
.push_back(req
);
3287 for (list
<MetaRequest
*>::iterator p
= last_unsafe_reqs
.begin();
3288 p
!= last_unsafe_reqs
.end();
3290 MetaRequest
*req
= *p
;
3291 if (req
->unsafe_item
.is_on_list())
3292 wait_on_list(req
->waitfor_safe
);
3297 void Client::kick_requests_closed(MetaSession
*session
)
3299 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
3300 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
3301 p
!= mds_requests
.end(); ) {
3302 MetaRequest
*req
= p
->second
;
3304 if (req
->mds
== session
->mds_num
) {
3305 if (req
->caller_cond
) {
3307 req
->caller_cond
->notify_all();
3309 req
->item
.remove_myself();
3310 if (req
->got_unsafe
) {
3311 lderr(cct
) << __func__
<< " removing unsafe request " << req
->get_tid() << dendl
;
3312 req
->unsafe_item
.remove_myself();
3313 if (is_dir_operation(req
)) {
3314 Inode
*dir
= req
->inode();
3316 dir
->set_async_err(-CEPHFS_EIO
);
3317 lderr(cct
) << "kick_requests_closed drop req of inode(dir) : "
3318 << dir
->ino
<< " " << req
->get_tid() << dendl
;
3319 req
->unsafe_dir_item
.remove_myself();
3322 InodeRef
&in
= req
->target
;
3323 in
->set_async_err(-CEPHFS_EIO
);
3324 lderr(cct
) << "kick_requests_closed drop req of inode : "
3325 << in
->ino
<< " " << req
->get_tid() << dendl
;
3326 req
->unsafe_target_item
.remove_myself();
3328 signal_cond_list(req
->waitfor_safe
);
3329 unregister_request(req
);
3333 ceph_assert(session
->requests
.empty());
3334 ceph_assert(session
->unsafe_requests
.empty());
3344 void Client::got_mds_push(MetaSession
*s
)
3347 ldout(cct
, 10) << " mds." << s
->mds_num
<< " seq now " << s
->seq
<< dendl
;
3348 if (s
->state
== MetaSession::STATE_CLOSING
) {
3349 s
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
3353 void Client::handle_lease(const MConstRef
<MClientLease
>& m
)
3355 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
3357 ceph_assert(m
->get_action() == CEPH_MDS_LEASE_REVOKE
);
3358 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
3360 std::scoped_lock
cl(client_lock
);
3361 auto session
= _get_mds_session(mds
, m
->get_connection().get());
3366 got_mds_push(session
.get());
3368 ceph_seq_t seq
= m
->get_seq();
3371 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
3372 if (inode_map
.count(vino
) == 0) {
3373 ldout(cct
, 10) << " don't have vino " << vino
<< dendl
;
3376 in
= inode_map
[vino
];
3378 if (m
->get_mask() & CEPH_LEASE_VALID
) {
3379 if (!in
->dir
|| in
->dir
->dentries
.count(m
->dname
) == 0) {
3380 ldout(cct
, 10) << " don't have dir|dentry " << m
->get_ino() << "/" << m
->dname
<<dendl
;
3383 Dentry
*dn
= in
->dir
->dentries
[m
->dname
];
3384 ldout(cct
, 10) << " revoked DN lease on " << dn
<< dendl
;
3390 auto reply
= make_message
<MClientLease
>(CEPH_MDS_LEASE_RELEASE
, seq
,
3391 m
->get_mask(), m
->get_ino(),
3392 m
->get_first(), m
->get_last(), m
->dname
);
3393 m
->get_connection()->send_message2(std::move(reply
));
3397 void Client::_put_inode(Inode
*in
, int n
)
3399 ldout(cct
, 10) << __func__
<< " on " << *in
<< " n = " << n
<< dendl
;
3401 int left
= in
->get_nref();
3402 ceph_assert(left
>= n
+ 1);
3405 if (left
== 1) { // the last one will be held by the inode_map
3407 remove_all_caps(in
);
3409 ldout(cct
, 10) << __func__
<< " deleting " << *in
<< dendl
;
3410 bool unclean
= objectcacher
->release_set(&in
->oset
);
3411 ceph_assert(!unclean
);
3412 inode_map
.erase(in
->vino());
3413 if (use_faked_inos())
3414 _release_faked_ino(in
);
3416 if (root
== nullptr) {
3418 while (!root_parents
.empty())
3419 root_parents
.erase(root_parents
.begin());
3426 void Client::delay_put_inodes(bool wakeup
)
3428 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
3430 std::map
<Inode
*,int> release
;
3432 std::scoped_lock
dl(delay_i_lock
);
3433 release
.swap(delay_i_release
);
3436 if (release
.empty())
3439 for (auto &[in
, cnt
] : release
)
3440 _put_inode(in
, cnt
);
3443 mount_cond
.notify_all();
3446 void Client::put_inode(Inode
*in
, int n
)
3448 ldout(cct
, 20) << __func__
<< " on " << *in
<< " n = " << n
<< dendl
;
3450 std::scoped_lock
dl(delay_i_lock
);
3451 delay_i_release
[in
] += n
;
3454 void Client::close_dir(Dir
*dir
)
3456 Inode
*in
= dir
->parent_inode
;
3457 ldout(cct
, 15) << __func__
<< " dir " << dir
<< " on " << in
<< dendl
;
3458 ceph_assert(dir
->is_empty());
3459 ceph_assert(in
->dir
== dir
);
3460 ceph_assert(in
->dentries
.size() < 2); // dirs can't be hard-linked
3461 if (!in
->dentries
.empty())
3462 in
->get_first_parent()->put(); // unpin dentry
3466 put_inode(in
); // unpin inode
3470 * Don't call this with in==NULL, use get_or_create for that
3471 * leave dn set to default NULL unless you're trying to add
3472 * a new inode to a pre-created Dentry
3474 Dentry
* Client::link(Dir
*dir
, const string
& name
, Inode
*in
, Dentry
*dn
)
3477 // create a new Dentry
3478 dn
= new Dentry(dir
, name
);
3480 lru
.lru_insert_mid(dn
); // mid or top?
3483 ldout(cct
, 15) << "link dir " << *dir
->parent_inode
<< " '" << name
<< "' to inode " << *in
3484 << " dn " << *dn
<< " (new dn)" << dendl
;
3486 ldout(cct
, 15) << "link dir " << *dir
->parent_inode
<< " '" << name
<< "' "
3487 << " dn " << *dn
<< " (new dn)" << dendl
;
3490 ceph_assert(!dn
->inode
);
3491 ldout(cct
, 15) << "link dir " << *dir
->parent_inode
<< " '" << name
<< "' to inode " << in
3492 << " dn " << *dn
<< " (old dn)" << dendl
;
3495 if (in
) { // link to inode
3497 // only one parent for directories!
3498 if (in
->is_dir() && !in
->dentries
.empty()) {
3499 tmp_ref
= in
; // prevent unlink below from freeing the inode.
3500 Dentry
*olddn
= in
->get_first_parent();
3501 ceph_assert(olddn
->dir
!= dir
|| olddn
->name
!= name
);
3502 Inode
*old_diri
= olddn
->dir
->parent_inode
;
3503 clear_dir_complete_and_ordered(old_diri
, true);
3504 unlink(olddn
, true, true); // keep dir, dentry
3509 ldout(cct
, 20) << "link inode " << in
<< " parents now " << in
->dentries
<< dendl
;
3515 void Client::unlink(Dentry
*dn
, bool keepdir
, bool keepdentry
)
3517 InodeRef
in(dn
->inode
);
3518 ldout(cct
, 15) << "unlink dir " << dn
->dir
->parent_inode
<< " '" << dn
->name
<< "' dn " << dn
3519 << " inode " << dn
->inode
<< dendl
;
3521 // unlink from inode
3525 ldout(cct
, 20) << "unlink inode " << in
<< " parents now " << in
->dentries
<< dendl
;
3531 ldout(cct
, 15) << "unlink removing '" << dn
->name
<< "' dn " << dn
<< dendl
;
3541 if (dir
->is_empty() && !keepdir
)
3547  * For asynchronous flushes, check for errors from the IO and
3548  * update the inode if necessary
// Completion context for async objectcacher flushes.  On error it records
// the error code on the inode via set_async_err() for later reporting.
// NOTE(review): member declarations and some lines are elided in this
// excerpt (jumps in the embedded line numbers).
3550 class C_Client_FlushComplete : public Context {
3555   C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
3556   void finish(int r) override {
         // must be invoked while holding the client lock
3557     ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
3559       client_t const whoami = client->whoami;  // For the benefit of ldout prefix
3560       ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3561 		  << " 0x" << std::hex << inode->ino << std::dec
3562 		  << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
         // stash the error so a later fsync/close can surface it
3563       inode->set_async_err(r);
// Take a reference on the capability bits 'cap' for inode 'in'; logs when
// the first FILE_BUFFER / FILE_CACHE reference appears.
// NOTE(review): some original lines are elided in this excerpt (jumps in
// the embedded line numbers).
3573 void Client::get_cap_ref(Inode *in, int cap)
3575   if ((cap & CEPH_CAP_FILE_BUFFER) &&
3576       in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3577     ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
3580   if ((cap & CEPH_CAP_FILE_CACHE) &&
3581       in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3582     ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
       // delegate the actual refcount bump to the Inode
3585   in->get_cap_ref(cap);
// Drop a reference on capability bits 'cap'.  When the last WR/BUFFER/CACHE
// ref goes away this finishes pending cap snaps, wakes waiters and releases
// the matching inode reference(s).
// NOTE(review): some original lines are elided in this excerpt (jumps in
// the embedded line numbers), including the branch structure around the
// last-ref handling.
3588 void Client::put_cap_ref(Inode *in, int cap)
3590   int last = in->put_cap_ref(cap);
3593     int drop = last & ~in->caps_issued();
3594     if (in->snapid == CEPH_NOSNAP) {
3595       if ((last & CEPH_CAP_FILE_WR) &&
3596 	  !in->cap_snaps.empty() &&
3597 	  in->cap_snaps.rbegin()->second.writing) {
3598 	ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
3599 	in->cap_snaps.rbegin()->second.writing = 0;
3600 	finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
3601 	signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
3603       if (last & CEPH_CAP_FILE_BUFFER) {
        // buffered data is committed; clear per-snap dirty flags
3604 	for (auto &p : in->cap_snaps)
3605 	  p.second.dirty_data = 0;
3606 	signal_cond_list(in->waitfor_commit);
3607 	ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
3610 	if (!in->cap_snaps.empty()) {
3615       if (last & CEPH_CAP_FILE_CACHE) {
3616 	ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
       // release the inode refs accumulated above
3622     put_inode(in, put_nref);
3626 // get caps for a given file handle -- the inode should have @need caps
3627 // issued by the mds and @want caps not revoked (or not under revocation).
3628 // this routine blocks till the cap requirement is satisfied. also account
3629 // (track) for capability hit when required (when cap requirement succeedes).
// NOTE(review): this excerpt elides some original lines (the surrounding
// loop/brace structure and a few statements) — visible as jumps in the
// embedded line numbers.
3630 int Client::get_caps(Fh *fh, int need, int want, int *phave, loff_t endoff)
3632   Inode *in = fh->inode.get();
     // bail out early if the pool permission check fails
3634   int r = check_pool_perm(in, need);
3639     int file_wanted = in->caps_file_wanted();
3640     if ((file_wanted & need) != need) {
3641       ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
3642 		     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
3644       return -CEPHFS_EBADF;
     // a write handle from a previous fd generation is stale
3647     if ((fh->mode & CEPH_FILE_MODE_WR) && fh->gen != fd_gen)
3648       return -CEPHFS_EBADF;
3650     if ((in->flags & I_ERROR_FILELOCK) && fh->has_any_filelocks())
3654     int have = in->caps_issued(&implemented);
3656     bool waitfor_caps = false;
3657     bool waitfor_commit = false;
3659     if (have & need & CEPH_CAP_FILE_WR) {
       // grow wanted_max_size ahead of the write endpoint when appropriate
3661       if ((endoff >= (loff_t)in->max_size ||
3662 	   endoff > (loff_t)(in->size << 1)) &&
3663 	  endoff > (loff_t)in->wanted_max_size) {
3664 	ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
3665 	in->wanted_max_size = endoff;
3667       if (in->wanted_max_size > in->max_size &&
3668 	  in->wanted_max_size > in->requested_max_size)
3672       if (endoff >= 0 && endoff > (loff_t)in->max_size) {
3673 	ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
3674 	waitfor_caps = true;
3676       if (!in->cap_snaps.empty()) {
3677 	if (in->cap_snaps.rbegin()->second.writing) {
3678 	  ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
3679 	  waitfor_caps = true;
3681 	for (auto &p : in->cap_snaps) {
3682 	  if (p.second.dirty_data) {
3683 	    waitfor_commit = true;
3687       if (waitfor_commit) {
        // kick a flush so the dirty snap data gets written out
3688 	_flush(in, new C_Client_FlushComplete(this, in));
3689 	ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
3694     if (!waitfor_caps && !waitfor_commit) {
3695       if ((have & need) == need) {
3696 	int revoking = implemented & ~have;
3697 	ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
3698 		       << " need " << ccap_string(need) << " want " << ccap_string(want)
3699 		       << " revoking " << ccap_string(revoking)
3701 	if ((revoking & want) == 0) {
          // success path: report need plus any wanted-and-held extras
3702 	  *phave = need | (have & want);
3703 	  in->get_cap_ref(need);
3708       ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
3709       waitfor_caps = true;
3712     if ((need & CEPH_CAP_FILE_WR) &&
3713 	((in->auth_cap && in->auth_cap->session->readonly) ||
3714 	 // userland clients are only allowed to read if fscrypt enabled
3715 	 in->is_fscrypt_enabled()))
3716       return -CEPHFS_EROFS;
3718     if (in->flags & I_CAP_DROPPED) {
3719       int mds_wanted = in->caps_mds_wanted();
3720       if ((mds_wanted & need) != need) {
3721 	int ret = _renew_caps(in);
3726       if (!(file_wanted & ~mds_wanted))
3727 	in->flags &= ~I_CAP_DROPPED;
     // block here until the relevant condition is signalled
3731       wait_on_list(in->waitfor_caps);
3732     else if (waitfor_commit)
3733       wait_on_list(in->waitfor_commit);
3737 int Client::get_caps_used(Inode
*in
)
3739 unsigned used
= in
->caps_used();
3740 if (!(used
& CEPH_CAP_FILE_CACHE
) &&
3741 !objectcacher
->set_is_empty(&in
->oset
))
3742 used
|= CEPH_CAP_FILE_CACHE
;
3746 void Client::cap_delay_requeue(Inode
*in
)
3748 ldout(cct
, 10) << __func__
<< " on " << *in
<< dendl
;
3750 in
->hold_caps_until
= ceph::coarse_mono_clock::now() + caps_release_delay
;
3751 delayed_list
.push_back(&in
->delay_cap_item
);
// Build and send a CEPH_CAP_OP_UPDATE MClientCaps message to 'session'
// describing what we use/want/flush/retain for 'cap' on inode 'in'.
// NOTE(review): this excerpt elides some original lines (including part of
// the MClientCaps constructor argument list) — visible as jumps in the
// embedded line numbers.
3754 void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
3755 		      int flags, int used, int want, int retain,
3756 		      int flush, ceph_tid_t flush_tid)
3758   int held = cap->issued | cap->implemented;
3759   int revoking = cap->implemented & ~cap->issued;
     // never advertise retaining bits that are being revoked
3760   retain &= ~revoking;
3761   int dropping = cap->issued & ~retain;
3762   int op = CEPH_CAP_OP_UPDATE;
3764   ldout(cct, 10) << __func__ << " " << *in
3765 	   << " mds." << session->mds_num << " seq " << cap->seq
3766 	   << " used " << ccap_string(used)
3767 	   << " want " << ccap_string(want)
3768 	   << " flush " << ccap_string(flush)
3769 	   << " retain " << ccap_string(retain)
3770 	   << " held "<< ccap_string(held)
3771 	   << " revoking " << ccap_string(revoking)
3772 	   << " dropping " << ccap_string(dropping)
     // test hook: deliberately fail to release caps (except xattr caps)
3775   if (cct->_conf->client_inject_release_failure && revoking) {
3776     const int would_have_issued = cap->issued & retain;
3777     const int would_have_implemented = cap->implemented & (cap->issued | used);
3779     // - tell the server we think issued is whatever they issued plus whatever we implemented
3780     // - leave what we have implemented in place
3781     ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
3782     cap->issued = cap->issued | cap->implemented;
3784     // Make an exception for revoking xattr caps: we are injecting
3785     // failure to release other caps, but allow xattr because client
3786     // will block on xattr ops if it can't release these to MDS (#9800)
3787     const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
3788     cap->issued ^= xattr_mask & revoking;
3789     cap->implemented ^= xattr_mask & revoking;
3791     ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
3792     ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
     // normal path: shrink issued/implemented to what we keep
3795     cap->issued &= retain;
3796     cap->implemented &= cap->issued | used;
3799   snapid_t follows = 0;
3802     follows = in->snaprealm->get_snap_context().seq;
3804   auto m = make_message<MClientCaps>(op,
3807 				     cap->cap_id, cap->seq,
3813   m->caller_uid = in->cap_dirtier_uid;
3814   m->caller_gid = in->cap_dirtier_gid;
3816   m->head.issue_seq = cap->issue_seq;
3817   m->set_tid(flush_tid);
3819   m->head.uid = in->uid;
3820   m->head.gid = in->gid;
3821   m->head.mode = in->mode;
3823   m->head.nlink = in->nlink;
3825   if (flush & CEPH_CAP_XATTR_EXCL) {
3826     encode(in->xattrs, m->xattrbl);
3827     m->head.xattr_version = in->xattr_version;
3831   m->max_size = in->max_size;
3832   m->truncate_seq = in->truncate_seq;
3833   m->truncate_size = in->truncate_size;
3834   m->mtime = in->mtime;
3835   m->atime = in->atime;
3836   m->ctime = in->ctime;
3837   m->btime = in->btime;
3838   m->time_warp_seq = in->time_warp_seq;
3839   m->change_attr = in->change_attr;
3840   m->fscrypt_auth = in->fscrypt_auth;
3841   m->fscrypt_file = in->fscrypt_file;
     // advertise a pending, not-yet-flushed capsnap to the MDS
3843   if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
3844       !in->cap_snaps.empty() &&
3845       in->cap_snaps.rbegin()->second.flush_tid == 0)
3846     flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
3849   if (flush & CEPH_CAP_FILE_WR) {
3850     m->inline_version = in->inline_version;
3851     m->inline_data = in->inline_data;
3854   in->reported_size = in->size;
3855   m->set_snap_follows(follows);
3857   if (cap == in->auth_cap) {
3858     if (want & CEPH_CAP_ANY_FILE_WR) {
3859       m->set_max_size(in->wanted_max_size);
3860       in->requested_max_size = in->wanted_max_size;
3861       ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
3863       in->requested_max_size = 0;
3864       ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
3868   if (!session->flushing_caps_tids.empty())
3869     m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3871   session->con->send_message2(std::move(m));
3874 static bool is_max_size_approaching(Inode
*in
)
3876 /* mds will adjust max size according to the reported size */
3877 if (in
->flushing_caps
& CEPH_CAP_FILE_WR
)
3879 if (in
->size
>= in
->max_size
)
3881 /* half of previous max_size increment has been used */
3882 if (in
->max_size
> in
->reported_size
&&
3883 (in
->size
<< 1) >= in
->max_size
+ in
->reported_size
)
3888 static int adjust_caps_used_for_lazyio(int used
, int issued
, int implemented
)
3890 if (!(used
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
)))
3892 if (!(implemented
& CEPH_CAP_FILE_LAZYIO
))
3895 if (issued
& CEPH_CAP_FILE_LAZYIO
) {
3896 if (!(issued
& CEPH_CAP_FILE_CACHE
)) {
3897 used
&= ~CEPH_CAP_FILE_CACHE
;
3898 used
|= CEPH_CAP_FILE_LAZYIO
;
3900 if (!(issued
& CEPH_CAP_FILE_BUFFER
)) {
3901 used
&= ~CEPH_CAP_FILE_BUFFER
;
3902 used
|= CEPH_CAP_FILE_LAZYIO
;
3905 if (!(implemented
& CEPH_CAP_FILE_CACHE
)) {
3906 used
&= ~CEPH_CAP_FILE_CACHE
;
3907 used
|= CEPH_CAP_FILE_LAZYIO
;
3909 if (!(implemented
& CEPH_CAP_FILE_BUFFER
)) {
3910 used
&= ~CEPH_CAP_FILE_BUFFER
;
3911 used
|= CEPH_CAP_FILE_LAZYIO
;
3920  * Examine currently used and wanted versus held caps. Release, flush or ack
3921  * revoked caps to the MDS as appropriate.
3923  * @param in the inode to check
3924  * @param flags flags to apply to cap check
// NOTE(review): this excerpt elides some original lines (loop/branch
// structure, continue/goto statements) — visible as jumps in the embedded
// line numbers below.
3926 void Client::check_caps(Inode *in, unsigned flags)
3928   unsigned wanted = in->caps_wanted();
3929   unsigned used = get_caps_used(in);
3933   int issued = in->caps_issued(&implemented);
3934   int revoking = implemented & ~issued;
3936   int orig_used = used;
3937   used = adjust_caps_used_for_lazyio(used, issued, implemented);
3939   int retain = wanted | used | CEPH_CAP_PIN;
3940   if (!is_unmounting() && in->nlink > 0) {
3942       retain |= CEPH_CAP_ANY;
3943     } else if (in->is_dir() &&
3944 	       (issued & CEPH_CAP_FILE_SHARED) &&
3945 	       (in->flags & I_COMPLETE)) {
3946       // we do this here because we don't want to drop to Fs (and then
3947       // drop the Fs if we do a create!) if that alone makes us send lookups
3948       // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3949       wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
3952       retain |= CEPH_CAP_ANY_SHARED;
3953       // keep RD only if we didn't have the file open RW,
3954       // because then the mds would revoke it anyway to
3955       // journal max_size=0.
3956       if (in->max_size == 0)
3957 	retain |= CEPH_CAP_ANY_RD;
3961   ldout(cct, 10) << __func__ << " on " << *in
3962 		 << " wanted " << ccap_string(wanted)
3963 		 << " used " << ccap_string(used)
3964 		 << " issued " << ccap_string(issued)
3965 		 << " revoking " << ccap_string(revoking)
3966 		 << " flags=" << flags
3969   if (in->snapid != CEPH_NOSNAP)
3970     return; //snap caps last forever, can't write
3972   if (in->caps.empty())
3973     return; // guard if at end of func
     // CACHE/LAZYIO being revoked with no buffered data: stop counting them
3975   if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
3976       (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
3978     used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
     // walk every per-MDS cap this inode holds
3981   for (auto &[mds, cap] : in->caps) {
3982     auto session = mds_sessions.at(mds);
3985     if (in->auth_cap && &cap != in->auth_cap)
3986       cap_used &= ~in->auth_cap->issued;
3988     revoking = cap.implemented & ~cap.issued;
3990     ldout(cct, 10) << " cap mds." << mds
3991 	     << " issued " << ccap_string(cap.issued)
3992 	     << " implemented " << ccap_string(cap.implemented)
3993 	     << " revoking " << ccap_string(revoking) << dendl;
3995     if (in->wanted_max_size > in->max_size &&
3996 	in->wanted_max_size > in->requested_max_size &&
3997 	&cap == in->auth_cap)
4000     /* approaching file_max? */
4001     if ((cap.issued & CEPH_CAP_FILE_WR) &&
4002 	&cap == in->auth_cap &&
4003 	is_max_size_approaching(in)) {
4004       ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
4005 		     << ", reported " << in->reported_size << dendl;
4009     /* completed revocation? */
4010     if (revoking && (revoking & cap_used) == 0) {
4011       ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
4015     /* want more caps from mds? */
4016     if (wanted & ~(cap.wanted | cap.issued))
4019     if (!revoking && is_unmounting() && (cap_used == 0))
4022     if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
4023 	!in->dirty_caps)               // and we have no dirty caps
4026     if (!(flags & CHECK_CAPS_NODELAY)) {
4027       ldout(cct, 10) << "delaying cap release" << dendl;
4028       cap_delay_requeue(in);
4033     if (&cap == in->auth_cap) {
4034       if (in->flags & I_KICK_FLUSH) {
4035 	ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
4036 		       << " to mds." << mds << dendl;
4037 	kick_flushing_caps(in, session.get());
4039       if (!in->cap_snaps.empty() &&
4040 	  in->cap_snaps.rbegin()->second.flush_tid == 0)
4046     ceph_tid_t flush_tid;
4047     if (in->auth_cap == &cap && in->dirty_caps) {
4048       flushing = mark_caps_flushing(in, &flush_tid);
4049       if (flags & CHECK_CAPS_SYNCHRONOUS)
4050 	msg_flags |= MClientCaps::FLAG_SYNC;
     // actually send the cap update for this MDS
4056     in->delay_cap_item.remove_myself();
4057     send_cap(in, session.get(), &cap, msg_flags, cap_used, wanted, retain,
4058 	     flushing, flush_tid);
// Capture a CapSnap of 'in' for the outgoing snap context 'old_snapc' when
// the inode has dirty caps or is actively being written; otherwise do
// nothing.  A still-writing capsnap is finished later by put_cap_ref().
// NOTE(review): some original lines (braces/else) are elided in this
// excerpt — visible as jumps in the embedded line numbers.
4063 void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
4065   int used = get_caps_used(in);
4066   int dirty = in->caps_dirty();
4067   ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
     // only one pending (writing) capsnap at a time
4069   if (in->cap_snaps.size() &&
4070       in->cap_snaps.rbegin()->second.writing) {
4071     ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
4073   } else if (dirty || (used & CEPH_CAP_FILE_WR)) {
4074     const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
4075     ceph_assert(capsnapem.second); /* element inserted */
4076     CapSnap &capsnap = capsnapem.first->second;
4077     capsnap.context = old_snapc;
4078     capsnap.issued = in->caps_issued();
4079     capsnap.dirty = dirty;
4081     capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
4083     capsnap.uid = in->uid;
4084     capsnap.gid = in->gid;
4085     capsnap.mode = in->mode;
4086     capsnap.btime = in->btime;
4087     capsnap.xattrs = in->xattrs;
4088     capsnap.xattr_version = in->xattr_version;
4089     capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
4090     capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
4092     if (used & CEPH_CAP_FILE_WR) {
4093       ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
       // writer still active: capsnap completed later via put_cap_ref
4094       capsnap.writing = 1;
4096       finish_cap_snap(in, capsnap, used);
4099     ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
// Finalize a CapSnap: record the inode's current size/times/dirty state
// into it, and either trigger a buffer flush (if WRBUFFER is in use) or
// mark the snap's data clean.
// NOTE(review): some original lines (braces/else branch) are elided in
// this excerpt — visible as jumps in the embedded line numbers.
4103 void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
4105   ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
4106   capsnap.size = in->size;
4107   capsnap.mtime = in->mtime;
4108   capsnap.atime = in->atime;
4109   capsnap.ctime = in->ctime;
4110   capsnap.time_warp_seq = in->time_warp_seq;
4111   capsnap.change_attr = in->change_attr;
4112   capsnap.dirty |= in->caps_dirty();
4114   /* Only reset it if it wasn't set before */
4115   if (capsnap.cap_dirtier_uid == -1) {
4116     capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
4117     capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
4120   if (capsnap.dirty & CEPH_CAP_FILE_WR) {
4121     capsnap.inline_data = in->inline_data;
4122     capsnap.inline_version = in->inline_version;
4125   if (used & CEPH_CAP_FILE_BUFFER) {
4126     ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
4127 		   << " WRBUFFER, trigger to flush dirty buffer" << dendl;
4129     /* trigger to flush the buffer */
4130     _flush(in, new C_Client_FlushComplete(this, in));
4132     capsnap.dirty_data = 0;
// Build and send a CEPH_CAP_OP_FLUSHSNAP MClientCaps message carrying the
// metadata captured in 'capsnap' to the auth MDS session.
// NOTE(review): a few original lines are elided in this excerpt (jumps in
// the embedded line numbers).
4137 void Client::send_flush_snap(Inode *in, MetaSession *session,
4138 			     snapid_t follows, CapSnap& capsnap)
4140   auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
4141 				     in->ino, in->snaprealm->ino, 0,
4142 				     in->auth_cap->mseq, cap_epoch_barrier);
4143   m->caller_uid = capsnap.cap_dirtier_uid;
4144   m->caller_gid = capsnap.cap_dirtier_gid;
4146   m->set_client_tid(capsnap.flush_tid);
4147   m->head.snap_follows = follows;
4149   m->head.caps = capsnap.issued;
4150   m->head.dirty = capsnap.dirty;
4152   m->head.uid = capsnap.uid;
4153   m->head.gid = capsnap.gid;
4154   m->head.mode = capsnap.mode;
4155   m->btime = capsnap.btime;
4157   m->size = capsnap.size;
4159   m->head.xattr_version = capsnap.xattr_version;
4160   encode(capsnap.xattrs, m->xattrbl);
4162   m->ctime = capsnap.ctime;
     // NOTE(review): btime is also assigned above (4155); this second
     // assignment looks redundant but is harmless — confirm upstream.
4163   m->btime = capsnap.btime;
4164   m->mtime = capsnap.mtime;
4165   m->atime = capsnap.atime;
4166   m->time_warp_seq = capsnap.time_warp_seq;
4167   m->change_attr = capsnap.change_attr;
4169   if (capsnap.dirty & CEPH_CAP_FILE_WR) {
4170     m->inline_version = in->inline_version;
4171     m->inline_data = in->inline_data;
4174   ceph_assert(!session->flushing_caps_tids.empty());
4175   m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
4177   session->con->send_message2(std::move(m));
// Flush every not-yet-flushed CapSnap of 'in' to the auth MDS: assign a
// flush tid, register it on the session/inode, and send the FLUSHSNAP.
// NOTE(review): some original lines (continue statements, braces) are
// elided in this excerpt — visible as jumps in the embedded line numbers.
4180 void Client::flush_snaps(Inode *in)
4182   ldout(cct, 10) << "flush_snaps on " << *in << dendl;
4183   ceph_assert(in->cap_snaps.size());
     // capsnaps are always flushed through the auth cap's session
4186   ceph_assert(in->auth_cap);
4187   MetaSession *session = in->auth_cap->session;
4189   for (auto &p : in->cap_snaps) {
4190     CapSnap &capsnap = p.second;
4191     // only do new flush
4192     if (capsnap.flush_tid > 0)
4195     ldout(cct, 10) << "flush_snaps mds." << session->mds_num
4196 	     << " follows " << p.first
4197 	     << " size " << capsnap.size
4198 	     << " mtime " << capsnap.mtime
4199 	     << " dirty_data=" << capsnap.dirty_data
4200 	     << " writing=" << capsnap.writing
4201 	     << " on " << *in << dendl;
4202     if (capsnap.dirty_data || capsnap.writing)
4205     capsnap.flush_tid = ++last_flush_tid;
4206     session->flushing_caps_tids.insert(capsnap.flush_tid);
4207     in->flushing_cap_tids[capsnap.flush_tid] = 0;
4208     if (!in->flushing_cap_item.is_on_list())
4209       session->flushing_caps.push_back(&in->flushing_cap_item);
4211     send_flush_snap(in, session, p.first, capsnap);
// Block the caller on a condition variable registered on 'ls'; the lock
// adoption hands client_lock to the unique_lock for the wait.
// NOTE(review): the tail of this function (the wait itself and cleanup)
// is elided in this excerpt.
4215 void Client::wait_on_list(list<ceph::condition_variable*>& ls)
4217   ceph::condition_variable cond;
4218   ls.push_back(&cond);
4219   std::unique_lock l{client_lock, std::adopt_lock};
// Wake every waiter parked on the condition-variable list 'ls'.
// NOTE(review): the loop body is elided in this excerpt.
4225 void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
4227   for (auto cond : ls) {
// Block until a C_Cond queued on 'ls' is completed; the completion sets
// 'done', which releases the predicate wait below.
// NOTE(review): the declarations of 'done'/'r' and the tail are elided in
// this excerpt.
4232 void Client::wait_on_context_list(list<Context*>& ls)
4234   ceph::condition_variable cond;
4237   ls.push_back(new C_Cond(cond, &done, &r));
4238   std::unique_lock l{client_lock, std::adopt_lock};
4239   cond.wait(l, [&done] { return done;});
4243 void Client::signal_context_list(list
<Context
*>& ls
)
4245 while (!ls
.empty()) {
4246 ls
.front()->complete(0);
// After a session resumes (or reconnects), reset per-inode max_size
// bookkeeping, downgrade caps the MDS did not re-issue, and wake waiters.
// NOTE(review): some original lines are elided in this excerpt (jumps in
// the embedded line numbers).
4251 void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
4253   for (const auto &cap : s->caps) {
4254     auto &in = cap->inode;
4256       in.requested_max_size = 0;
4257       in.wanted_max_size = 0;
4259       if (cap->gen < s->cap_gen) {
4260 	// mds did not re-issue stale cap.
4261 	cap->issued = cap->implemented = CEPH_CAP_PIN;
4262 	// make sure mds knows what we want.
4263 	if (in.caps_file_wanted() & ~cap->wanted)
4264 	  in.flags |= I_CAP_DROPPED;
4267     signal_cond_list(in.waitfor_caps);
4272 // flush dirty data (from objectcache)
// Async context that invokes the registered ino_invalidate callback for a
// byte range of an inode, outside the client lock.
// NOTE(review): member declarations and some lines are elided in this
// excerpt (jumps in the embedded line numbers).
4274 class C_Client_CacheInvalidate : public Context {
4278   int64_t offset, length;
4280   C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
4281     client(c), offset(off), length(len) {
       // record the faked ino when the client exposes faked inode numbers
4282     if (client->use_faked_inos())
4283       ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4287   void finish(int r) override {
4288     // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
4289     ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4290     client->_async_invalidate(ino, offset, length);
4294 void Client::_async_invalidate(vinodeno_t ino
, int64_t off
, int64_t len
)
4296 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
4297 if (!mref_reader
.is_state_satisfied())
4300 ldout(cct
, 10) << __func__
<< " " << ino
<< " " << off
<< "~" << len
<< dendl
;
4301 ino_invalidate_cb(callback_handle
, ino
, off
, len
);
4304 void Client::_schedule_invalidate_callback(Inode
*in
, int64_t off
, int64_t len
) {
4306 if (ino_invalidate_cb
)
4307 // we queue the invalidate, which calls the callback and decrements the ref
4308 async_ino_invalidator
.queue(new C_Client_CacheInvalidate(this, in
, off
, len
));
4311 void Client::_invalidate_inode_cache(Inode
*in
)
4313 ldout(cct
, 10) << __func__
<< " " << *in
<< dendl
;
4315 // invalidate our userspace inode cache
4316 if (cct
->_conf
->client_oc
) {
4317 objectcacher
->release_set(&in
->oset
);
4318 if (!objectcacher
->set_is_empty(&in
->oset
))
4319 lderr(cct
) << "failed to invalidate cache for " << *in
<< dendl
;
4322 _schedule_invalidate_callback(in
, 0, 0);
4325 void Client::_invalidate_inode_cache(Inode
*in
, int64_t off
, int64_t len
)
4327 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
4329 // invalidate our userspace inode cache
4330 if (cct
->_conf
->client_oc
) {
4331 vector
<ObjectExtent
> ls
;
4332 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, len
, in
->truncate_size
, ls
);
4333 objectcacher
->discard_writeback(&in
->oset
, ls
, nullptr);
4336 _schedule_invalidate_callback(in
, off
, len
);
4339 bool Client::_release(Inode
*in
)
4341 ldout(cct
, 20) << "_release " << *in
<< dendl
;
4342 if (in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
4343 _invalidate_inode_cache(in
);
4349 bool Client::_flush(Inode
*in
, Context
*onfinish
)
4351 ldout(cct
, 10) << "_flush " << *in
<< dendl
;
4353 if (!in
->oset
.dirty_or_tx
) {
4354 ldout(cct
, 10) << " nothing to flush" << dendl
;
4355 onfinish
->complete(0);
4359 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
4360 ldout(cct
, 8) << __func__
<< ": FULL, purging for ENOSPC" << dendl
;
4361 objectcacher
->purge_set(&in
->oset
);
4363 onfinish
->complete(-CEPHFS_ENOSPC
);
4368 return objectcacher
->flush_set(&in
->oset
, onfinish
);
// Synchronously flush a byte range of 'in' through the objectcacher,
// dropping client_lock while waiting for the flush to land.
// NOTE(review): the tail of this function (the wait and relock) is elided
// in this excerpt — visible as the jump in embedded line numbers.
4371 void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
4373   ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
4374   if (!in->oset.dirty_or_tx) {
4375     ldout(cct, 10) << " nothing to flush" << dendl;
4379   C_SaferCond onflush("Client::_flush_range flock");
4380   bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
4381 				      offset, size, &onflush);
     // flush not yet complete: release the lock while we block on it
4384     client_lock.unlock();
// ObjectCacher flush-set completion hook; recovers the owning Inode from
// the object set's parent pointer.
// NOTE(review): the tail of this function is elided in this excerpt.
4390 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
4392   //  std::scoped_lock l(client_lock);
4393   ceph_assert(ceph_mutex_is_locked_by_me(client_lock));  // will be called via dispatch() -> objecter -> ...
4394   Inode *in = static_cast<Inode *>(oset->parent);
4399 void Client::_flushed(Inode
*in
)
4401 ldout(cct
, 10) << "_flushed " << *in
<< dendl
;
4403 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
4408 // checks common to add_update_cap, handle_cap_grant
// React to newly issued cap bits: detect first-time FILE_CACHE issuance
// and FILE_SHARED transitions (which invalidate cached dir completeness).
// NOTE(review): the statements executed by the two inner conditions are
// elided in this excerpt — visible as jumps in the embedded line numbers.
4409 void Client::check_cap_issue(Inode *in, unsigned issued)
4411   unsigned had = in->caps_issued();
4413   if ((issued & CEPH_CAP_FILE_CACHE) &&
4414       !(had & CEPH_CAP_FILE_CACHE))
4417   if ((issued & CEPH_CAP_FILE_SHARED) !=
4418       (had & CEPH_CAP_FILE_SHARED)) {
4419     if (issued & CEPH_CAP_FILE_SHARED)
       // shared state changed: any cached "complete/ordered" dir view is stale
4422       clear_dir_complete_and_ordered(in, true);
// Install or refresh the cap for (in, mds_session): manage the inode's
// snaprealm membership, auth-cap migration bookkeeping, and the merged
// issued/wanted bits; wakes cap waiters when new bits were granted.
// NOTE(review): some original lines (else branches, braces, a goto/label)
// are elided in this excerpt — visible as jumps in the embedded numbers.
4426 void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
4427 			    unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
4428 			    inodeno_t realm, int flags, const UserPerm& cap_perms)
     // first cap: attach the inode to its snap realm
4430   if (!in->is_any_caps()) {
4431     ceph_assert(in->snaprealm == 0);
4432     in->snaprealm = get_snap_realm(realm);
4433     in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4434     ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
4436     ceph_assert(in->snaprealm);
     // auth cap moving the inode into a different realm: re-link it
4437     if ((flags & CEPH_CAP_FLAG_AUTH) &&
4438 	realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
4439       in->snaprealm_item.remove_myself();
4440       auto oldrealm = in->snaprealm;
4441       in->snaprealm = get_snap_realm(realm);
4442       in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4443       put_snap_realm(oldrealm);
4447   mds_rank_t mds = mds_session->mds_num;
4448   const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
4449   Cap &cap = capem.first->second;
4450   if (!capem.second) {
     // pre-existing cap from a stale generation is reduced to PIN
4451     if (cap.gen < mds_session->cap_gen)
4452       cap.issued = cap.implemented = CEPH_CAP_PIN;
4455      * auth mds of the inode changed. we received the cap export
4456      * message, but still haven't received the cap import message.
4457      * handle_cap_export() updated the new auth MDS' cap.
4459      * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4460      * a message that was send before the cap import message. So
4461      * don't remove caps.
4463     if (ceph_seq_cmp(seq, cap.seq) <= 0) {
4464       if (&cap != in->auth_cap)
4465 	ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;
4467       ceph_assert(cap.cap_id == cap_id);
4470     issued |= cap.issued;
4471     flags |= CEPH_CAP_FLAG_AUTH;
4477   check_cap_issue(in, issued);
4479   if (flags & CEPH_CAP_FLAG_AUTH) {
4480     if (in->auth_cap != &cap &&
4481         (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
       // auth cap is migrating to this session: move flushing/dirty state
4483       if (in->flushing_cap_item.is_on_list()) {
4484 	ldout(cct, 10) << __func__ << " changing auth cap: "
4485 		       << "add myself to new auth MDS' flushing caps list" << dendl;
4486 	adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
4488       if (in->dirty_cap_item.is_on_list()) {
4489 	ldout(cct, 10) << __func__ << " changing auth cap: "
4490 		       << "add myself to new auth MDS' dirty caps list" << dendl;
4491 	mds_session->get_dirty_list().push_back(&in->dirty_cap_item);
4495       in->auth_cap = &cap;
4499   unsigned old_caps = cap.issued;
4500   cap.cap_id = cap_id;
4501   cap.issued = issued;
4502   cap.implemented |= issued;
4503   if (ceph_seq_cmp(mseq, cap.mseq) > 0)
4504     cap.wanted = wanted;
4506     cap.wanted |= wanted;
4508   cap.issue_seq = seq;
4510   cap.gen = mds_session->cap_gen;
4511   cap.latest_perms = cap_perms;
4512   ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
4513 		 << " from mds." << mds
4517   if ((issued & ~old_caps) && in->auth_cap == &cap) {
4518     // non-auth MDS is revoking the newly grant caps ?
4519     for (auto &p : in->caps) {
4520       if (&p.second == &cap)
4522       if (p.second.implemented & ~p.second.issued & issued) {
4523 	check_caps(in, CHECK_CAPS_NODELAY);
4529   if (issued & ~old_caps)
4530     signal_cond_list(in->waitfor_caps);
// Remove 'cap' from its inode and session; optionally queue a cap-release
// message for the MDS.  Closes the inode's snaprealm link when this was
// the last cap.
// NOTE(review): some original lines (the enqueue_cap_release argument list
// and auth-cap reset) are elided in this excerpt.
4533 void Client::remove_cap(Cap *cap, bool queue_release)
4535   auto &in = cap->inode;
4536   MetaSession *session = cap->session;
4537   mds_rank_t mds = cap->session->mds_num;
4539   ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;
4541   if (queue_release) {
4542     session->enqueue_cap_release(
4553   if (in.auth_cap == cap) {
4554     if (in.flushing_cap_item.is_on_list()) {
4555       ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
4556       in.flushing_cap_item.remove_myself();
4560   size_t n = in.caps.erase(mds);
4561   ceph_assert(n == 1);
4564   if (!in.is_any_caps()) {
4565     ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
4566     in.snaprealm_item.remove_myself();
4567     put_snap_realm(in.snaprealm);
4572 void Client::remove_all_caps(Inode
*in
)
4574 while (!in
->caps
.empty())
4575 remove_cap(&in
->caps
.begin()->second
, true);
// Tear down every cap held through session 's' (session lost / evicted):
// drop per-inode state, discard dirty/flushing caps, purge or release
// cached data, and wake anyone waiting on caps.
// NOTE(review): some original lines (branch structure, else arms) are
// elided in this excerpt — visible as jumps in the embedded line numbers.
4578 void Client::remove_session_caps(MetaSession *s, int err)
4580   ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;
4582   while (s->caps.size()) {
4583     Cap *cap = *s->caps.begin();
4584     InodeRef in(&cap->inode);
4585     bool dirty_caps = false;
4586     if (in->auth_cap == cap) {
4587       dirty_caps = in->dirty_caps | in->flushing_caps;
4588       in->wanted_max_size = 0;
4589       in->requested_max_size = 0;
       // locks held through a dead session can no longer be trusted
4590       if (in->has_any_filelocks())
4591 	in->flags |= I_ERROR_FILELOCK;
4593     auto caps = cap->implemented;
4594     if (cap->wanted | cap->issued)
4595       in->flags |= I_CAP_DROPPED;
4596     remove_cap(cap, false);
4597     in->cap_snaps.clear();
4599       lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
4600       if (in->flushing_caps) {
4601 	num_flushing_caps--;
4602 	in->flushing_cap_tids.clear();
4604       in->flushing_caps = 0;
4605       in->mark_caps_clean();
4606       put_inode(in.get());
4608     caps &= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER;
4609     if (caps && !in->caps_issued_mask(caps, true)) {
4610       if (err == -CEPHFS_EBLOCKLISTED) {
4611 	if (in->oset.dirty_or_tx) {
4612 	  lderr(cct) << __func__ << " still has dirty data on " << *in << dendl;
4613 	  in->set_async_err(err);
4615 	objectcacher->purge_set(&in->oset);
4617 	objectcacher->release_set(&in->oset);
4619       _schedule_invalidate_callback(in.get(), 0, 0);
4622     signal_cond_list(in->waitfor_caps);
4624   s->flushing_caps_tids.clear();
4625   sync_cond.notify_all();
// Invoke the remount callback (used to make the kernel trim its dentry
// cache).  Returns {callback return code, whether the client should abort}
// per the client_die_on_failed_* configuration and retry budget.
// NOTE(review): several original lines (error-branch structure, the
// should_abort declaration) are elided in this excerpt.
4628 std::pair<int, bool> Client::_do_remount(bool retry_on_error)
4630   uint64_t max_retries = cct->_conf.get_val<uint64_t>("client_max_retries_on_remount_failure");
4631   bool abort_on_failure = false;
4634   int r = remount_cb(callback_handle);
     // success resets the retry counter
4636     retries_on_invalidate = 0;
4639     client_t whoami = get_nodeid();
4642 	"failed to remount (to trim kernel dentries): "
4643 	"errno = " << e << " (" << strerror(e) << ")" << dendl;
4646 	"failed to remount (to trim kernel dentries): "
4647 	"return code = " << r << dendl;
4650       (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4651        cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
4652       !(retry_on_error && (++retries_on_invalidate < max_retries));
4653     if (should_abort && !is_unmounting()) {
4654       lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4655       abort_on_failure = true;
4658   return std::make_pair(r, abort_on_failure);
// Finisher context that performs a retried remount; aborts the process
// path is taken elsewhere when _do_remount reports abort_on_failure.
// NOTE(review): member declarations and the tail of finish() are elided
// in this excerpt.
4661 class C_Client_Remount : public Context  {
4665   explicit C_Client_Remount(Client *c) : client(c) {}
4666   void finish(int r) override {
4667     ceph_assert(r == 0);
4668     auto result = client->_do_remount(true);
4669     if (result.second) {
// Ask the kernel to drop cached dentries, either by invalidating root's
// child dentries one-by-one (dentry_invalidate_cb) or, failing that, by
// queueing a remount (remount_cb), which makes the kernel trim all unused
// dentries.  No-op unless the client is in a mounting/mounted state.
4675 void Client::_invalidate_kernel_dcache()
4677 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
4678 if (!mref_reader
.is_state_satisfied())
4681 if (can_invalidate_dentries
) {
4682 if (dentry_invalidate_cb
&& root
->dir
) {
// schedule an invalidate callback for every child dentry of root that
// still points at an inode
4683 for (ceph::unordered_map
<string
, Dentry
*>::iterator p
= root
->dir
->dentries
.begin();
4684 p
!= root
->dir
->dentries
.end();
4686 if (p
->second
->inode
)
4687 _schedule_invalidate_dentry_callback(p
->second
, false);
4690 } else if (remount_cb
) {
4692 // when remounting a file system, linux kernel trims all unused dentries in the fs
4693 remount_finisher
.queue(new C_Client_Remount(this));
// Drop expireable null (negative) dentries under 'in' when ALL of the
// directory's dentries are null, then recurse into the snapdir if one is
// open.  NOTE(review): the line declaring `dir` is not visible in this
// extraction — presumably `Dir *dir = in->dir`; confirm in full source.
4697 void Client::_trim_negative_child_dentries(InodeRef
& in
)
// only trim when every dentry in the dir is a null dentry
4703 if (dir
&& dir
->dentries
.size() == dir
->num_null_dentries
) {
4704 for (auto p
= dir
->dentries
.begin(); p
!= dir
->dentries
.end(); ) {
4705 Dentry
*dn
= p
->second
;
4707 ceph_assert(!dn
->inode
);
4708 if (dn
->lru_is_expireable())
4709 unlink(dn
, true, false); // keep dir, drop dentry
4711 if (dir
->dentries
.empty()) {
// also trim negative dentries under the (virtual) .snap directory
4716 if (in
->flags
& I_SNAPDIR_OPEN
) {
4717 InodeRef snapdir
= open_snapdir(in
.get());
4718 _trim_negative_child_dentries(snapdir
);
// Finisher context that delivers an inode-release notification
// (_async_inode_release) outside client_lock.  Captures the inode's
// vinodeno (using the faked ino if faked inos are enabled) at construction
// so the callback does not need the Inode alive.
4722 class C_Client_CacheRelease
: public Context
{
4727 C_Client_CacheRelease(Client
*c
, Inode
*in
) :
4729 if (client
->use_faked_inos())
4730 ino
= vinodeno_t(in
->faked_ino
, CEPH_NOSNAP
);
4734 void finish(int r
) override
{
// must run without client_lock held; _async_inode_release checks state itself
4735 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
4736 client
->_async_inode_release(ino
);
// Invoke the user-registered inode-release callback for 'ino'.
// Skipped unless the client is in a mounting/mounted state.
4740 void Client::_async_inode_release(vinodeno_t ino
)
4742 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
4743 if (!mref_reader
.is_state_satisfied())
4746 ldout(cct
, 10) << __func__
<< " " << ino
<< dendl
;
4747 ino_release_cb(callback_handle
, ino
);
// Queue an asynchronous inode-release notification for 'in'.
// NOTE(review): the guard line (presumably `if (ino_release_cb)`) is not
// visible in this extraction — confirm before editing.
4750 void Client::_schedule_ino_release_callback(Inode
*in
) {
4753 // we queue the invalidate, which calls the callback and decrements the ref
4754 async_ino_releasor
.queue(new C_Client_CacheRelease(this, in
));
// Trim the session's cap count down toward 'max': drop disposable non-auth
// caps outright, otherwise trim negative child dentries and queue expireable
// dentries for unlink; schedule inode-release callbacks for inodes with a
// single ll ref.  If still over 'max' afterwards, fall back to invalidating
// the whole kernel dcache.
4757 void Client::trim_caps(MetaSession
*s
, uint64_t max
)
4759 mds_rank_t mds
= s
->mds_num
;
4760 size_t caps_size
= s
->caps
.size();
4761 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " max " << max
4762 << " caps " << caps_size
<< dendl
;
4764 uint64_t trimmed
= 0;
4765 auto p
= s
->caps
.begin();
4766 std::set
<Dentry
*> to_trim
; /* this avoids caps other than the one we're
4767 * looking at from getting deleted during traversal. */
4768 while ((caps_size
- trimmed
) > max
&& !p
.end()) {
// hold an InodeRef so the inode survives while we operate on it
4770 InodeRef
in(&cap
->inode
);
4772 // Increment p early because it will be invalidated if cap
4773 // is deleted inside remove_cap
4776 if (in
->caps
.size() > 1 && cap
!= in
->auth_cap
) {
4777 int mine
= cap
->issued
| cap
->implemented
;
4778 int oissued
= in
->auth_cap
? in
->auth_cap
->issued
: 0;
4779 // disposable non-auth cap
4780 if (!(get_caps_used(in
.get()) & ~oissued
& mine
)) {
4781 ldout(cct
, 20) << " removing unused, unneeded non-auth cap on " << *in
<< dendl
;
// comma expression: remove the cap, then null the dangling pointer
4782 cap
= (remove_cap(cap
, true), nullptr);
4786 ldout(cct
, 20) << " trying to trim dentries for " << *in
<< dendl
;
4787 _trim_negative_child_dentries(in
);
4789 auto q
= in
->dentries
.begin();
4790 while (q
!= in
->dentries
.end()) {
4793 if (dn
->lru_is_expireable()) {
4794 if (can_invalidate_dentries
&&
4795 dn
->dir
->parent_inode
->ino
== CEPH_INO_ROOT
) {
4796 // Only issue one of these per DN for inodes in root: handle
4797 // others more efficiently by calling for root-child DNs at
4798 // the end of this function.
4799 _schedule_invalidate_dentry_callback(dn
, true);
4801 ldout(cct
, 20) << " queueing dentry for trimming: " << dn
->name
<< dendl
;
4804 ldout(cct
, 20) << " not expirable: " << dn
->name
<< dendl
;
// release the inode via callback when only one ll reference remains
4808 if (in
->ll_ref
== 1 && in
->ino
!= CEPH_INO_ROOT
) {
4809 _schedule_ino_release_callback(in
.get());
// `all` is computed on lines not visible here — presumably "all dentries
// were expireable"; confirm in full source
4811 if (all
&& in
->ino
!= CEPH_INO_ROOT
) {
4812 ldout(cct
, 20) << __func__
<< " counting as trimmed: " << *in
<< dendl
;
4817 ldout(cct
, 20) << " trimming queued dentries: " << dendl
;
4818 for (const auto &dn
: to_trim
) {
// re-check: if trimming was not enough, blow away the kernel dcache
4823 caps_size
= s
->caps
.size();
4824 if (caps_size
> (size_t)max
)
4825 _invalidate_kernel_dcache();
// Wake up any waiters that want write caps on inodes belonging to this
// session (used when the session is forced read-only so blocked writers
// can observe the new state).
4828 void Client::force_session_readonly(MetaSession
*s
)
4831 for (xlist
<Cap
*>::iterator p
= s
->caps
.begin(); !p
.end(); ++p
) {
4832 auto &in
= (*p
)->inode
;
4833 if (in
.caps_wanted() & CEPH_CAP_FILE_WR
)
4834 signal_cond_list(in
.waitfor_caps
);
// Move the inode's dirty caps into the "flushing" state: allocate a new
// flush tid, record it on the inode and its auth session, and clear the
// dirty bits.  Returns the flushing cap mask; the tid is presumably also
// written through *ptid on a line not visible here — confirm in full source.
4838 int Client::mark_caps_flushing(Inode
*in
, ceph_tid_t
* ptid
)
4840 MetaSession
*session
= in
->auth_cap
->session
;
4842 int flushing
= in
->dirty_caps
;
4843 ceph_assert(flushing
);
4845 ceph_tid_t flush_tid
= ++last_flush_tid
;
4846 in
->flushing_cap_tids
[flush_tid
] = flushing
;
// first flush on this inode bumps the global flushing-inode counter
4848 if (!in
->flushing_caps
) {
4849 ldout(cct
, 10) << __func__
<< " " << ccap_string(flushing
) << " " << *in
<< dendl
;
4850 num_flushing_caps
++;
4852 ldout(cct
, 10) << __func__
<< " (more) " << ccap_string(flushing
) << " " << *in
<< dendl
;
4855 in
->flushing_caps
|= flushing
;
4856 in
->mark_caps_clean();
// link the inode onto the session's flushing list exactly once
4858 if (!in
->flushing_cap_item
.is_on_list())
4859 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4860 session
->flushing_caps_tids
.insert(flush_tid
);
// Migrate all of the inode's in-flight flush tids (capsnap flushes and cap
// flushes) from old_s to new_s, and move the inode onto new_s's flushing
// list.  Used when the auth cap moves between MDS sessions.
4866 void Client::adjust_session_flushing_caps(Inode
*in
, MetaSession
*old_s
, MetaSession
*new_s
)
4868 for (auto &p
: in
->cap_snaps
) {
4869 CapSnap
&capsnap
= p
.second
;
4870 if (capsnap
.flush_tid
> 0) {
4871 old_s
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
4872 new_s
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
4875 for (map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4876 it
!= in
->flushing_cap_tids
.end();
4878 old_s
->flushing_caps_tids
.erase(it
->first
);
4879 new_s
->flushing_caps_tids
.insert(it
->first
);
// xlist push_back implicitly removes the item from old_s's list first
4881 new_s
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4885 * Flush all the dirty caps back to the MDS. Because the callers
4886 * generally wait on the result of this function (syncfs and umount
4887 * cases), we set CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4889 void Client::flush_caps_sync()
4891 ldout(cct
, 10) << __func__
<< dendl
;
// walk every session's dirty-inode list and push each through check_caps
4892 for (auto &q
: mds_sessions
) {
4894 xlist
<Inode
*>::iterator p
= s
->dirty_list
.begin();
4896 unsigned flags
= CHECK_CAPS_NODELAY
;
// last inode on the list flushes synchronously (see comment above); the
// "is this the last one" test is on lines not visible here
4901 flags
|= CHECK_CAPS_SYNCHRONOUS
;
4902 check_caps(in
, flags
);
// Block until every cap flush on 'in' with tid <= want has been acked
// (woken via in->waitfor_caps as flush acks arrive).
4907 void Client::wait_sync_caps(Inode
*in
, ceph_tid_t want
)
4909 while (in
->flushing_caps
) {
4910 map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4911 ceph_assert(it
!= in
->flushing_cap_tids
.end());
// oldest outstanding tid is already newer than what we want — done
4912 if (it
->first
> want
)
4914 ldout(cct
, 10) << __func__
<< " on " << *in
<< " flushing "
4915 << ccap_string(it
->second
) << " want " << want
4916 << " last " << it
->first
<< dendl
;
4917 wait_on_list(in
->waitfor_caps
);
// Block until every session has acked all cap flushes with tid <= want,
// waiting on sync_cond (notified by the flush-ack handlers).
4921 void Client::wait_sync_caps(ceph_tid_t want
)
4924 ldout(cct
, 10) << __func__
<< " want " << want
<< " (last is " << last_flush_tid
<< ", "
4925 << num_flushing_caps
<< " total flushing)" << dendl
;
4926 for (auto &p
: mds_sessions
) {
4928 if (s
->flushing_caps_tids
.empty())
4930 ceph_tid_t oldest_tid
= *s
->flushing_caps_tids
.begin();
4931 if (oldest_tid
<= want
) {
4932 ldout(cct
, 10) << " waiting on mds." << p
.first
<< " tid " << oldest_tid
4933 << " (want " << want
<< ")" << dendl
;
// adopt the already-held client_lock so sync_cond can wait on it; the
// wait and lock re-release are on lines not visible here
4934 std::unique_lock l
{client_lock
, std::adopt_lock
};
// Re-send all outstanding cap flushes (and capsnap flushes) for 'in' to its
// auth session, e.g. after reconnect.  Cap flushes older than the newest
// pending snap flush are tagged FLAG_PENDING_CAPSNAP so the MDS orders them
// correctly.
4942 void Client::kick_flushing_caps(Inode
*in
, MetaSession
*session
)
4944 in
->flags
&= ~I_KICK_FLUSH
;
4946 Cap
*cap
= in
->auth_cap
;
4947 ceph_assert(cap
->session
== session
);
// find the newest tid that corresponds to a snap flush (scanning newest to
// oldest; the loop-body test distinguishing snap flushes is not visible)
4949 ceph_tid_t last_snap_flush
= 0;
4950 for (auto p
= in
->flushing_cap_tids
.rbegin();
4951 p
!= in
->flushing_cap_tids
.rend();
4954 last_snap_flush
= p
->first
;
4959 int wanted
= in
->caps_wanted();
4960 int used
= get_caps_used(in
) | in
->caps_dirty();
4961 auto it
= in
->cap_snaps
.begin();
4962 for (auto& p
: in
->flushing_cap_tids
) {
4964 int msg_flags
= p
.first
< last_snap_flush
? MClientCaps::FLAG_PENDING_CAPSNAP
: 0;
4965 send_cap(in
, session
, cap
, msg_flags
, used
, wanted
, (cap
->issued
| cap
->implemented
),
// snap-flush branch: tids must match the cap_snaps iteration in order
4968 ceph_assert(it
!= in
->cap_snaps
.end());
4969 ceph_assert(it
->second
.flush_tid
== p
.first
);
4970 send_flush_snap(in
, session
, it
->first
, it
->second
);
// Re-send flushes for every inode on this session's flushing list that was
// marked I_KICK_FLUSH (set by early_kick_flushing_caps or cap import).
4976 void Client::kick_flushing_caps(MetaSession
*session
)
4978 mds_rank_t mds
= session
->mds_num
;
4979 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
4981 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4983 if (in
->flags
& I_KICK_FLUSH
) {
4984 ldout(cct
, 20) << " reflushing caps on " << *in
<< " to mds." << mds
<< dendl
;
4985 kick_flushing_caps(in
, session
);
// During reconnect: for inodes whose flushing caps were NOT revoked, just
// mark them I_KICK_FLUSH for a later kick; for those whose flushing caps
// were revoked, re-send the flush immediately so the MDS processes it
// before re-issuing those caps to another client.
4990 void Client::early_kick_flushing_caps(MetaSession
*session
)
4992 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4994 Cap
*cap
= in
->auth_cap
;
4997 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4998 // stage. This guarantees that MDS processes the cap flush message before issuing
4999 // the flushing caps to other client.
5000 if ((in
->flushing_caps
& in
->auth_cap
->issued
) == in
->flushing_caps
) {
// nothing revoked: defer to the normal (post-reconnect) kick
5001 in
->flags
|= I_KICK_FLUSH
;
5005 ldout(cct
, 20) << " reflushing caps (early_kick) on " << *in
5006 << " to mds." << session
->mds_num
<< dendl
;
5007 // send_reconnect() also will reset these sequence numbers. make sure
5008 // sequence numbers in cap flush message match later reconnect message.
5012 cap
->issued
= cap
->implemented
;
5014 kick_flushing_caps(in
, session
);
// Invalidate the cached snap context of 'realm' and all of its descendant
// realms, via an explicit worklist (q) rather than recursion.
5018 void Client::invalidate_snaprealm_and_children(SnapRealm
*realm
)
5023 while (!q
.empty()) {
5027 ldout(cct
, 10) << __func__
<< " " << *realm
<< dendl
;
5028 realm
->invalidate_cache();
// push every child realm onto the worklist (push itself not visible here)
5030 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
5031 p
!= realm
->pchildren
.end();
*Client::get_snap_realm(inodeno_t r
)
5039 SnapRealm
*realm
= snap_realms
[r
];
5041 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< ", nref was "
5042 << (realm
? realm
->nref
: 0) << dendl
;
5044 snap_realms
[r
] = realm
= new SnapRealm(r
);
5046 // Do not release the global snaprealm until unmounting.
5047 if (r
== CEPH_INO_GLOBAL_SNAPREALM
)
5052 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< ", nref now is "
5053 << realm
->nref
<< dendl
;
// Like get_snap_realm(), but returns without creating when no realm exists
// for 'r' (the increment and returns are on lines not visible here).
5057 SnapRealm
*Client::get_snap_realm_maybe(inodeno_t r
)
5059 if (snap_realms
.count(r
) == 0) {
5060 ldout(cct
, 20) << __func__
<< " " << r
<< " fail" << dendl
;
5063 SnapRealm
*realm
= snap_realms
[r
];
5064 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
// Drop one reference on 'realm'; on the last ref, remove it from the
// snap_realms map, unlink it from its parent (dropping the parent ref,
// possibly cascading), and free it.
5069 void Client::put_snap_realm(SnapRealm
*realm
)
5071 ldout(cct
, 20) << __func__
<< " " << realm
->ino
<< " " << realm
5072 << " " << realm
->nref
<< " -> " << (realm
->nref
- 1) << dendl
;
5073 if (--realm
->nref
== 0) {
5074 snap_realms
.erase(realm
->ino
);
5075 if (realm
->pparent
) {
5076 realm
->pparent
->pchildren
.erase(realm
);
// recursive put may free ancestors too
5077 put_snap_realm(realm
->pparent
);
// Re-parent 'realm' under 'parent' if it changed: unlink from the old
// parent realm (dropping its ref) and link under the new one (taking a
// ref via get_snap_realm).  Returns whether anything changed (the return
// statements are on lines not visible here).
5083 bool Client::adjust_realm_parent(SnapRealm
*realm
, inodeno_t parent
)
5085 if (realm
->parent
!= parent
) {
5086 ldout(cct
, 10) << __func__
<< " " << *realm
5087 << " " << realm
->parent
<< " -> " << parent
<< dendl
;
5088 realm
->parent
= parent
;
5089 if (realm
->pparent
) {
5090 realm
->pparent
->pchildren
.erase(realm
);
5091 put_snap_realm(realm
->pparent
);
5093 realm
->pparent
= get_snap_realm(parent
);
5094 realm
->pparent
->pchildren
.insert(realm
);
// True when new_snapc contains at least one snapid newer than old_snapc's
// seq, i.e. a snapshot was taken since old_snapc was captured.  Relies on
// snaps[0] being the newest snap in a SnapContext.
5100 static bool has_new_snaps(const SnapContext
& old_snapc
,
5101 const SnapContext
& new_snapc
)
5103 return !new_snapc
.snaps
.empty() && new_snapc
.snaps
[0] > old_snapc
.seq
;
// Extra per-realm metadata (last_modified, change_attr) carried only by the
// newer snaprealm-info wire encoding; std::nullopt when the MDS sends the
// legacy encoding (see get_snap_realm_info below).
5106 struct SnapRealmInfoMeta
{
5107 SnapRealmInfoMeta(utime_t last_modified
, uint64_t change_attr
)
5108 : last_modified(last_modified
),
5109 change_attr(change_attr
) {
// realm mtime from the MDS
5112 utime_t last_modified
;
// realm change counter from the MDS
5113 uint64_t change_attr
;
// Decode one snaprealm-info record from 'p', choosing the new encoding
// (SnapRealmInfoNew, which includes last_modified/change_attr) when the
// session's MDS advertises CEPHFS_FEATURE_NEW_SNAPREALM_INFO, else the
// legacy SnapRealmInfo (the legacy decode of `info` is on lines not
// visible here).
5116 static std::pair
<SnapRealmInfo
, std::optional
<SnapRealmInfoMeta
>> get_snap_realm_info(
5117 MetaSession
*session
, bufferlist::const_iterator
&p
) {
5118 if (session
->mds_features
.test(CEPHFS_FEATURE_NEW_SNAPREALM_INFO
)) {
5119 SnapRealmInfoNew ninfo
;
5121 return std::make_pair(ninfo
.info
, SnapRealmInfoMeta(ninfo
.last_modified
, ninfo
.change_attr
));
5125 return std::make_pair(info
, std::nullopt
);
// Apply a snap trace (sequence of snaprealm-info records in 'bl') from the
// MDS: update each realm's snap metadata, re-verify parents, invalidate
// cached snap contexts of changed realms and their children, and (when
// 'flush' is set) queue cap snaps for inodes in realms that gained new
// snaps.  If realm_ret is non-null, the first realm is returned with a ref
// held; otherwise that ref is dropped.
5130 void Client::update_snap_trace(MetaSession
*session
, const bufferlist
& bl
, SnapRealm
**realm_ret
, bool flush
)
5132 SnapRealm
*first_realm
= NULL
;
5133 ldout(cct
, 10) << __func__
<< " len " << bl
.length() << dendl
;
// realms whose old snap context must be checked for new snaps afterwards
5135 map
<SnapRealm
*, SnapContext
> dirty_realms
;
5137 auto p
= bl
.cbegin();
5139 auto [info
, realm_info_meta
] = get_snap_realm_info(session
, p
);
5140 SnapRealm
*realm
= get_snap_realm(info
.ino());
5142 bool invalidate
= false;
// newer seq (or newer change_attr with the new encoding) means this
// realm's snap metadata must be updated
5144 if (info
.seq() > realm
->seq
||
5145 (realm_info_meta
&& (*realm_info_meta
).change_attr
> realm
->change_attr
)) {
5146 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq() << " > " << realm
->seq
5150 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
5151 // flush me + children
5154 while (!q
.empty()) {
5155 SnapRealm
*realm
= q
.front();
5158 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
5159 p
!= realm
->pchildren
.end();
// record the pre-update snap context once per realm
5163 if (dirty_realms
.count(realm
) == 0) {
5165 dirty_realms
[realm
] = realm
->get_snap_context();
// copy the new snap metadata into the realm
5171 realm
->seq
= info
.seq();
5172 realm
->created
= info
.created();
5173 realm
->parent_since
= info
.parent_since();
5174 realm
->prior_parent_snaps
= info
.prior_parent_snaps
;
5175 if (realm_info_meta
) {
5176 realm
->last_modified
= (*realm_info_meta
).last_modified
;
5177 realm
->change_attr
= (*realm_info_meta
).change_attr
;
5179 realm
->my_snaps
= info
.my_snaps
;
5183 // _always_ verify parent
5184 if (adjust_realm_parent(realm
, info
.parent()))
5188 invalidate_snaprealm_and_children(realm
);
5189 ldout(cct
, 15) << __func__
<< " " << *realm
<< " self|parent updated" << dendl
;
5190 ldout(cct
, 15) << " snapc " << realm
->get_snap_context() << dendl
;
5192 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq()
5193 << " <= " << realm
->seq
<< " and same parent, SKIPPING" << dendl
;
5197 first_realm
= realm
;
5199 put_snap_realm(realm
);
// second pass: queue cap snaps on inodes in realms that gained new snaps
5202 for (auto &[realm
, snapc
] : dirty_realms
) {
5203 // if there are new snaps ?
5204 if (has_new_snaps(snapc
, realm
->get_snap_context())) {
5205 ldout(cct
, 10) << " flushing caps on " << *realm
<< dendl
;
5206 for (auto&& in
: realm
->inodes_with_caps
) {
5207 queue_cap_snap(in
, snapc
);
5210 ldout(cct
, 10) << " no new snap on " << *realm
<< dendl
;
5212 put_snap_realm(realm
);
// hand the first realm (with its ref) to the caller, or drop it
5216 *realm_ret
= first_realm
;
5218 put_snap_realm(first_realm
);
// Handle an MClientSnap message.  For a SPLIT op, move the listed inodes
// (and child realms) out of their current realm into the newly split-off
// realm, then apply the attached snap trace; finally re-link the moved
// inodes into the new realm and queue snap writeback where new snaps
// appeared.
5221 void Client::handle_snap(const MConstRef
<MClientSnap
>& m
)
5223 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
5224 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
5226 std::scoped_lock
cl(client_lock
);
5227 auto session
= _get_mds_session(mds
, m
->get_connection().get());
5232 got_mds_push(session
.get());
// inodes to move into the split realm, keyed to their OLD snap context
5234 map
<Inode
*, SnapContext
> to_move
;
5235 SnapRealm
*realm
= 0;
5237 if (m
->head
.op
== CEPH_SNAP_OP_SPLIT
) {
5238 ceph_assert(m
->head
.split
);
5239 auto p
= m
->bl
.cbegin();
5240 auto [info
, _
] = get_snap_realm_info(session
.get(), p
);
5241 ceph_assert(info
.ino() == m
->head
.split
);
5243 // flush, then move, ino's.
5244 realm
= get_snap_realm(info
.ino());
5245 ldout(cct
, 10) << " splitting off " << *realm
<< dendl
;
5246 for (auto& ino
: m
->split_inos
) {
5247 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
5248 if (inode_map
.count(vino
)) {
5249 Inode
*in
= inode_map
[vino
];
5250 if (!in
->snaprealm
|| in
->snaprealm
== realm
)
// don't move inodes that already live in a realm created after the split
5252 if (in
->snaprealm
->created
> info
.created()) {
5253 ldout(cct
, 10) << " NOT moving " << *in
<< " from _newer_ realm "
5254 << *in
->snaprealm
<< dendl
;
5257 ldout(cct
, 10) << " moving " << *in
<< " from " << *in
->snaprealm
<< dendl
;
// detach from the old realm; remember its old snap context for later
5260 in
->snaprealm_item
.remove_myself();
5261 to_move
[in
] = in
->snaprealm
->get_snap_context();
5262 put_snap_realm(in
->snaprealm
);
5266 // move child snaprealms, too
5267 for (auto& child_realm
: m
->split_realms
) {
5268 ldout(cct
, 10) << "adjusting snaprealm " << child_realm
<< " parent" << dendl
;
5269 SnapRealm
*child
= get_snap_realm_maybe(child_realm
);
5272 adjust_realm_parent(child
, realm
->ino
);
5273 put_snap_realm(child
);
// apply the snap trace; flush unless the op is DESTROY
5277 update_snap_trace(session
.get(), m
->bl
, NULL
, m
->head
.op
!= CEPH_SNAP_OP_DESTROY
);
5280 for (auto p
= to_move
.begin(); p
!= to_move
.end(); ++p
) {
5281 Inode
*in
= p
->first
;
5282 in
->snaprealm
= realm
;
5283 realm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
5285 // queue for snap writeback
5286 if (has_new_snaps(p
->second
, realm
->get_snap_context()))
5287 queue_cap_snap(in
, p
->second
);
5289 put_snap_realm(realm
);
// Handle an MClientQuota message: copy the quota and rstat the MDS sent
// onto the matching in-memory inode, if we have it.
5293 void Client::handle_quota(const MConstRef
<MClientQuota
>& m
)
5295 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
5297 std::scoped_lock
cl(client_lock
);
5298 auto session
= _get_mds_session(mds
, m
->get_connection().get());
5303 got_mds_push(session
.get());
5305 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << mds
<< dendl
;
5307 vinodeno_t
vino(m
->ino
, CEPH_NOSNAP
);
5308 if (inode_map
.count(vino
)) {
5310 in
= inode_map
[vino
];
5313 in
->quota
= m
->quota
;
5314 in
->rstat
= m
->rstat
;
// Top-level dispatcher for MClientCaps messages: honor any OSD epoch
// barrier, immediately release caps the MDS references but we don't hold,
// then route to the per-op handlers (export / flushsnap-ack / import /
// trunc / grant / revoke / flush-ack).
5319 void Client::handle_caps(const MConstRef
<MClientCaps
>& m
)
5321 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
5323 std::scoped_lock
cl(client_lock
);
5324 auto session
= _get_mds_session(mds
, m
->get_connection().get());
5329 if (m
->osd_epoch_barrier
&& !objecter
->have_map(m
->osd_epoch_barrier
)) {
5330 // Pause RADOS operations until we see the required epoch
5331 objecter
->set_epoch_barrier(m
->osd_epoch_barrier
);
5334 if (m
->osd_epoch_barrier
> cap_epoch_barrier
) {
5335 // Record the barrier so that we will transmit it to MDS when releasing
5336 set_cap_epoch_barrier(m
->osd_epoch_barrier
);
5339 got_mds_push(session
.get());
5341 bool do_cap_release
= false;
5343 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
5344 if (auto it
= inode_map
.find(vino
); it
!= inode_map
.end()) {
// we have the inode, but may not hold a cap from this MDS
5347 /* MDS maybe waiting for cap release with increased seq */
5348 switch (m
->get_op()) {
5349 case CEPH_CAP_OP_REVOKE
:
5350 case CEPH_CAP_OP_GRANT
:
5351 if (!in
->caps
.count(mds
)) {
5352 do_cap_release
= true;
5353 ldout(cct
, 5) << __func__
<< " vino " << vino
<< " don't have cap "
5354 << m
->get_cap_id() << " op " << m
->get_op()
5355 << ", immediately releasing" << dendl
;
// inode unknown entirely: release (or drop) depending on the op
5359 /* MDS maybe waiting for cap release with increased seq */
5360 switch (m
->get_op()) {
5361 case CEPH_CAP_OP_IMPORT
:
5362 case CEPH_CAP_OP_REVOKE
:
5363 case CEPH_CAP_OP_GRANT
:
5364 do_cap_release
= true;
5365 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< " op "
5366 << m
->get_op() << ", immediately releasing" << dendl
;
5369 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< ", dropping" << dendl
;
5374 // In case the mds is waiting on e.g. a revocation
5375 if (do_cap_release
) {
5376 session
->enqueue_cap_release(
5383 flush_cap_releases();
// ops that don't require us to currently hold a cap from this mds
5387 switch (m
->get_op()) {
5388 case CEPH_CAP_OP_EXPORT
: return handle_cap_export(session
.get(), in
, m
);
5389 case CEPH_CAP_OP_FLUSHSNAP_ACK
: return handle_cap_flushsnap_ack(session
.get(), in
, m
);
5390 case CEPH_CAP_OP_IMPORT
: /* no return */ handle_cap_import(session
.get(), in
, m
);
// remaining ops need the cap from this mds (IMPORT falls through here
// after handle_cap_import so the grant can be processed too)
5393 if (auto it
= in
->caps
.find(mds
); it
!= in
->caps
.end()) {
5394 Cap
&cap
= in
->caps
.at(mds
);
5396 switch (m
->get_op()) {
5397 case CEPH_CAP_OP_TRUNC
: return handle_cap_trunc(session
.get(), in
, m
);
5398 case CEPH_CAP_OP_IMPORT
:
5399 case CEPH_CAP_OP_REVOKE
:
5400 case CEPH_CAP_OP_GRANT
: return handle_cap_grant(session
.get(), in
, &cap
, m
);
5401 case CEPH_CAP_OP_FLUSH_ACK
: return handle_cap_flush_ack(session
.get(), in
, &cap
, m
);
5404 ldout(cct
, 5) << __func__
<< " don't have " << *in
<< " cap on mds." << mds
<< dendl
;
// Handle cap IMPORT: apply the snap trace, install/refresh the (auth) cap
// from the importing MDS, drop the exported peer cap if one matches, and
// if we became auth on this session reset stale max_size requests and
// re-flush any outstanding dirty caps.
5409 void Client::handle_cap_import(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5411 mds_rank_t mds
= session
->mds_num
;
5413 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
5414 << " IMPORT from mds." << mds
<< dendl
;
5416 const mds_rank_t peer_mds
= mds_rank_t(m
->peer
.mds
);
// inherit perms from the exported peer cap, if we still hold it
// (the `cap` / `cap_perms` declarations are on lines not visible here)
5419 if (auto it
= in
->caps
.find(peer_mds
); m
->peer
.cap_id
&& it
!= in
->caps
.end()) {
5421 cap_perms
= cap
->latest_perms
;
5425 SnapRealm
*realm
= NULL
;
5426 update_snap_trace(session
, m
->snapbl
, &realm
);
5428 int issued
= m
->get_caps();
5429 int wanted
= m
->get_wanted();
// install the imported cap as the auth cap
5430 add_update_cap(in
, session
, m
->get_cap_id(),
5431 issued
, wanted
, m
->get_seq(), m
->get_mseq(),
5432 m
->get_realm(), CEPH_CAP_FLAG_AUTH
, cap_perms
);
// retire the peer's (exporting) cap
5434 if (cap
&& cap
->cap_id
== m
->peer
.cap_id
) {
5435 remove_cap(cap
, (m
->peer
.flags
& CEPH_CAP_FLAG_RELEASE
));
5439 put_snap_realm(realm
);
5441 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
5442 if (!(wanted
& CEPH_CAP_ANY_FILE_WR
) ||
5443 in
->requested_max_size
> m
->get_max_size()) {
5444 in
->requested_max_size
= 0;
5445 ldout(cct
, 15) << "reset requested_max_size after cap import" << dendl
;
5447 // reflush any/all caps (if we are now the auth_cap)
5448 kick_flushing_caps(in
, session
);
// Handle cap EXPORT: the sending MDS is handing our cap to a peer MDS.
// If we already hold a cap from the peer, merge the exported cap's issued
// bits into it (and migrate auth/flushing state); otherwise install a new
// cap on the peer session.  Finally remove the exported cap, remembering
// I_CAP_DROPPED if anything was wanted/issued.
5452 void Client::handle_cap_export(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5454 mds_rank_t mds
= session
->mds_num
;
5456 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
5457 << " EXPORT from mds." << mds
<< dendl
;
5459 auto it
= in
->caps
.find(mds
);
5460 if (it
!= in
->caps
.end()) {
5461 Cap
&cap
= it
->second
;
// only act if the message refers to the cap we actually hold
5462 if (cap
.cap_id
== m
->get_cap_id()) {
5463 if (m
->peer
.cap_id
) {
5464 const auto peer_mds
= mds_rank_t(m
->peer
.mds
);
5465 auto tsession
= _get_or_open_mds_session(peer_mds
);
5466 auto it
= in
->caps
.find(peer_mds
);
5467 if (it
!= in
->caps
.end()) {
5468 Cap
&tcap
= it
->second
;
// merge into the existing peer cap if it matches and is older
5469 if (tcap
.cap_id
== m
->peer
.cap_id
&&
5470 ceph_seq_cmp(tcap
.seq
, m
->peer
.seq
) < 0) {
5471 tcap
.cap_id
= m
->peer
.cap_id
;
5472 tcap
.seq
= m
->peer
.seq
- 1;
5473 tcap
.issue_seq
= tcap
.seq
;
5474 tcap
.issued
|= cap
.issued
;
5475 tcap
.implemented
|= cap
.issued
;
// transfer auth-ness and any flushing state to the peer cap/session
5476 if (&cap
== in
->auth_cap
)
5477 in
->auth_cap
= &tcap
;
5478 if (in
->auth_cap
== &tcap
&& in
->flushing_cap_item
.is_on_list())
5479 adjust_session_flushing_caps(in
, session
, tsession
.get());
// no usable peer cap yet: create one carrying the exported issued bits
5482 add_update_cap(in
, tsession
.get(), m
->peer
.cap_id
, cap
.issued
, 0,
5483 m
->peer
.seq
- 1, m
->peer
.mseq
, (uint64_t)-1,
5484 &cap
== in
->auth_cap
? CEPH_CAP_FLAG_AUTH
: 0,
// no peer: the cap is simply being dropped by the MDS
5488 if (cap
.wanted
| cap
.issued
)
5489 in
->flags
|= I_CAP_DROPPED
;
5492 remove_cap(&cap
, false);
// Handle cap TRUNC: apply an MDS-driven size/truncate update to the inode.
// For fscrypt inodes the real size is parsed from the (reversed)
// fscrypt_file blob rather than m->get_size().
5497 void Client::handle_cap_trunc(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5499 mds_rank_t mds
= session
->mds_num
;
5500 ceph_assert(in
->caps
.count(mds
));
5502 uint64_t size
= m
->get_size();
5503 if (in
->is_fscrypt_enabled()) {
// fscrypt_file carries the size; reversed iteration suggests the bytes
// are stored little-end-first — confirm against the fscrypt support code
5504 size
= std::stoll(std::string(std::rbegin(m
->fscrypt_file
),
5505 std::rend(m
->fscrypt_file
)));
5507 ldout(cct
, 10) << __func__
<< " on ino " << *in
5508 << " size " << in
->size
<< " -> " << m
->get_size()
// combine issued and dirty bits to decide what the MDS may override
5512 in
->caps_issued(&issued
);
5513 issued
|= in
->caps_dirty();
5514 update_inode_file_size(in
, issued
, size
, m
->get_truncate_seq(),
5515 m
->get_truncate_size());
// Handle FLUSH_ACK: retire every flush tid <= the acked tid on both the
// inode and the session, compute which cap bits were cleaned, wake waiters
// (waitfor_caps / sync_cond), and clear the inode's flushing state when the
// last flush completes.
5518 void Client::handle_cap_flush_ack(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
5520 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
5521 int dirty
= m
->get_dirty();
5525 auto it
= in
->flushing_cap_tids
.begin();
// acks are expected in tid order; an ack newer than our oldest pending
// tid means earlier acks were missed
5526 if (it
->first
< flush_ack_tid
) {
5527 ldout(cct
, 0) << __func__
<< " mds." << session
->mds_num
5528 << " got unexpected flush ack tid " << flush_ack_tid
5529 << " expected is " << it
->first
<< dendl
;
5531 for (; it
!= in
->flushing_cap_tids
.end(); ) {
// `cleaned` starts as the bits of the exactly-acked tid, then bits still
// being flushed by newer tids are masked back out below
5537 if (it
->first
== flush_ack_tid
)
5538 cleaned
= it
->second
;
5539 if (it
->first
<= flush_ack_tid
) {
5540 session
->flushing_caps_tids
.erase(it
->first
);
5541 in
->flushing_cap_tids
.erase(it
++);
5545 cleaned
&= ~it
->second
;
5551 ldout(cct
, 5) << __func__
<< " mds." << session
->mds_num
5552 << " cleaned " << ccap_string(cleaned
) << " on " << *in
5553 << " with " << ccap_string(dirty
) << dendl
;
5556 signal_cond_list(in
->waitfor_caps
);
// wake wait_sync_caps() if this session has nothing older outstanding
5557 if (session
->flushing_caps_tids
.empty() ||
5558 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5559 sync_cond
.notify_all();
// forget the last dirtier's identity (guarding condition not visible here)
5563 in
->cap_dirtier_uid
= -1;
5564 in
->cap_dirtier_gid
= -1;
5568 ldout(cct
, 10) << " tid " << m
->get_client_tid() << " != any cap bit tids" << dendl
;
5570 if (in
->flushing_caps
) {
5571 ldout(cct
, 5) << " flushing_caps " << ccap_string(in
->flushing_caps
)
5572 << " -> " << ccap_string(in
->flushing_caps
& ~cleaned
) << dendl
;
5573 in
->flushing_caps
&= ~cleaned
;
5574 if (in
->flushing_caps
== 0) {
5575 ldout(cct
, 10) << " " << *in
<< " !flushing" << dendl
;
5576 num_flushing_caps
--;
5577 if (in
->flushing_cap_tids
.empty())
5578 in
->flushing_cap_item
.remove_myself();
5580 if (!in
->caps_dirty())
// Handle FLUSHSNAP_ACK: drop the acked CapSnap (matched by snap 'follows'
// and flush tid) from the inode and session bookkeeping, then wake waiters.
// A missing CapSnap is tolerated as a probable duplicate ack.
5587 void Client::handle_cap_flushsnap_ack(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5589 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
5590 mds_rank_t mds
= session
->mds_num
;
5591 ceph_assert(in
->caps
.count(mds
));
5592 snapid_t follows
= m
->get_snap_follows();
5594 if (auto it
= in
->cap_snaps
.find(follows
); it
!= in
->cap_snaps
.end()) {
5595 auto& capsnap
= it
->second
;
// tid mismatch: this ack is for a different (re)send — ignore
5596 if (flush_ack_tid
!= capsnap
.flush_tid
) {
5597 ldout(cct
, 10) << " tid " << flush_ack_tid
<< " != " << capsnap
.flush_tid
<< dendl
;
// keep the inode alive while we erase state that may hold its last ref
5599 InodeRef
tmp_ref(in
);
5600 ldout(cct
, 5) << __func__
<< " mds." << mds
<< " flushed snap follows " << follows
5601 << " on " << *in
<< dendl
;
5602 session
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
5603 in
->flushing_cap_tids
.erase(capsnap
.flush_tid
);
5604 if (in
->flushing_caps
== 0 && in
->flushing_cap_tids
.empty())
5605 in
->flushing_cap_item
.remove_myself();
5606 in
->cap_snaps
.erase(it
);
5608 signal_cond_list(in
->waitfor_caps
);
5609 if (session
->flushing_caps_tids
.empty() ||
5610 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5611 sync_cond
.notify_all();
5614 ldout(cct
, 5) << __func__
<< " DUP(?) mds." << mds
<< " flushed snap follows " << follows
5615 << " on " << *in
<< dendl
;
5616 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
// Finisher context that delivers a dentry-invalidate notification outside
// client_lock.  Captures the parent-dir vino, child vino (faked inos if
// enabled) and dentry name at construction so no Dentry/Inode pointers are
// held across the async hop.
5620 class C_Client_DentryInvalidate
: public Context
{
5627 C_Client_DentryInvalidate(Client
*c
, Dentry
*dn
, bool del
) :
5628 client(c
), name(dn
->name
) {
5629 if (client
->use_faked_inos()) {
5630 dirino
.ino
= dn
->dir
->parent_inode
->faked_ino
;
// child ino only captured on some path — the guard (presumably `if (del)`)
// is on a line not visible here
5632 ino
.ino
= dn
->inode
->faked_ino
;
5634 dirino
= dn
->dir
->parent_inode
->vino();
5636 ino
= dn
->inode
->vino();
// otherwise signal "no inode" with a zero ino
5639 ino
.ino
= inodeno_t();
5641 void finish(int r
) override
{
5642 // _async_dentry_invalidate is responsible for its own locking
5643 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
5644 client
->_async_dentry_invalidate(dirino
, ino
, name
);
// Invoke the user-registered dentry-invalidate callback for (dirino, name)
// -> ino.  Skipped unless the client is in a mounting/mounted state.
5648 void Client::_async_dentry_invalidate(vinodeno_t dirino
, vinodeno_t ino
, string
& name
)
5650 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
5651 if (!mref_reader
.is_state_satisfied())
5654 ldout(cct
, 10) << __func__
<< " '" << name
<< "' ino " << ino
5655 << " in dir " << dirino
<< dendl
;
5656 dentry_invalidate_cb(callback_handle
, dirino
, ino
, name
.c_str(), name
.length());
// Queue an async dentry invalidation, but only when a callback is
// registered and the dentry's inode is still referenced by the kernel
// (ll_ref > 0) — otherwise the kernel has nothing to invalidate.
5659 void Client::_schedule_invalidate_dentry_callback(Dentry
*dn
, bool del
)
5661 if (dentry_invalidate_cb
&& dn
->inode
->ll_ref
> 0)
5662 async_dentry_invalidator
.queue(new C_Client_DentryInvalidate(this, dn
, del
));
// Attempt to free 'in' by unlinking expireable child dentries (recursing
// into snapshot subtrees for snap inodes), trimming an open snapdir, and
// finally unlinking the inode's own dentries — optionally scheduling kernel
// dcache invalidation for them when sched_inval is set.
5665 void Client::_try_to_trim_inode(Inode
*in
, bool sched_inval
)
5667 int ref
= in
->get_nref();
5668 ldout(cct
, 5) << __func__
<< " in " << *in
<<dendl
;
5670 if (in
->dir
&& !in
->dir
->dentries
.empty()) {
5671 for (auto p
= in
->dir
->dentries
.begin();
5672 p
!= in
->dir
->dentries
.end(); ) {
5673 Dentry
*dn
= p
->second
;
5675 /* rmsnap removes whole subtree, need trim inodes recursively.
5676 * we don't need to invalidate dentries recursively. because
5677 * invalidating a directory dentry effectively invalidate
5679 if (in
->snapid
!= CEPH_NOSNAP
&& dn
->inode
&& dn
->inode
->is_dir())
5680 _try_to_trim_inode(dn
->inode
.get(), false);
5682 if (dn
->lru_is_expireable())
5683 unlink(dn
, true, false); // keep dir, drop dentry
5685 if (in
->dir
->dentries
.empty()) {
// an open .snap dir can pin the inode; trim it too
5691 if (ref
> 1 && (in
->flags
& I_SNAPDIR_OPEN
)) {
5692 InodeRef snapdir
= open_snapdir(in
);
5693 _try_to_trim_inode(snapdir
.get(), false);
// drop the inode's own (parent-side) dentries
5698 auto q
= in
->dentries
.begin();
5699 while (q
!= in
->dentries
.end()) {
5702 if( in
->ll_ref
> 0 && sched_inval
) {
5703 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5704 // so in->dentries doesn't always reflect the state of kernel's dcache.
5705 _schedule_invalidate_dentry_callback(dn
, true);
5707 unlink(dn
, true, true);
5712 void Client::handle_cap_grant(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
5714 mds_rank_t mds
= session
->mds_num
;
5715 int used
= get_caps_used(in
);
5716 int wanted
= in
->caps_wanted();
5719 const unsigned new_caps
= m
->get_caps();
5720 const bool was_stale
= session
->cap_gen
> cap
->gen
;
5721 ldout(cct
, 5) << __func__
<< " on in " << m
->get_ino()
5722 << " mds." << mds
<< " seq " << m
->get_seq()
5723 << " caps now " << ccap_string(new_caps
)
5724 << " was " << ccap_string(cap
->issued
)
5725 << (was_stale
? " (stale)" : "") << dendl
;
5728 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
5729 cap
->seq
= m
->get_seq();
5730 cap
->gen
= session
->cap_gen
;
5732 check_cap_issue(in
, new_caps
);
5736 in
->caps_issued(&issued
);
5737 issued
|= in
->caps_dirty();
5739 if ((new_caps
& CEPH_CAP_AUTH_SHARED
) &&
5740 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
5741 in
->mode
= m
->head
.mode
;
5742 in
->uid
= m
->head
.uid
;
5743 in
->gid
= m
->head
.gid
;
5744 in
->btime
= m
->btime
;
5746 bool deleted_inode
= false;
5747 if ((new_caps
& CEPH_CAP_LINK_SHARED
) &&
5748 !(issued
& CEPH_CAP_LINK_EXCL
)) {
5749 in
->nlink
= m
->head
.nlink
;
5751 deleted_inode
= true;
5753 if (!(issued
& CEPH_CAP_XATTR_EXCL
) &&
5754 m
->xattrbl
.length() &&
5755 m
->head
.xattr_version
> in
->xattr_version
) {
5756 auto p
= m
->xattrbl
.cbegin();
5757 decode(in
->xattrs
, p
);
5758 in
->xattr_version
= m
->head
.xattr_version
;
5761 if ((new_caps
& CEPH_CAP_FILE_SHARED
) && m
->dirstat_is_valid()) {
5762 in
->dirstat
.nfiles
= m
->get_nfiles();
5763 in
->dirstat
.nsubdirs
= m
->get_nsubdirs();
5766 if (new_caps
& CEPH_CAP_ANY_RD
) {
5767 update_inode_file_time(in
, issued
, m
->get_time_warp_seq(),
5768 m
->get_ctime(), m
->get_mtime(), m
->get_atime());
5771 if (new_caps
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
)) {
5772 in
->layout
= m
->get_layout();
5773 update_inode_file_size(in
, issued
, m
->get_size(),
5774 m
->get_truncate_seq(), m
->get_truncate_size());
5777 if (m
->inline_version
> in
->inline_version
) {
5778 in
->inline_data
= m
->inline_data
;
5779 in
->inline_version
= m
->inline_version
;
5782 /* always take a newer change attr */
5783 if (m
->get_change_attr() > in
->change_attr
)
5784 in
->change_attr
= m
->get_change_attr();
5787 if (cap
== in
->auth_cap
&&
5788 (new_caps
& CEPH_CAP_ANY_FILE_WR
) &&
5789 (m
->get_max_size() != in
->max_size
)) {
5790 ldout(cct
, 10) << "max_size " << in
->max_size
<< " -> " << m
->get_max_size() << dendl
;
5791 in
->max_size
= m
->get_max_size();
5792 if (in
->max_size
> in
->wanted_max_size
) {
5793 in
->wanted_max_size
= 0;
5794 in
->requested_max_size
= 0;
5799 if ((was_stale
|| m
->get_op() == CEPH_CAP_OP_IMPORT
) &&
5800 (wanted
& ~(cap
->wanted
| new_caps
))) {
5801 // If mds is importing cap, prior cap messages that update 'wanted'
5802 // may get dropped by mds (migrate seq mismatch).
5804 // We don't send cap message to update 'wanted' if what we want are
5805 // already issued. If mds revokes caps, cap message that releases caps
5806 // also tells mds what we want. But if caps got revoked by mds forcedly
5807 // (session stale). We may haven't told mds what we want.
5813 auto revoked
= cap
->issued
& ~new_caps
;
5815 ldout(cct
, 10) << " revocation of " << ccap_string(revoked
) << dendl
;
5816 cap
->issued
= new_caps
;
5817 cap
->implemented
|= new_caps
;
5819 // recall delegations if we're losing caps necessary for them
5820 if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_RD
))
5821 in
->recall_deleg(false);
5822 else if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_WR
))
5823 in
->recall_deleg(true);
5825 used
= adjust_caps_used_for_lazyio(used
, cap
->issued
, cap
->implemented
);
5826 if ((used
& revoked
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
)) &&
5827 !_flush(in
, new C_Client_FlushComplete(this, in
))) {
5828 // waitin' for flush
5829 } else if (used
& revoked
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
)) {
5832 flags
= CHECK_CAPS_NODELAY
;
5835 cap
->wanted
= 0; // don't let check_caps skip sending a response to MDS
5837 flags
= CHECK_CAPS_NODELAY
;
5839 } else if (cap
->issued
== new_caps
) {
5840 ldout(cct
, 10) << " caps unchanged at " << ccap_string(cap
->issued
) << dendl
;
5842 ldout(cct
, 10) << " grant, new caps are " << ccap_string(new_caps
& ~cap
->issued
) << dendl
;
5843 cap
->issued
= new_caps
;
5844 cap
->implemented
|= new_caps
;
5846 if (cap
== in
->auth_cap
) {
5847 // non-auth MDS is revoking the newly grant caps ?
5848 for (const auto &p
: in
->caps
) {
5849 if (&p
.second
== cap
)
5851 if (p
.second
.implemented
& ~p
.second
.issued
& new_caps
) {
5859 // just in case the caps was released just before we get the revoke msg
5860 if (!check
&& m
->get_op() == CEPH_CAP_OP_REVOKE
) {
5861 cap
->wanted
= 0; // don't let check_caps skip sending a response to MDS
5863 flags
= CHECK_CAPS_NODELAY
;
5867 check_caps(in
, flags
);
5871 signal_cond_list(in
->waitfor_caps
);
5873 // may drop inode's last ref
5875 _try_to_trim_inode(in
, true);
5878 int Client::inode_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
5880 if (perms
.uid() == 0) {
5881 // For directories, DACs are overridable.
5882 // For files, Read/write DACs are always overridable but executable DACs are
5883 // overridable when there is at least one exec bit set
5884 if(!S_ISDIR(in
->mode
) && (want
& MAY_EXEC
) && !(in
->mode
& S_IXUGO
))
5885 return -CEPHFS_EACCES
;
5889 if (perms
.uid() != in
->uid
&& (in
->mode
& S_IRWXG
)) {
5890 int ret
= _posix_acl_permission(in
, perms
, want
);
5891 if (ret
!= -CEPHFS_EAGAIN
)
5895 // check permissions before doing anything else
5896 if (!in
->check_mode(perms
, want
))
5897 return -CEPHFS_EACCES
;
5901 int Client::xattr_permission(Inode
*in
, const char *name
, unsigned want
,
5902 const UserPerm
& perms
)
5904 int r
= _getattr_for_perm(in
, perms
);
5909 if (strncmp(name
, "system.", 7) == 0) {
5910 if ((want
& MAY_WRITE
) && (perms
.uid() != 0 && perms
.uid() != in
->uid
))
5913 r
= inode_permission(in
, perms
, want
);
5916 ldout(cct
, 5) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5920 std::ostream
& operator<<(std::ostream
&out
, const UserPerm
& perm
) {
5921 out
<< "UserPerm(uid: " << perm
.uid() << ", gid: " << perm
.gid() << ")";
5925 int Client::may_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
5926 const UserPerm
& perms
)
5928 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< " stx_mode: "
5929 << hex
<< stx
->stx_mode
<< " mask:" << mask
<< dec
<< dendl
;
5930 int r
= _getattr_for_perm(in
, perms
);
5934 if (mask
& CEPH_SETATTR_SIZE
) {
5935 r
= inode_permission(in
, perms
, MAY_WRITE
);
5941 if (mask
& CEPH_SETATTR_UID
) {
5942 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
|| stx
->stx_uid
!= in
->uid
))
5945 if (mask
& CEPH_SETATTR_GID
) {
5946 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
||
5947 (!perms
.gid_in_groups(stx
->stx_gid
) && stx
->stx_gid
!= in
->gid
)))
5951 if (mask
& CEPH_SETATTR_MODE
) {
5952 uint32_t m
= ~stx
->stx_mode
& in
->mode
; // mode bits removed
5953 ldout(cct
, 20) << __func__
<< " " << *in
<< " = " << hex
<< m
<< dec
<< dendl
;
5954 if (perms
.uid() != 0 && perms
.uid() != in
->uid
&&
5956 * Currently the kernel fuse and libfuse code is buggy and
5957 * won't pass the ATTR_KILL_SUID/ATTR_KILL_SGID to ceph-fuse.
5958 * But will just set the ATTR_MODE and at the same time by
5959 * clearing the suid/sgid bits.
5961 * Only allow unprivileged users to clear S_ISUID and S_ISUID.
5963 (m
& ~(S_ISUID
| S_ISGID
)))
5966 gid_t i_gid
= (mask
& CEPH_SETATTR_GID
) ? stx
->stx_gid
: in
->gid
;
5967 if (perms
.uid() != 0 && !perms
.gid_in_groups(i_gid
))
5968 stx
->stx_mode
&= ~S_ISGID
;
5971 if (mask
& (CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
|
5972 CEPH_SETATTR_MTIME
| CEPH_SETATTR_ATIME
)) {
5973 if (perms
.uid() != 0 && perms
.uid() != in
->uid
) {
5974 int check_mask
= CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
;
5975 if (!(mask
& CEPH_SETATTR_MTIME_NOW
))
5976 check_mask
|= CEPH_SETATTR_MTIME
;
5977 if (!(mask
& CEPH_SETATTR_ATIME_NOW
))
5978 check_mask
|= CEPH_SETATTR_ATIME
;
5979 if (check_mask
& mask
) {
5982 r
= inode_permission(in
, perms
, MAY_WRITE
);
5990 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5994 int Client::may_open(Inode
*in
, int flags
, const UserPerm
& perms
)
5996 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5999 if ((flags
& O_ACCMODE
) == O_WRONLY
)
6001 else if ((flags
& O_ACCMODE
) == O_RDWR
)
6002 want
= MAY_READ
| MAY_WRITE
;
6003 else if ((flags
& O_ACCMODE
) == O_RDONLY
)
6005 if (flags
& O_TRUNC
)
6009 switch (in
->mode
& S_IFMT
) {
6014 if (want
& MAY_WRITE
) {
6021 r
= _getattr_for_perm(in
, perms
);
6025 r
= inode_permission(in
, perms
, want
);
6027 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
6031 int Client::may_lookup(Inode
*dir
, const UserPerm
& perms
)
6033 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
6034 int r
= _getattr_for_perm(dir
, perms
);
6038 r
= inode_permission(dir
, perms
, MAY_EXEC
);
6040 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
6044 int Client::may_create(Inode
*dir
, const UserPerm
& perms
)
6046 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
6047 int r
= _getattr_for_perm(dir
, perms
);
6051 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
6053 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
6057 int Client::may_delete(Inode
*dir
, const char *name
, const UserPerm
& perms
)
6059 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << "; name " << name
<< "; " << perms
<< dendl
;
6060 int r
= _getattr_for_perm(dir
, perms
);
6064 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
6068 /* 'name == NULL' means rmsnap w/o permission checks */
6069 if (perms
.uid() != 0 && name
&& (dir
->mode
& S_ISVTX
)) {
6071 r
= _lookup(dir
, name
, CEPH_CAP_AUTH_SHARED
, &otherin
, perms
);
6074 if (dir
->uid
!= perms
.uid() && otherin
->uid
!= perms
.uid())
6078 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
6082 int Client::may_delete(const char *relpath
, const UserPerm
& perms
) {
6083 ldout(cct
, 20) << __func__
<< " " << relpath
<< "; " << perms
<< dendl
;
6085 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
6086 if (!mref_reader
.is_state_satisfied())
6087 return -CEPHFS_ENOTCONN
;
6089 filepath
path(relpath
);
6090 string name
= path
.last_dentry();
6094 std::scoped_lock
lock(client_lock
);
6095 int r
= path_walk(path
, &dir
, perms
);
6098 if (cct
->_conf
->client_permissions
) {
6099 int r
= may_delete(dir
.get(), name
.c_str(), perms
);
6107 int Client::may_hardlink(Inode
*in
, const UserPerm
& perms
)
6109 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
6110 int r
= _getattr_for_perm(in
, perms
);
6114 if (perms
.uid() == 0 || perms
.uid() == in
->uid
) {
6120 if (!S_ISREG(in
->mode
))
6123 if (in
->mode
& S_ISUID
)
6126 if ((in
->mode
& (S_ISGID
| S_IXGRP
)) == (S_ISGID
| S_IXGRP
))
6129 r
= inode_permission(in
, perms
, MAY_READ
| MAY_WRITE
);
6131 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
6135 int Client::_getattr_for_perm(Inode
*in
, const UserPerm
& perms
)
6137 int mask
= CEPH_STAT_CAP_MODE
;
6139 if (acl_type
!= NO_ACL
) {
6140 mask
|= CEPH_STAT_CAP_XATTR
;
6141 force
= in
->xattr_version
== 0;
6143 return _getattr(in
, mask
, perms
, force
);
6146 vinodeno_t
Client::_get_vino(Inode
*in
)
6148 /* The caller must hold the client lock */
6149 return vinodeno_t(in
->ino
, in
->snapid
);
6153 * Resolve an MDS spec to a list of MDS daemon GIDs.
6155 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
6156 * It may be '*' in which case it matches all GIDs.
6158 * If no error is returned, the `targets` vector will be populated with at least
6161 int Client::resolve_mds(
6162 const std::string
&mds_spec
,
6163 std::vector
<mds_gid_t
> *targets
)
6166 ceph_assert(targets
!= nullptr);
6169 CachedStackStringStream css
;
6170 int role_r
= fsmap
->parse_role(mds_spec
, &role
, *css
);
6172 // We got a role, resolve it to a GID
6173 auto& info
= fsmap
->get_filesystem(role
.fscid
)->mds_map
.get_info(role
.rank
);
6174 ldout(cct
, 10) << __func__
<< ": resolved " << mds_spec
<< " to role '"
6175 << role
<< "' aka " << info
.human_name() << dendl
;
6176 targets
->push_back(info
.global_id
);
6180 std::string strtol_err
;
6181 long long rank_or_gid
= strict_strtoll(mds_spec
.c_str(), 10, &strtol_err
);
6182 if (strtol_err
.empty()) {
6183 // It is a possible GID
6184 const mds_gid_t mds_gid
= mds_gid_t(rank_or_gid
);
6185 if (fsmap
->gid_exists(mds_gid
)) {
6186 auto& info
= fsmap
->get_info_gid(mds_gid
);
6187 ldout(cct
, 10) << __func__
<< ": validated gid " << mds_gid
<< " aka "
6188 << info
.human_name() << dendl
;
6189 targets
->push_back(mds_gid
);
6192 lderr(cct
) << __func__
<< ": gid " << mds_gid
<< " not in MDS map"
6194 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
6195 return -CEPHFS_ENOENT
;
6197 } else if (mds_spec
== "*") {
6198 // It is a wildcard: use all MDSs
6199 const auto& mds_info
= fsmap
->get_mds_info();
6201 ldout(cct
, 10) << __func__
<< ": resolving `*' to all MDS daemons" << dendl
;
6202 if (mds_info
.empty()) {
6203 lderr(cct
) << __func__
<< ": no MDS daemons found" << dendl
;
6204 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
6205 return -CEPHFS_ENOENT
;
6208 for (const auto& [gid
, info
] : mds_info
) {
6209 ldout(cct
, 10) << __func__
<< ": appending " << info
.human_name() << " to targets" << dendl
;
6210 targets
->push_back(gid
);
6214 // It did not parse as an integer, it is not a wildcard, it must be a name
6215 const mds_gid_t mds_gid
= fsmap
->find_mds_gid_by_name(mds_spec
);
6216 if (mds_gid
== mds_gid_t
{0}) {
6217 lderr(cct
) << __func__
<< ": no MDS daemons found by name `" << mds_spec
<< "'" << dendl
;
6218 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
6219 return -CEPHFS_ENOENT
;
6221 auto& info
= fsmap
->get_info_gid(mds_gid
);
6222 ldout(cct
, 10) << __func__
<< ": resolved name '" << mds_spec
6223 << "' to " << info
.human_name() << dendl
;
6224 targets
->push_back(mds_gid
);
6232 * Authenticate with mon and establish global ID
6234 int Client::authenticate()
6236 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6238 if (monclient
->is_authenticated()) {
6242 client_lock
.unlock();
6243 int r
= monclient
->authenticate(std::chrono::duration
<double>(mount_timeout
).count());
6249 whoami
= monclient
->get_global_id();
6250 messenger
->set_myname(entity_name_t::CLIENT(whoami
.v
));
6255 int Client::fetch_fsmap(bool user
)
6257 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6259 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
6260 // rather than MDSMap because no one MDSMap contains all the daemons, and
6261 // a `tell` can address any daemon.
6262 version_t fsmap_latest
;
6265 client_lock
.unlock();
6266 std::tie(fsmap_latest
, std::ignore
) =
6267 monclient
->get_version("fsmap", ca::use_blocked
[ec
]);
6269 } while (ec
== bs::errc::resource_unavailable_try_again
);
6272 lderr(cct
) << "Failed to learn FSMap version: " << ec
<< dendl
;
6273 return ceph::from_error_code(ec
);
6276 ldout(cct
, 10) << __func__
<< " learned FSMap version " << fsmap_latest
<< dendl
;
6279 if (!fsmap_user
|| fsmap_user
->get_epoch() < fsmap_latest
) {
6280 monclient
->sub_want("fsmap.user", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
6281 monclient
->renew_subs();
6282 wait_on_list(waiting_for_fsmap
);
6284 ceph_assert(fsmap_user
);
6285 ceph_assert(fsmap_user
->get_epoch() >= fsmap_latest
);
6287 if (!fsmap
|| fsmap
->get_epoch() < fsmap_latest
) {
6288 monclient
->sub_want("fsmap", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
6289 monclient
->renew_subs();
6290 wait_on_list(waiting_for_fsmap
);
6293 ceph_assert(fsmap
->get_epoch() >= fsmap_latest
);
6295 ldout(cct
, 10) << __func__
<< " finished waiting for FSMap version "
6296 << fsmap_latest
<< dendl
;
6302 * @mds_spec one of ID, rank, GID, "*"
6305 int Client::mds_command(
6306 const std::string
&mds_spec
,
6307 const vector
<string
>& cmd
,
6308 const bufferlist
& inbl
,
6313 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
6314 if (!iref_reader
.is_state_satisfied())
6315 return -CEPHFS_ENOTCONN
;
6317 std::unique_lock
cl(client_lock
);
6325 r
= fetch_fsmap(false);
6330 // Look up MDS target(s) of the command
6331 std::vector
<mds_gid_t
> targets
;
6332 r
= resolve_mds(mds_spec
, &targets
);
6337 // If daemons are laggy, we won't send them commands. If all
6338 // are laggy then we fail.
6339 std::vector
<mds_gid_t
> non_laggy
;
6340 for (const auto& gid
: targets
) {
6341 const auto info
= fsmap
->get_info_gid(gid
);
6342 if (!info
.laggy()) {
6343 non_laggy
.push_back(gid
);
6346 if (non_laggy
.size() == 0) {
6347 *outs
= "All targeted MDS daemons are laggy";
6348 return -CEPHFS_ENOENT
;
6351 if (metadata
.empty()) {
6352 // We are called on an unmounted client, so metadata
6353 // won't be initialized yet.
6354 populate_metadata("");
6357 // Send commands to targets
6358 C_GatherBuilder
gather(cct
, onfinish
);
6359 for (const auto& target_gid
: non_laggy
) {
6360 const auto info
= fsmap
->get_info_gid(target_gid
);
6362 // Open a connection to the target MDS
6363 ConnectionRef conn
= messenger
->connect_to_mds(info
.get_addrs());
6367 std::scoped_lock
cmd_lock(command_lock
);
6368 // Generate MDSCommandOp state
6369 auto &op
= command_table
.start_command();
6371 op
.on_finish
= gather
.new_sub();
6376 op
.mds_gid
= target_gid
;
6379 ldout(cct
, 4) << __func__
<< ": new command op to " << target_gid
6380 << " tid=" << op
.tid
<< cmd
<< dendl
;
6382 // Construct and send MCommand
6383 MessageRef m
= op
.get_message(monclient
->get_fsid());
6384 conn
->send_message2(std::move(m
));
6393 void Client::handle_command_reply(const MConstRef
<MCommandReply
>& m
)
6395 ceph_tid_t
const tid
= m
->get_tid();
6397 ldout(cct
, 10) << __func__
<< ": tid=" << m
->get_tid() << dendl
;
6399 std::scoped_lock
cmd_lock(command_lock
);
6400 if (!command_table
.exists(tid
)) {
6401 ldout(cct
, 1) << __func__
<< ": unknown tid " << tid
<< ", dropping" << dendl
;
6405 auto &op
= command_table
.get_command(tid
);
6407 *op
.outbl
= m
->get_data();
6414 op
.on_finish
->complete(m
->r
);
6417 command_table
.erase(tid
);
6420 // -------------------
6423 int Client::subscribe_mdsmap(const std::string
&fs_name
)
6425 int r
= authenticate();
6427 lderr(cct
) << "authentication failed: " << cpp_strerror(r
) << dendl
;
6431 std::string resolved_fs_name
;
6432 if (fs_name
.empty()) {
6433 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_fs");
6434 if (resolved_fs_name
.empty())
6435 // Try the backwards compatibility fs name option
6436 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_mds_namespace");
6438 resolved_fs_name
= fs_name
;
6441 std::string want
= "mdsmap";
6442 if (!resolved_fs_name
.empty()) {
6443 r
= fetch_fsmap(true);
6446 fscid
= fsmap_user
->get_fs_cid(resolved_fs_name
);
6447 if (fscid
== FS_CLUSTER_ID_NONE
) {
6448 return -CEPHFS_ENOENT
;
6451 std::ostringstream oss
;
6452 oss
<< want
<< "." << fscid
;
6455 ldout(cct
, 10) << "Subscribing to map '" << want
<< "'" << dendl
;
6457 monclient
->sub_want(want
, 0, 0);
6458 monclient
->renew_subs();
6463 int Client::mount(const std::string
&mount_root
, const UserPerm
& perms
,
6464 bool require_mds
, const std::string
&fs_name
)
6466 ceph_assert(is_initialized());
6469 * To make sure that the _unmount() must wait until the mount()
6472 RWRef_t
mref_writer(mount_state
, CLIENT_MOUNTING
, false);
6473 if (!mref_writer
.is_first_writer()) // already mounting or mounted
6476 std::unique_lock
cl(client_lock
);
6478 int r
= subscribe_mdsmap(fs_name
);
6480 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
6484 start_tick_thread(); // start tick thread
6488 auto availability
= mdsmap
->is_cluster_available();
6489 if (availability
== MDSMap::STUCK_UNAVAILABLE
) {
6491 ldout(cct
, 10) << "mds cluster unavailable: epoch=" << mdsmap
->get_epoch() << dendl
;
6492 return CEPH_FUSE_NO_MDS_UP
;
6493 } else if (availability
== MDSMap::AVAILABLE
) {
6494 // Continue to mount
6496 } else if (availability
== MDSMap::TRANSIENT_UNAVAILABLE
) {
6497 // Else, wait. MDSMonitor will update the map to bring
6498 // us to a conclusion eventually.
6499 wait_on_list(waiting_for_mdsmap
);
6501 // Unexpected value!
6507 if(mdsmap
->test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION
)) {
6508 lderr(cct
) << "connections cannot be made while"
6509 " the flag refuse_client_session is set" << dendl
;
6510 return -CEPHFS_EACCES
;
6513 populate_metadata(mount_root
.empty() ? "/" : mount_root
);
6515 filepath
fp(CEPH_INO_ROOT
);
6516 if (!mount_root
.empty()) {
6517 fp
= filepath(mount_root
.c_str());
6520 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
6521 req
->set_filepath(fp
);
6522 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
6523 int res
= make_request(req
, perms
);
6525 if (res
== -CEPHFS_EACCES
&& root
) {
6526 ldout(cct
, 1) << __func__
<< " EACCES on parent of mount point; quotas may not work" << dendl
;
6539 _ll_get(root
.get());
6542 if (!cct
->_conf
->client_trace
.empty()) {
6543 traceout
.open(cct
->_conf
->client_trace
.c_str());
6544 if (traceout
.is_open()) {
6545 ldout(cct
, 1) << "opened trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6547 ldout(cct
, 1) << "FAILED to open trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6552 ldout(cct, 3) << "op: // client trace data structs" << dendl;
6553 ldout(cct, 3) << "op: struct stat st;" << dendl;
6554 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
6555 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
6556 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
6557 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
6558 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
6559 ldout(cct, 3) << "op: int fd;" << dendl;
6562 mref_writer
.update_state(CLIENT_MOUNTED
);
6568 void Client::_close_sessions()
6570 for (auto it
= mds_sessions
.begin(); it
!= mds_sessions
.end(); ) {
6571 if (it
->second
->state
== MetaSession::STATE_REJECTED
)
6572 mds_sessions
.erase(it
++);
6577 while (!mds_sessions
.empty()) {
6578 // send session closes!
6579 for (auto &p
: mds_sessions
) {
6580 if (p
.second
->state
!= MetaSession::STATE_CLOSING
) {
6581 _close_mds_session(p
.second
.get());
6582 mds_ranks_closing
.insert(p
.first
);
6586 // wait for sessions to close
6587 double timo
= cct
->_conf
.get_val
<std::chrono::seconds
>("client_shutdown_timeout").count();
6588 ldout(cct
, 2) << "waiting for " << mds_ranks_closing
.size() << " mds session(s) to close (timeout: "
6589 << timo
<< "s)" << dendl
;
6590 std::unique_lock l
{client_lock
, std::adopt_lock
};
6593 } else if (!mount_cond
.wait_for(l
, ceph::make_timespan(timo
), [this] { return mds_ranks_closing
.empty(); })) {
6594 ldout(cct
, 1) << mds_ranks_closing
.size() << " mds(s) did not respond to session close -- timing out." << dendl
;
6595 while (!mds_ranks_closing
.empty()) {
6596 auto session
= mds_sessions
.at(*mds_ranks_closing
.begin());
6597 // this prunes entry from mds_sessions and mds_ranks_closing
6598 _closed_mds_session(session
.get(), -CEPHFS_ETIMEDOUT
);
6602 mds_ranks_closing
.clear();
6607 void Client::flush_mdlog_sync(Inode
*in
)
6609 if (in
->unsafe_ops
.empty()) {
6613 std::set
<mds_rank_t
> anchor
;
6614 for (auto &&p
: in
->unsafe_ops
) {
6615 anchor
.emplace(p
->mds
);
6618 anchor
.emplace(in
->auth_cap
->session
->mds_num
);
6621 for (auto &rank
: anchor
) {
6622 auto session
= &mds_sessions
.at(rank
);
6623 flush_mdlog(session
->get());
6627 void Client::flush_mdlog_sync()
6629 if (mds_requests
.empty())
6631 for (auto &p
: mds_sessions
) {
6632 flush_mdlog(p
.second
.get());
6636 void Client::flush_mdlog(MetaSession
*session
)
6638 // Only send this to Luminous or newer MDS daemons, older daemons
6639 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6640 const uint64_t features
= session
->con
->get_features();
6641 if (HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
6642 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REQUEST_FLUSH_MDLOG
);
6643 session
->con
->send_message2(std::move(m
));
6648 void Client::_abort_mds_sessions(int err
)
6650 for (auto p
= mds_requests
.begin(); p
!= mds_requests
.end(); ) {
6651 auto req
= p
->second
;
6653 // unsafe requests will be removed during close session below.
6654 if (req
->got_unsafe
)
6658 if (req
->caller_cond
) {
6660 req
->caller_cond
->notify_all();
6664 // Process aborts on any requests that were on this waitlist.
6665 // Any requests that were on a waiting_for_open session waitlist
6666 // will get kicked during close session below.
6667 signal_cond_list(waiting_for_mdsmap
);
6669 // Force-close all sessions
6670 while(!mds_sessions
.empty()) {
6671 auto session
= mds_sessions
.begin()->second
;
6672 _closed_mds_session(session
.get(), err
);
6676 void Client::_unmount(bool abort
)
6679 * We are unmounting the client.
6681 * Just declare the state to STATE_UNMOUNTING to block and fail
6682 * any new comming "reader" and then try to wait all the in-flight
6683 * "readers" to finish.
6685 RWRef_t
mref_writer(mount_state
, CLIENT_UNMOUNTING
, false);
6686 if (!mref_writer
.is_first_writer())
6688 mref_writer
.wait_readers_done();
6690 std::unique_lock lock
{client_lock
};
6692 if (abort
|| blocklisted
) {
6693 ldout(cct
, 2) << "unmounting (" << (abort
? "abort)" : "blocklisted)") << dendl
;
6695 ldout(cct
, 2) << "unmounting" << dendl
;
6701 mount_aborted
= true;
6702 // Abort all mds sessions
6703 _abort_mds_sessions(-CEPHFS_ENOTCONN
);
6705 objecter
->op_cancel_writes(-CEPHFS_ENOTCONN
);
6707 // flush the mdlog for pending requests, if any
6711 mount_cond
.wait(lock
, [this] {
6712 // Only wait for write OPs
6713 for (auto& [tid
, req
] : mds_requests
) {
6714 if (req
->is_write()) {
6715 ldout(cct
, 10) << "waiting for write request '" << tid
6716 << "' to complete, currently there are "
6717 << mds_requests
.size()
6718 << " outstanding read/write requests"
6729 // clean up any unclosed files
6730 while (!fd_map
.empty()) {
6731 Fh
*fh
= fd_map
.begin()->second
;
6732 fd_map
.erase(fd_map
.begin());
6733 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *fh
->inode
<< dendl
;
6737 while (!ll_unclosed_fh_set
.empty()) {
6738 set
<Fh
*>::iterator it
= ll_unclosed_fh_set
.begin();
6740 ll_unclosed_fh_set
.erase(fh
);
6741 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *(fh
->inode
) << dendl
;
6745 while (!opened_dirs
.empty()) {
6746 dir_result_t
*dirp
= *opened_dirs
.begin();
6747 ldout(cct
, 0) << " destroyed lost open dir " << dirp
<< " on " << *dirp
->inode
<< dendl
;
6753 if (cct
->_conf
->client_oc
) {
6754 // flush/release all buffered data
6755 std::list
<InodeRef
> anchor
;
6756 for (auto& p
: inode_map
) {
6757 Inode
*in
= p
.second
;
6759 ldout(cct
, 0) << "null inode_map entry ino " << p
.first
<< dendl
;
6763 // prevent inode from getting freed
6764 anchor
.emplace_back(in
);
6766 if (abort
|| blocklisted
) {
6767 objectcacher
->purge_set(&in
->oset
);
6768 } else if (!in
->caps
.empty()) {
6770 _flush(in
, new C_Client_FlushComplete(this, in
));
6775 if (abort
|| blocklisted
) {
6776 for (auto &q
: mds_sessions
) {
6778 for (auto p
= s
->dirty_list
.begin(); !p
.end(); ) {
6781 if (in
->dirty_caps
) {
6782 ldout(cct
, 0) << " drop dirty caps on " << *in
<< dendl
;
6783 in
->mark_caps_clean();
6790 wait_sync_caps(last_flush_tid
);
6798 while (lru
.lru_get_size() > 0 ||
6799 !inode_map
.empty()) {
6800 ldout(cct
, 2) << "cache still has " << lru
.lru_get_size()
6801 << "+" << inode_map
.size() << " items"
6802 << ", waiting (for caps to release?)"
6805 if (auto r
= mount_cond
.wait_for(lock
, ceph::make_timespan(5));
6806 r
== std::cv_status::timeout
) {
6810 ceph_assert(lru
.lru_get_size() == 0);
6811 ceph_assert(inode_map
.empty());
6814 if (!cct
->_conf
->client_trace
.empty()) {
6815 ldout(cct
, 1) << "closing trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6819 // stop the tick thread
6820 tick_thread_stopped
= true;
6821 upkeep_cond
.notify_one();
6825 // release the global snapshot realm
6826 SnapRealm
*global_realm
= snap_realms
[CEPH_INO_GLOBAL_SNAPREALM
];
6828 ceph_assert(global_realm
->nref
== 1);
6829 put_snap_realm(global_realm
);
6832 mref_writer
.update_state(CLIENT_UNMOUNTED
);
6835 * Stop the remount_queue before clearing the mountpoint memory
6836 * to avoid possible use-after-free bug.
6839 ldout(cct
, 10) << "unmount stopping remount finisher" << dendl
;
6840 remount_finisher
.wait_for_empty();
6841 remount_finisher
.stop();
6842 remount_cb
= nullptr;
6845 ldout(cct
, 2) << "unmounted." << dendl
;
6848 void Client::unmount()
6853 void Client::abort_conn()
6858 void Client::flush_cap_releases()
6860 uint64_t nr_caps
= 0;
6862 // send any cap releases
6863 for (auto &p
: mds_sessions
) {
6864 auto session
= p
.second
;
6865 if (session
->release
&& mdsmap
->is_clientreplay_or_active_or_stopping(
6867 nr_caps
+= session
->release
->caps
.size();
6868 if (cct
->_conf
->client_inject_release_failure
) {
6869 ldout(cct
, 20) << __func__
<< " injecting failure to send cap release message" << dendl
;
6871 session
->con
->send_message2(std::move(session
->release
));
6873 session
->release
.reset();
6878 dec_pinned_icaps(nr_caps
);
6882 void Client::renew_and_flush_cap_releases()
6884 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6886 if (!mount_aborted
&& mdsmap
->get_epoch()) {
6888 auto el
= ceph::coarse_mono_clock::now() - last_cap_renew
;
6889 if (unlikely(utime_t(el
) > mdsmap
->get_session_timeout() / 3.0))
6892 flush_cap_releases();
6898 ldout(cct
, 20) << "tick" << dendl
;
6900 auto now
= ceph::coarse_mono_clock::now();
6903 * If the mount() is not finished
6905 if (is_mounting() && !mds_requests
.empty()) {
6906 MetaRequest
*req
= mds_requests
.begin()->second
;
6908 if (req
->created
+ mount_timeout
< now
) {
6909 req
->abort(-CEPHFS_ETIMEDOUT
);
6910 if (req
->caller_cond
) {
6912 req
->caller_cond
->notify_all();
6914 signal_cond_list(waiting_for_mdsmap
);
6915 for (auto &p
: mds_sessions
) {
6916 signal_context_list(p
.second
->waiting_for_open
);
6921 renew_and_flush_cap_releases();
6924 xlist
<Inode
*>::iterator p
= delayed_list
.begin();
6928 if (!mount_aborted
&& in
->hold_caps_until
> now
)
6930 delayed_list
.pop_front();
6932 check_caps(in
, CHECK_CAPS_NODELAY
);
6936 collect_and_send_metrics();
6938 delay_put_inodes(is_unmounting());
6941 if (blocklisted
&& (is_mounted() || is_unmounting()) &&
6942 last_auto_reconnect
+ std::chrono::seconds(30 * 60) < now
&&
6943 cct
->_conf
.get_val
<bool>("client_reconnect_stale")) {
6944 messenger
->client_reset();
6945 fd_gen
++; // invalidate open files
6946 blocklisted
= false;
6947 _kick_stale_sessions();
6948 last_auto_reconnect
= now
;
6952 void Client::start_tick_thread()
6954 upkeeper
= std::thread([this]() {
6955 using time
= ceph::coarse_mono_time
;
6956 using sec
= std::chrono::seconds
;
6958 auto last_tick
= time::min();
6960 std::unique_lock
cl(client_lock
);
6961 while (!tick_thread_stopped
) {
6962 auto now
= clock::now();
6963 auto since
= now
- last_tick
;
6965 auto t_interval
= clock::duration(cct
->_conf
.get_val
<sec
>("client_tick_interval"));
6966 auto d_interval
= clock::duration(cct
->_conf
.get_val
<sec
>("client_debug_inject_tick_delay"));
6968 auto interval
= std::max(t_interval
, d_interval
);
6969 if (likely(since
>= interval
*.90)) {
6971 last_tick
= clock::now();
6976 ldout(cct
, 20) << "upkeep thread waiting interval " << interval
<< dendl
;
6977 if (!tick_thread_stopped
)
6978 upkeep_cond
.wait_for(cl
, interval
);
6983 void Client::collect_and_send_metrics() {
6984 ldout(cct
, 20) << __func__
<< dendl
;
6986 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6988 // right now, we only track and send global metrics. its sufficient
6989 // to send these metrics to MDS rank0.
6990 collect_and_send_global_metrics();
6993 void Client::collect_and_send_global_metrics() {
6994 ldout(cct
, 20) << __func__
<< dendl
;
6995 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6997 /* Do not send the metrics until the MDS rank is ready */
6998 if (!mdsmap
->is_active((mds_rank_t
)0)) {
6999 ldout(cct
, 5) << __func__
<< " MDS rank 0 is not ready yet -- not sending metric"
7004 if (!have_open_session((mds_rank_t
)0)) {
7005 ldout(cct
, 5) << __func__
<< ": no session with rank=0 -- not sending metric"
7009 auto session
= _get_or_open_mds_session((mds_rank_t
)0);
7010 if (!session
->mds_features
.test(CEPHFS_FEATURE_METRIC_COLLECT
)) {
7011 ldout(cct
, 5) << __func__
<< ": rank=0 does not support metrics" << dendl
;
7015 ClientMetricMessage metric
;
7016 std::vector
<ClientMetricMessage
> message
;
7019 if (_collect_and_send_global_metrics
||
7020 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_READ_LATENCY
)) {
7021 metric
= ClientMetricMessage(ReadLatencyPayload(logger
->tget(l_c_read
),
7022 logger
->tget(l_c_rd_avg
),
7023 logger
->get(l_c_rd_sqsum
),
7025 message
.push_back(metric
);
7029 if (_collect_and_send_global_metrics
||
7030 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_WRITE_LATENCY
)) {
7031 metric
= ClientMetricMessage(WriteLatencyPayload(logger
->tget(l_c_wrlat
),
7032 logger
->tget(l_c_wr_avg
),
7033 logger
->get(l_c_wr_sqsum
),
7035 message
.push_back(metric
);
7039 if (_collect_and_send_global_metrics
||
7040 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_METADATA_LATENCY
)) {
7041 metric
= ClientMetricMessage(MetadataLatencyPayload(logger
->tget(l_c_lat
),
7042 logger
->tget(l_c_md_avg
),
7043 logger
->get(l_c_md_sqsum
),
7044 nr_metadata_request
));
7045 message
.push_back(metric
);
7048 // cap hit ratio -- nr_caps is unused right now
7049 if (_collect_and_send_global_metrics
||
7050 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_CAP_INFO
)) {
7051 auto [cap_hits
, cap_misses
] = get_cap_hit_rates();
7052 metric
= ClientMetricMessage(CapInfoPayload(cap_hits
, cap_misses
, 0));
7053 message
.push_back(metric
);
7056 // dentry lease hit ratio
7057 if (_collect_and_send_global_metrics
||
7058 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_DENTRY_LEASE
)) {
7059 auto [dlease_hits
, dlease_misses
, nr
] = get_dlease_hit_rates();
7060 metric
= ClientMetricMessage(DentryLeasePayload(dlease_hits
, dlease_misses
, nr
));
7061 message
.push_back(metric
);
7065 if (_collect_and_send_global_metrics
||
7066 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_OPENED_FILES
)) {
7067 auto [opened_files
, total_inodes
] = get_opened_files_rates();
7068 metric
= ClientMetricMessage(OpenedFilesPayload(opened_files
, total_inodes
));
7069 message
.push_back(metric
);
7073 if (_collect_and_send_global_metrics
||
7074 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_PINNED_ICAPS
)) {
7075 auto [pinned_icaps
, total_inodes
] = get_pinned_icaps_rates();
7076 metric
= ClientMetricMessage(PinnedIcapsPayload(pinned_icaps
, total_inodes
));
7077 message
.push_back(metric
);
7081 if (_collect_and_send_global_metrics
||
7082 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_OPENED_INODES
)) {
7083 auto [opened_inodes
, total_inodes
] = get_opened_inodes_rates();
7084 metric
= ClientMetricMessage(OpenedInodesPayload(opened_inodes
, total_inodes
));
7085 message
.push_back(metric
);
7089 if (_collect_and_send_global_metrics
||
7090 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_READ_IO_SIZES
)) {
7091 metric
= ClientMetricMessage(ReadIoSizesPayload(total_read_ops
,
7093 message
.push_back(metric
);
7097 if (_collect_and_send_global_metrics
||
7098 session
->mds_metric_flags
.test(CLIENT_METRIC_TYPE_WRITE_IO_SIZES
)) {
7099 metric
= ClientMetricMessage(WriteIoSizesPayload(total_write_ops
,
7101 message
.push_back(metric
);
7104 session
->con
->send_message2(make_message
<MClientMetrics
>(std::move(message
)));
7107 void Client::renew_caps()
7109 ldout(cct
, 10) << "renew_caps()" << dendl
;
7110 last_cap_renew
= ceph::coarse_mono_clock::now();
7112 for (auto &p
: mds_sessions
) {
7113 ldout(cct
, 15) << "renew_caps requesting from mds." << p
.first
<< dendl
;
7114 if (mdsmap
->get_state(p
.first
) >= MDSMap::STATE_REJOIN
)
7115 renew_caps(p
.second
.get());
7119 void Client::renew_caps(MetaSession
*session
)
7121 ldout(cct
, 10) << "renew_caps mds." << session
->mds_num
<< dendl
;
7122 session
->last_cap_renew_request
= ceph_clock_now();
7123 uint64_t seq
= ++session
->cap_renew_seq
;
7124 session
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_RENEWCAPS
, seq
));
7128 // ===============================================================
7129 // high level (POSIXy) interface
7131 int Client::_do_lookup(Inode
*dir
, const string
& name
, int mask
,
7132 InodeRef
*target
, const UserPerm
& perms
)
7134 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_LOOKUPSNAP
: CEPH_MDS_OP_LOOKUP
;
7135 MetaRequest
*req
= new MetaRequest(op
);
7137 dir
->make_nosnap_relative_path(path
);
7138 path
.push_dentry(name
);
7139 req
->set_filepath(path
);
7140 req
->set_inode(dir
);
7141 if (cct
->_conf
->client_debug_getattr_caps
&& op
== CEPH_MDS_OP_LOOKUP
)
7142 mask
|= DEBUG_GETATTR_CAPS
;
7143 req
->head
.args
.getattr
.mask
= mask
;
7145 ldout(cct
, 10) << __func__
<< " on " << path
<< dendl
;
7147 int r
= make_request(req
, perms
, target
);
7148 ldout(cct
, 10) << __func__
<< " res is " << r
<< dendl
;
7152 bool Client::_dentry_valid(const Dentry
*dn
)
7154 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
7156 // is dn lease valid?
7157 utime_t now
= ceph_clock_now();
7158 if (dn
->lease_mds
>= 0 && dn
->lease_ttl
> now
&&
7159 mds_sessions
.count(dn
->lease_mds
)) {
7160 auto s
= mds_sessions
.at(dn
->lease_mds
);
7161 if (s
->cap_ttl
> now
&& s
->cap_gen
== dn
->lease_gen
) {
7166 ldout(cct
, 20) << " bad lease, cap_ttl " << s
->cap_ttl
<< ", cap_gen " << s
->cap_gen
7167 << " vs lease_gen " << dn
->lease_gen
<< dendl
;
7174 int Client::_lookup(Inode
*dir
, const string
& dname
, int mask
, InodeRef
*target
,
7175 const UserPerm
& perms
, std::string
* alternate_name
,
7180 bool did_lookup_request
= false;
7181 // can only request shared caps
7182 mask
&= CEPH_CAP_ANY_SHARED
| CEPH_STAT_RSTAT
;
7184 if (dname
== "..") {
7185 if (dir
->dentries
.empty()) {
7186 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
7187 filepath
path(dir
->ino
);
7188 req
->set_filepath(path
);
7191 int r
= make_request(req
, perms
, &tmptarget
, NULL
, rand() % mdsmap
->get_num_in_mds());
7194 *target
= std::move(tmptarget
);
7195 ldout(cct
, 8) << __func__
<< " found target " << (*target
)->ino
<< dendl
;
7201 *target
= dir
->get_first_parent()->dir
->parent_inode
; //dirs can't be hard-linked
7210 if (!dir
->is_dir()) {
7211 r
= -CEPHFS_ENOTDIR
;
7215 if (dname
.length() > NAME_MAX
) {
7216 r
= -CEPHFS_ENAMETOOLONG
;
7220 if (dname
== cct
->_conf
->client_snapdir
&&
7221 dir
->snapid
== CEPH_NOSNAP
) {
7222 *target
= open_snapdir(dir
);
7228 dir
->dir
->dentries
.count(dname
)) {
7229 dn
= dir
->dir
->dentries
[dname
];
7231 ldout(cct
, 20) << __func__
<< " have " << *dn
<< " from mds." << dn
->lease_mds
7232 << " ttl " << dn
->lease_ttl
<< " seq " << dn
->lease_seq
<< dendl
;
7234 if (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)) {
7235 if (_dentry_valid(dn
)) {
7236 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
7237 // make trim_caps() behave.
7238 dir
->try_touch_cap(dn
->lease_mds
);
7242 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
7243 if (dn
->cap_shared_gen
== dir
->shared_gen
&&
7244 (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)))
7246 if (!dn
->inode
&& (dir
->flags
& I_COMPLETE
)) {
7247 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for "
7248 << *dir
<< " dn '" << dname
<< "'" << dendl
;
7249 return -CEPHFS_ENOENT
;
7253 ldout(cct
, 20) << " no cap on " << dn
->inode
->vino() << dendl
;
7256 // In rare case during the rename if another thread tries to
7257 // lookup the dst dentry, it may get an inconsistent result
7258 // that both src dentry and dst dentry will link to the same
7259 // inode at the same time.
7260 // Will wait the rename to finish and try it again.
7261 if (!is_rename
&& dn
->is_renaming
) {
7262 ldout(cct
, 1) << __func__
<< " dir " << *dir
7263 << " rename is on the way, will wait for dn '"
7264 << dname
<< "'" << dendl
;
7265 wait_on_list(waiting_for_rename
);
7269 // can we conclude ENOENT locally?
7270 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true) &&
7271 (dir
->flags
& I_COMPLETE
)) {
7272 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for " << *dir
<< " dn '" << dname
<< "'" << dendl
;
7273 return -CEPHFS_ENOENT
;
7277 if (did_lookup_request
) {
7281 r
= _do_lookup(dir
, dname
, mask
, target
, perms
);
7282 did_lookup_request
= true;
7284 /* complete lookup to get dentry for alternate_name */
7292 *target
= dn
->inode
;
7294 *alternate_name
= dn
->alternate_name
;
7303 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << r
<< dendl
;
7305 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << **target
<< dendl
;
7309 Dentry
*Client::get_or_create(Inode
*dir
, const char* name
)
7312 ldout(cct
, 20) << __func__
<< " " << *dir
<< " name " << name
<< dendl
;
7314 if (dir
->dir
->dentries
.count(name
))
7315 return dir
->dir
->dentries
[name
];
7316 else // otherwise link up a new one
7317 return link(dir
->dir
, name
, NULL
, NULL
);
7320 int Client::walk(std::string_view path
, walk_dentry_result
* wdr
, const UserPerm
& perms
, bool followsym
)
7322 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7323 if (!mref_reader
.is_state_satisfied())
7324 return -CEPHFS_ENOTCONN
;
7326 ldout(cct
, 10) << __func__
<< ": " << path
<< dendl
;
7328 std::scoped_lock
lock(client_lock
);
7330 return path_walk(path
, wdr
, perms
, followsym
);
7333 int Client::path_walk(const filepath
& origpath
, InodeRef
*end
,
7334 const UserPerm
& perms
, bool followsym
, int mask
, InodeRef dirinode
)
7336 walk_dentry_result wdr
;
7337 int rc
= path_walk(origpath
, &wdr
, perms
, followsym
, mask
, dirinode
);
7338 *end
= std::move(wdr
.in
);
7342 int Client::path_walk(const filepath
& origpath
, walk_dentry_result
* result
, const UserPerm
& perms
,
7343 bool followsym
, int mask
, InodeRef dirinode
)
7345 filepath path
= origpath
;
7347 std::string alternate_name
;
7348 if (origpath
.absolute())
7357 ldout(cct
, 20) << __func__
<< " cur=" << *cur
<< dendl
;
7358 ldout(cct
, 10) << __func__
<< " " << path
<< dendl
;
7363 while (i
< path
.depth() && cur
) {
7365 const string
&dname
= path
[i
];
7366 ldout(cct
, 10) << " " << i
<< " " << *cur
<< " " << dname
<< dendl
;
7367 ldout(cct
, 20) << " (path is " << path
<< ")" << dendl
;
7369 if (cct
->_conf
->client_permissions
) {
7370 int r
= may_lookup(cur
.get(), perms
);
7373 caps
= CEPH_CAP_AUTH_SHARED
;
7376 /* Get extra requested caps on the last component */
7377 if (i
== (path
.depth() - 1))
7379 int r
= _lookup(cur
.get(), dname
, caps
, &next
, perms
, &alternate_name
);
7382 // only follow trailing symlink if followsym. always follow
7383 // 'directory' symlinks.
7384 if (next
&& next
->is_symlink()) {
7386 ldout(cct
, 20) << " symlink count " << symlinks
<< ", value is '" << next
->symlink
<< "'" << dendl
;
7387 if (symlinks
> MAXSYMLINKS
) {
7388 return -CEPHFS_ELOOP
;
7391 if (i
< path
.depth() - 1) {
7393 // replace consumed components of path with symlink dir target
7394 filepath
resolved(next
->symlink
.c_str());
7395 resolved
.append(path
.postfixpath(i
+ 1));
7398 if (next
->symlink
[0] == '/') {
7402 } else if (followsym
) {
7403 if (next
->symlink
[0] == '/') {
7404 path
= next
->symlink
.c_str();
7409 filepath
more(next
->symlink
.c_str());
7410 // we need to remove the symlink component from off of the path
7411 // before adding the target that the symlink points to. remain
7412 // at the same position in the path.
7423 return -CEPHFS_ENOENT
;
7425 result
->in
= std::move(cur
);
7426 result
->alternate_name
= std::move(alternate_name
);
7434 int Client::link(const char *relexisting
, const char *relpath
, const UserPerm
& perm
, std::string alternate_name
)
7436 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7437 if (!mref_reader
.is_state_satisfied())
7438 return -CEPHFS_ENOTCONN
;
7440 tout(cct
) << "link" << std::endl
;
7441 tout(cct
) << relexisting
<< std::endl
;
7442 tout(cct
) << relpath
<< std::endl
;
7444 filepath
existing(relexisting
);
7448 std::scoped_lock
lock(client_lock
);
7449 int r
= path_walk(existing
, &in
, perm
, true);
7452 if (std::string(relpath
) == "/") {
7456 filepath
path(relpath
);
7457 string name
= path
.last_dentry();
7460 r
= path_walk(path
, &dir
, perm
, true);
7463 if (cct
->_conf
->client_permissions
) {
7464 if (S_ISDIR(in
->mode
)) {
7468 r
= may_hardlink(in
.get(), perm
);
7471 r
= may_create(dir
.get(), perm
);
7475 r
= _link(in
.get(), dir
.get(), name
.c_str(), perm
, std::move(alternate_name
));
7479 int Client::unlink(const char *relpath
, const UserPerm
& perm
)
7481 return unlinkat(CEPHFS_AT_FDCWD
, relpath
, 0, perm
);
7484 int Client::unlinkat(int dirfd
, const char *relpath
, int flags
, const UserPerm
& perm
)
7486 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7487 if (!mref_reader
.is_state_satisfied()) {
7488 return -CEPHFS_ENOTCONN
;
7491 tout(cct
) << __func__
<< std::endl
;
7492 tout(cct
) << dirfd
<< std::endl
;
7493 tout(cct
) << relpath
<< std::endl
;
7494 tout(cct
) << flags
<< std::endl
;
7496 if (std::string(relpath
) == "/") {
7497 return flags
& AT_REMOVEDIR
? -CEPHFS_EBUSY
: -CEPHFS_EISDIR
;
7500 filepath
path(relpath
);
7501 string name
= path
.last_dentry();
7505 std::scoped_lock
lock(client_lock
);
7508 int r
= get_fd_inode(dirfd
, &dirinode
);
7513 r
= path_walk(path
, &dir
, perm
, true, 0, dirinode
);
7517 if (cct
->_conf
->client_permissions
) {
7518 r
= may_delete(dir
.get(), name
.c_str(), perm
);
7523 if (flags
& AT_REMOVEDIR
) {
7524 r
= _rmdir(dir
.get(), name
.c_str(), perm
);
7526 r
= _unlink(dir
.get(), name
.c_str(), perm
);
7531 int Client::rename(const char *relfrom
, const char *relto
, const UserPerm
& perm
, std::string alternate_name
)
7533 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7534 if (!mref_reader
.is_state_satisfied())
7535 return -CEPHFS_ENOTCONN
;
7537 tout(cct
) << __func__
<< std::endl
;
7538 tout(cct
) << relfrom
<< std::endl
;
7539 tout(cct
) << relto
<< std::endl
;
7541 if (std::string(relfrom
) == "/" || std::string(relto
) == "/")
7542 return -CEPHFS_EBUSY
;
7544 filepath
from(relfrom
);
7546 string fromname
= from
.last_dentry();
7548 string toname
= to
.last_dentry();
7551 InodeRef fromdir
, todir
;
7553 std::scoped_lock
lock(client_lock
);
7554 int r
= path_walk(from
, &fromdir
, perm
);
7557 r
= path_walk(to
, &todir
, perm
);
7561 if (cct
->_conf
->client_permissions
) {
7562 int r
= may_delete(fromdir
.get(), fromname
.c_str(), perm
);
7565 r
= may_delete(todir
.get(), toname
.c_str(), perm
);
7566 if (r
< 0 && r
!= -CEPHFS_ENOENT
)
7569 r
= _rename(fromdir
.get(), fromname
.c_str(), todir
.get(), toname
.c_str(), perm
, std::move(alternate_name
));
7576 int Client::mkdir(const char *relpath
, mode_t mode
, const UserPerm
& perm
, std::string alternate_name
)
7578 return mkdirat(CEPHFS_AT_FDCWD
, relpath
, mode
, perm
, alternate_name
);
7581 int Client::mkdirat(int dirfd
, const char *relpath
, mode_t mode
, const UserPerm
& perm
,
7582 std::string alternate_name
)
7584 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7585 if (!mref_reader
.is_state_satisfied())
7586 return -CEPHFS_ENOTCONN
;
7588 tout(cct
) << __func__
<< std::endl
;
7589 tout(cct
) << dirfd
<< std::endl
;
7590 tout(cct
) << relpath
<< std::endl
;
7591 tout(cct
) << mode
<< std::endl
;
7592 ldout(cct
, 10) << __func__
<< ": " << relpath
<< dendl
;
7594 if (std::string(relpath
) == "/") {
7595 return -CEPHFS_EEXIST
;
7598 filepath
path(relpath
);
7599 string name
= path
.last_dentry();
7603 std::scoped_lock
lock(client_lock
);
7606 int r
= get_fd_inode(dirfd
, &dirinode
);
7611 r
= path_walk(path
, &dir
, perm
, true, 0, dirinode
);
7615 if (cct
->_conf
->client_permissions
) {
7616 r
= may_create(dir
.get(), perm
);
7621 return _mkdir(dir
.get(), name
.c_str(), mode
, perm
, 0, {}, std::move(alternate_name
));
7624 int Client::mkdirs(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
7626 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7627 if (!mref_reader
.is_state_satisfied())
7628 return -CEPHFS_ENOTCONN
;
7630 ldout(cct
, 10) << "Client::mkdirs " << relpath
<< dendl
;
7631 tout(cct
) << __func__
<< std::endl
;
7632 tout(cct
) << relpath
<< std::endl
;
7633 tout(cct
) << mode
<< std::endl
;
7635 //get through existing parts of path
7636 filepath
path(relpath
);
7638 int r
= 0, caps
= 0;
7641 std::scoped_lock
lock(client_lock
);
7643 for (i
=0; i
<path
.depth(); ++i
) {
7644 if (cct
->_conf
->client_permissions
) {
7645 r
= may_lookup(cur
.get(), perms
);
7648 caps
= CEPH_CAP_AUTH_SHARED
;
7650 r
= _lookup(cur
.get(), path
[i
].c_str(), caps
, &next
, perms
);
7655 if (r
!=-CEPHFS_ENOENT
) return r
;
7656 ldout(cct
, 20) << __func__
<< " got through " << i
<< " directories on path " << relpath
<< dendl
;
7657 //make new directory at each level
7658 for (; i
<path
.depth(); ++i
) {
7659 if (cct
->_conf
->client_permissions
) {
7660 r
= may_create(cur
.get(), perms
);
7665 r
= _mkdir(cur
.get(), path
[i
].c_str(), mode
, perms
, &next
);
7667 //check proper creation/existence
7668 if(-CEPHFS_EEXIST
== r
&& i
< path
.depth() - 1) {
7669 r
= _lookup(cur
.get(), path
[i
].c_str(), CEPH_CAP_AUTH_SHARED
, &next
, perms
);
7673 //move to new dir and continue
7675 ldout(cct
, 20) << __func__
<< ": successfully created directory "
7676 << filepath(cur
->ino
).get_path() << dendl
;
7681 int Client::rmdir(const char *relpath
, const UserPerm
& perms
)
7683 return unlinkat(CEPHFS_AT_FDCWD
, relpath
, AT_REMOVEDIR
, perms
);
7686 int Client::mknod(const char *relpath
, mode_t mode
, const UserPerm
& perms
, dev_t rdev
)
7688 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7689 if (!mref_reader
.is_state_satisfied())
7690 return -CEPHFS_ENOTCONN
;
7692 tout(cct
) << __func__
<< std::endl
;
7693 tout(cct
) << relpath
<< std::endl
;
7694 tout(cct
) << mode
<< std::endl
;
7695 tout(cct
) << rdev
<< std::endl
;
7697 if (std::string(relpath
) == "/")
7698 return -CEPHFS_EEXIST
;
7700 filepath
path(relpath
);
7701 string name
= path
.last_dentry();
7705 std::scoped_lock
lock(client_lock
);
7706 int r
= path_walk(path
, &dir
, perms
);
7709 if (cct
->_conf
->client_permissions
) {
7710 int r
= may_create(dir
.get(), perms
);
7714 return _mknod(dir
.get(), name
.c_str(), mode
, rdev
, perms
);
7719 int Client::symlink(const char *target
, const char *relpath
, const UserPerm
& perms
, std::string alternate_name
)
7721 return symlinkat(target
, CEPHFS_AT_FDCWD
, relpath
, perms
, alternate_name
);
7724 int Client::symlinkat(const char *target
, int dirfd
, const char *relpath
, const UserPerm
& perms
,
7725 std::string alternate_name
)
7727 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7728 if (!mref_reader
.is_state_satisfied()) {
7729 return -CEPHFS_ENOTCONN
;
7732 tout(cct
) << __func__
<< std::endl
;
7733 tout(cct
) << target
<< std::endl
;
7734 tout(cct
) << dirfd
<< std::endl
;
7735 tout(cct
) << relpath
<< std::endl
;
7737 if (std::string(relpath
) == "/") {
7738 return -CEPHFS_EEXIST
;
7741 filepath
path(relpath
);
7742 string name
= path
.last_dentry();
7746 std::scoped_lock
lock(client_lock
);
7749 int r
= get_fd_inode(dirfd
, &dirinode
);
7753 r
= path_walk(path
, &dir
, perms
, true, 0, dirinode
);
7757 if (cct
->_conf
->client_permissions
) {
7758 int r
= may_create(dir
.get(), perms
);
7763 return _symlink(dir
.get(), name
.c_str(), target
, perms
, std::move(alternate_name
));
7766 int Client::readlink(const char *relpath
, char *buf
, loff_t size
, const UserPerm
& perms
)
7768 return readlinkat(CEPHFS_AT_FDCWD
, relpath
, buf
, size
, perms
);
7771 int Client::readlinkat(int dirfd
, const char *relpath
, char *buf
, loff_t size
, const UserPerm
& perms
) {
7772 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
7773 if (!mref_reader
.is_state_satisfied()) {
7774 return -CEPHFS_ENOTCONN
;
7777 tout(cct
) << __func__
<< std::endl
;
7778 tout(cct
) << dirfd
<< std::endl
;
7779 tout(cct
) << relpath
<< std::endl
;
7782 std::scoped_lock
lock(client_lock
);
7783 int r
= get_fd_inode(dirfd
, &dirinode
);
7789 filepath
path(relpath
);
7790 r
= path_walk(path
, &in
, perms
, false, 0, dirinode
);
7795 return _readlink(in
.get(), buf
, size
);
7798 int Client::_readlink(Inode
*in
, char *buf
, size_t size
)
7800 if (!in
->is_symlink())
7801 return -CEPHFS_EINVAL
;
7803 // copy into buf (at most size bytes)
7804 int r
= in
->symlink
.length();
7807 memcpy(buf
, in
->symlink
.c_str(), r
);
7814 int Client::_getattr(Inode
*in
, int mask
, const UserPerm
& perms
, bool force
)
7816 bool yes
= in
->caps_issued_mask(mask
, true);
7818 ldout(cct
, 10) << __func__
<< " mask " << ccap_string(mask
) << " issued=" << yes
<< dendl
;
7822 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
7824 in
->make_nosnap_relative_path(path
);
7825 req
->set_filepath(path
);
7827 req
->head
.args
.getattr
.mask
= mask
;
7829 int res
= make_request(req
, perms
);
7830 ldout(cct
, 10) << __func__
<< " result=" << res
<< dendl
;
7834 int Client::_getvxattr(
7836 const UserPerm
& perms
,
7837 const char *xattr_name
,
7842 if (!xattr_name
|| strlen(xattr_name
) <= 0 || strlen(xattr_name
) > 255) {
7843 return -CEPHFS_ENODATA
;
7846 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETVXATTR
);
7848 in
->make_nosnap_relative_path(path
);
7849 req
->set_filepath(path
);
7851 req
->set_string2(xattr_name
);
7854 int res
= make_request(req
, perms
, nullptr, nullptr, rank
, &bl
,
7855 CEPHFS_FEATURE_OP_GETVXATTR
);
7856 ldout(cct
, 10) << __func__
<< " result=" << res
<< dendl
;
7859 if (res
== -CEPHFS_EOPNOTSUPP
) {
7860 return -CEPHFS_ENODATA
;
7866 auto p
= bl
.cbegin();
7872 ssize_t len
= buf
.length();
7874 res
= len
; // refer to man getxattr(2) for output buffer size == 0
7878 res
= -CEPHFS_ERANGE
; // insufficient output buffer space
7880 memcpy(value
, buf
.c_str(), len
);
7886 int Client::_do_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
7887 const UserPerm
& perms
, InodeRef
*inp
,
7888 std::vector
<uint8_t>* aux
)
7890 int issued
= in
->caps_issued();
7891 union ceph_mds_request_args args
;
7892 bool kill_sguid
= false;
7897 auxsize
= aux
->size();
7899 ldout(cct
, 10) << __func__
<< " mask " << mask
<< " issued " <<
7900 ccap_string(issued
) << " aux size " << auxsize
<< dendl
;
7902 if (in
->snapid
!= CEPH_NOSNAP
) {
7903 return -CEPHFS_EROFS
;
7905 if ((mask
& CEPH_SETATTR_SIZE
) &&
7906 (uint64_t)stx
->stx_size
> in
->size
&&
7907 is_quota_bytes_exceeded(in
, (uint64_t)stx
->stx_size
- in
->size
,
7909 return -CEPHFS_EDQUOT
;
7912 // Can't set fscrypt_auth and file at the same time!
7913 if ((mask
& (CEPH_SETATTR_FSCRYPT_AUTH
|CEPH_SETATTR_FSCRYPT_FILE
)) ==
7914 (CEPH_SETATTR_FSCRYPT_AUTH
|CEPH_SETATTR_FSCRYPT_FILE
))
7915 return -CEPHFS_EINVAL
;
7917 if (!aux
&& (mask
& (CEPH_SETATTR_FSCRYPT_AUTH
|CEPH_SETATTR_FSCRYPT_FILE
)))
7918 return -CEPHFS_EINVAL
;
7920 memset(&args
, 0, sizeof(args
));
7922 // make the change locally?
7923 if ((in
->cap_dirtier_uid
>= 0 && perms
.uid() != in
->cap_dirtier_uid
) ||
7924 (in
->cap_dirtier_gid
>= 0 && perms
.gid() != in
->cap_dirtier_gid
)) {
7925 ldout(cct
, 10) << __func__
<< " caller " << perms
.uid() << ":" << perms
.gid()
7926 << " != cap dirtier " << in
->cap_dirtier_uid
<< ":"
7927 << in
->cap_dirtier_gid
<< ", forcing sync setattr"
7930 * This works because we implicitly flush the caps as part of the
7931 * request, so the cap update check will happen with the writeback
7932 * cap context, and then the setattr check will happen with the
7935 * In reality this pattern is likely pretty rare (different users
7936 * setattr'ing the same file). If that turns out not to be the
7937 * case later, we can build a more complex pipelined cap writeback
7940 mask
|= CEPH_SETATTR_CTIME
;
7944 // caller just needs us to bump the ctime
7945 in
->ctime
= ceph_clock_now();
7946 in
->cap_dirtier_uid
= perms
.uid();
7947 in
->cap_dirtier_gid
= perms
.gid();
7948 if (issued
& CEPH_CAP_AUTH_EXCL
)
7949 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7950 else if (issued
& CEPH_CAP_FILE_EXCL
)
7951 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
7952 else if (issued
& CEPH_CAP_XATTR_EXCL
)
7953 in
->mark_caps_dirty(CEPH_CAP_XATTR_EXCL
);
7955 mask
|= CEPH_SETATTR_CTIME
;
7958 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
7959 kill_sguid
= !!(mask
& CEPH_SETATTR_KILL_SGUID
);
7962 if (mask
& CEPH_SETATTR_UID
) {
7963 ldout(cct
,10) << "changing uid to " << stx
->stx_uid
<< dendl
;
7965 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
7966 in
->ctime
= ceph_clock_now();
7967 in
->cap_dirtier_uid
= perms
.uid();
7968 in
->cap_dirtier_gid
= perms
.gid();
7969 in
->uid
= stx
->stx_uid
;
7970 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7971 mask
&= ~CEPH_SETATTR_UID
;
7973 } else if (!in
->caps_issued_mask(CEPH_CAP_AUTH_SHARED
) ||
7974 in
->uid
!= stx
->stx_uid
) {
7975 args
.setattr
.uid
= stx
->stx_uid
;
7976 inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7978 mask
&= ~CEPH_SETATTR_UID
;
7982 if (mask
& CEPH_SETATTR_GID
) {
7983 ldout(cct
,10) << "changing gid to " << stx
->stx_gid
<< dendl
;
7985 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
7986 in
->ctime
= ceph_clock_now();
7987 in
->cap_dirtier_uid
= perms
.uid();
7988 in
->cap_dirtier_gid
= perms
.gid();
7989 in
->gid
= stx
->stx_gid
;
7990 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7991 mask
&= ~CEPH_SETATTR_GID
;
7993 } else if (!in
->caps_issued_mask(CEPH_CAP_AUTH_SHARED
) ||
7994 in
->gid
!= stx
->stx_gid
) {
7995 args
.setattr
.gid
= stx
->stx_gid
;
7996 inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7998 mask
&= ~CEPH_SETATTR_GID
;
8002 if (mask
& CEPH_SETATTR_MODE
) {
8003 ldout(cct
,10) << "changing mode to " << stx
->stx_mode
<< dendl
;
8005 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
8006 in
->ctime
= ceph_clock_now();
8007 in
->cap_dirtier_uid
= perms
.uid();
8008 in
->cap_dirtier_gid
= perms
.gid();
8009 in
->mode
= (in
->mode
& ~07777) | (stx
->stx_mode
& 07777);
8010 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
8011 mask
&= ~CEPH_SETATTR_MODE
;
8012 } else if (!in
->caps_issued_mask(CEPH_CAP_AUTH_SHARED
) ||
8013 in
->mode
!= stx
->stx_mode
) {
8014 args
.setattr
.mode
= stx
->stx_mode
;
8015 inode_drop
|= CEPH_CAP_AUTH_SHARED
;
8017 mask
&= ~CEPH_SETATTR_MODE
;
8019 } else if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
) && S_ISREG(in
->mode
)) {
8020 if (kill_sguid
&& (in
->mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
8021 in
->mode
&= ~(S_ISUID
|S_ISGID
);
8023 if (mask
& CEPH_SETATTR_KILL_SUID
) {
8024 in
->mode
&= ~S_ISUID
;
8026 if (mask
& CEPH_SETATTR_KILL_SGID
) {
8027 in
->mode
&= ~S_ISGID
;
8030 mask
&= ~(CEPH_SETATTR_KILL_SGUID
|CEPH_SETATTR_KILL_SUID
|CEPH_SETATTR_KILL_SGID
);
8031 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
8034 if (mask
& CEPH_SETATTR_BTIME
) {
8035 ldout(cct
,10) << "changing btime to " << in
->btime
<< dendl
;
8037 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
8038 in
->ctime
= ceph_clock_now();
8039 in
->cap_dirtier_uid
= perms
.uid();
8040 in
->cap_dirtier_gid
= perms
.gid();
8041 in
->btime
= utime_t(stx
->stx_btime
);
8042 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
8043 mask
&= ~CEPH_SETATTR_BTIME
;
8044 } else if (!in
->caps_issued_mask(CEPH_CAP_AUTH_SHARED
) ||
8045 in
->btime
!= utime_t(stx
->stx_btime
)) {
8046 args
.setattr
.btime
= utime_t(stx
->stx_btime
);
8047 inode_drop
|= CEPH_CAP_AUTH_SHARED
;
8049 mask
&= ~CEPH_SETATTR_BTIME
;
8053 if (mask
& CEPH_SETATTR_FSCRYPT_AUTH
) {
8054 ldout(cct
,10) << "resetting cached fscrypt_auth field. size now "
8055 << in
->fscrypt_auth
.size() << dendl
;
8057 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
8058 in
->ctime
= ceph_clock_now();
8059 in
->cap_dirtier_uid
= perms
.uid();
8060 in
->cap_dirtier_gid
= perms
.gid();
8061 in
->fscrypt_auth
= *aux
;
8062 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
8063 mask
&= ~CEPH_SETATTR_FSCRYPT_AUTH
;
8064 } else if (!in
->caps_issued_mask(CEPH_CAP_AUTH_SHARED
) ||
8065 in
->fscrypt_auth
!= *aux
) {
8066 inode_drop
|= CEPH_CAP_AUTH_SHARED
;
8068 mask
&= ~CEPH_SETATTR_FSCRYPT_AUTH
;
8072 if (mask
& CEPH_SETATTR_SIZE
) {
8073 if ((uint64_t)stx
->stx_size
>= mdsmap
->get_max_filesize()) {
8075 ldout(cct
,10) << "unable to set size to " << stx
->stx_size
<< ". Too large!" << dendl
;
8076 return -CEPHFS_EFBIG
;
8079 ldout(cct
,10) << "changing size to " << stx
->stx_size
<< dendl
;
8080 if (in
->caps_issued_mask(CEPH_CAP_FILE_EXCL
) &&
8081 !(mask
& CEPH_SETATTR_KILL_SGUID
) &&
8082 stx
->stx_size
>= in
->size
) {
8083 if (stx
->stx_size
> in
->size
) {
8084 in
->size
= in
->reported_size
= stx
->stx_size
;
8085 in
->cap_dirtier_uid
= perms
.uid();
8086 in
->cap_dirtier_gid
= perms
.gid();
8087 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
8088 mask
&= ~(CEPH_SETATTR_SIZE
);
8089 mask
|= CEPH_SETATTR_MTIME
;
8091 // ignore it when size doesn't change
8092 mask
&= ~(CEPH_SETATTR_SIZE
);
8095 args
.setattr
.size
= stx
->stx_size
;
8096 inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
8101 if (mask
& CEPH_SETATTR_FSCRYPT_FILE
) {
8102 ldout(cct
,10) << "resetting cached fscrypt_file field. size now "
8103 << in
->fscrypt_file
.size() << dendl
;
8105 if (in
->caps_issued_mask(CEPH_CAP_FILE_EXCL
)) {
8106 in
->ctime
= ceph_clock_now();
8107 in
->cap_dirtier_uid
= perms
.uid();
8108 in
->cap_dirtier_gid
= perms
.gid();
8109 in
->fscrypt_file
= *aux
;
8110 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
8111 mask
&= ~CEPH_SETATTR_FSCRYPT_FILE
;
8112 } else if (!in
->caps_issued_mask(CEPH_CAP_FILE_SHARED
) ||
8113 in
->fscrypt_file
!= *aux
) {
8114 inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_WR
;
8116 mask
&= ~CEPH_SETATTR_FSCRYPT_FILE
;
8120 if (mask
& CEPH_SETATTR_MTIME
) {
8121 if (in
->caps_issued_mask(CEPH_CAP_FILE_EXCL
)) {
8122 in
->mtime
= utime_t(stx
->stx_mtime
);
8123 in
->ctime
= ceph_clock_now();
8124 in
->cap_dirtier_uid
= perms
.uid();
8125 in
->cap_dirtier_gid
= perms
.gid();
8126 in
->time_warp_seq
++;
8127 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
8128 mask
&= ~CEPH_SETATTR_MTIME
;
8129 } else if (in
->caps_issued_mask(CEPH_CAP_FILE_WR
) &&
8130 utime_t(stx
->stx_mtime
) > in
->mtime
) {
8131 in
->mtime
= utime_t(stx
->stx_mtime
);
8132 in
->ctime
= ceph_clock_now();
8133 in
->cap_dirtier_uid
= perms
.uid();
8134 in
->cap_dirtier_gid
= perms
.gid();
8135 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
8136 mask
&= ~CEPH_SETATTR_MTIME
;
8137 } else if (!in
->caps_issued_mask(CEPH_CAP_FILE_SHARED
) ||
8138 in
->mtime
!= utime_t(stx
->stx_mtime
)) {
8139 args
.setattr
.mtime
= utime_t(stx
->stx_mtime
);
8140 inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
8143 mask
&= ~CEPH_SETATTR_MTIME
;
8147 if (mask
& CEPH_SETATTR_ATIME
) {
8148 if (in
->caps_issued_mask(CEPH_CAP_FILE_EXCL
)) {
8149 in
->atime
= utime_t(stx
->stx_atime
);
8150 in
->ctime
= ceph_clock_now();
8151 in
->cap_dirtier_uid
= perms
.uid();
8152 in
->cap_dirtier_gid
= perms
.gid();
8153 in
->time_warp_seq
++;
8154 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
8155 mask
&= ~CEPH_SETATTR_ATIME
;
8156 } else if (in
->caps_issued_mask(CEPH_CAP_FILE_WR
) &&
8157 utime_t(stx
->stx_atime
) > in
->atime
) {
8158 in
->atime
= utime_t(stx
->stx_atime
);
8159 in
->ctime
= ceph_clock_now();
8160 in
->cap_dirtier_uid
= perms
.uid();
8161 in
->cap_dirtier_gid
= perms
.gid();
8162 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
8163 mask
&= ~CEPH_SETATTR_ATIME
;
8164 } else if (!in
->caps_issued_mask(CEPH_CAP_FILE_SHARED
) ||
8165 in
->atime
!= utime_t(stx
->stx_atime
)) {
8166 args
.setattr
.atime
= utime_t(stx
->stx_atime
);
8167 inode_drop
|= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_RD
|
8170 mask
&= ~CEPH_SETATTR_ATIME
;
8176 if (in
->is_dir() && in
->snapid
== CEPH_NOSNAP
) {
8177 vinodeno_t
vino(in
->ino
, CEPH_SNAPDIR
);
8178 if (inode_map
.count(vino
)) {
8179 refresh_snapdir_attrs(inode_map
[vino
], in
);
8185 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETATTR
);
8189 in
->make_nosnap_relative_path(path
);
8190 req
->set_filepath(path
);
8193 req
->head
.args
= args
;
8194 req
->inode_drop
= inode_drop
;
8195 if (mask
& CEPH_SETATTR_FSCRYPT_AUTH
) {
8196 req
->fscrypt_auth
= *aux
;
8197 } else if (mask
& CEPH_SETATTR_FSCRYPT_FILE
) {
8198 req
->fscrypt_file
= *aux
;
8200 req
->head
.args
.setattr
.mask
= mask
;
8201 req
->regetattr_mask
= mask
;
8203 int res
= make_request(req
, perms
, inp
);
8204 ldout(cct
, 10) << "_setattr result=" << res
<< dendl
;
8208 /* Note that we only care about attrs that setattr cares about */
8209 void Client::stat_to_statx(struct stat
*st
, struct ceph_statx
*stx
)
8211 stx
->stx_size
= st
->st_size
;
8212 stx
->stx_mode
= st
->st_mode
;
8213 stx
->stx_uid
= st
->st_uid
;
8214 stx
->stx_gid
= st
->st_gid
;
8216 stx
->stx_mtime
= st
->st_mtimespec
;
8217 stx
->stx_atime
= st
->st_atimespec
;
8219 stx
->stx_mtime
.tv_sec
= st
->st_mtime
;
8220 stx
->stx_mtime
.tv_nsec
= 0;
8221 stx
->stx_atime
.tv_sec
= st
->st_atime
;
8222 stx
->stx_atime
.tv_nsec
= 0;
8224 stx
->stx_mtime
= st
->st_mtim
;
8225 stx
->stx_atime
= st
->st_atim
;
8229 int Client::__setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
8230 const UserPerm
& perms
, InodeRef
*inp
)
8232 if (mask
& CEPH_SETATTR_SIZE
) {
8233 mask
|= clear_suid_sgid(in
, perms
, true);
8236 int ret
= _do_setattr(in
, stx
, mask
, perms
, inp
);
8239 if (mask
& CEPH_SETATTR_MODE
)
8240 ret
= _posix_acl_chmod(in
, stx
->stx_mode
, perms
);
8244 int Client::_setattrx(InodeRef
&in
, struct ceph_statx
*stx
, int mask
,
8245 const UserPerm
& perms
)
8247 mask
&= (CEPH_SETATTR_MODE
| CEPH_SETATTR_UID
|
8248 CEPH_SETATTR_GID
| CEPH_SETATTR_MTIME
|
8249 CEPH_SETATTR_ATIME
| CEPH_SETATTR_SIZE
|
8250 CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
);
8251 if (cct
->_conf
->client_permissions
) {
8252 int r
= may_setattr(in
.get(), stx
, mask
, perms
);
8256 return __setattrx(in
.get(), stx
, mask
, perms
);
8259 int Client::_setattr(InodeRef
&in
, struct stat
*attr
, int mask
,
8260 const UserPerm
& perms
)
8262 struct ceph_statx stx
;
8264 stat_to_statx(attr
, &stx
);
8265 mask
&= ~CEPH_SETATTR_BTIME
;
8267 if ((mask
& CEPH_SETATTR_UID
) && attr
->st_uid
== static_cast<uid_t
>(-1)) {
8268 mask
&= ~CEPH_SETATTR_UID
;
8270 if ((mask
& CEPH_SETATTR_GID
) && attr
->st_gid
== static_cast<uid_t
>(-1)) {
8271 mask
&= ~CEPH_SETATTR_GID
;
8274 return _setattrx(in
, &stx
, mask
, perms
);
8277 int Client::setattr(const char *relpath
, struct stat
*attr
, int mask
,
8278 const UserPerm
& perms
)
8280 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8281 if (!mref_reader
.is_state_satisfied())
8282 return -CEPHFS_ENOTCONN
;
8284 tout(cct
) << __func__
<< std::endl
;
8285 tout(cct
) << relpath
<< std::endl
;
8286 tout(cct
) << mask
<< std::endl
;
8288 filepath
path(relpath
);
8291 std::scoped_lock
lock(client_lock
);
8292 int r
= path_walk(path
, &in
, perms
);
8295 return _setattr(in
, attr
, mask
, perms
);
8298 int Client::setattrx(const char *relpath
, struct ceph_statx
*stx
, int mask
,
8299 const UserPerm
& perms
, int flags
)
8301 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8302 if (!mref_reader
.is_state_satisfied())
8303 return -CEPHFS_ENOTCONN
;
8305 tout(cct
) << __func__
<< std::endl
;
8306 tout(cct
) << relpath
<< std::endl
;
8307 tout(cct
) << mask
<< std::endl
;
8309 filepath
path(relpath
);
8312 std::scoped_lock
lock(client_lock
);
8313 int r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
));
8316 return _setattrx(in
, stx
, mask
, perms
);
8319 int Client::fsetattr(int fd
, struct stat
*attr
, int mask
, const UserPerm
& perms
)
8321 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8322 if (!mref_reader
.is_state_satisfied())
8323 return -CEPHFS_ENOTCONN
;
8325 tout(cct
) << __func__
<< std::endl
;
8326 tout(cct
) << fd
<< std::endl
;
8327 tout(cct
) << mask
<< std::endl
;
8329 std::scoped_lock
lock(client_lock
);
8330 Fh
*f
= get_filehandle(fd
);
8332 return -CEPHFS_EBADF
;
8333 #if defined(__linux__) && defined(O_PATH)
8334 if (f
->flags
& O_PATH
)
8335 return -CEPHFS_EBADF
;
8337 return _setattr(f
->inode
, attr
, mask
, perms
);
8340 int Client::fsetattrx(int fd
, struct ceph_statx
*stx
, int mask
, const UserPerm
& perms
)
8342 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8343 if (!mref_reader
.is_state_satisfied())
8344 return -CEPHFS_ENOTCONN
;
8346 tout(cct
) << __func__
<< std::endl
;
8347 tout(cct
) << fd
<< std::endl
;
8348 tout(cct
) << mask
<< std::endl
;
8350 std::scoped_lock
lock(client_lock
);
8351 Fh
*f
= get_filehandle(fd
);
8353 return -CEPHFS_EBADF
;
8354 #if defined(__linux__) && defined(O_PATH)
8355 if (f
->flags
& O_PATH
)
8356 return -CEPHFS_EBADF
;
8358 return _setattrx(f
->inode
, stx
, mask
, perms
);
8361 int Client::stat(const char *relpath
, struct stat
*stbuf
, const UserPerm
& perms
,
8362 frag_info_t
*dirstat
, int mask
)
8364 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8365 if (!mref_reader
.is_state_satisfied())
8366 return -CEPHFS_ENOTCONN
;
8368 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
8369 tout(cct
) << "stat" << std::endl
;
8370 tout(cct
) << relpath
<< std::endl
;
8372 filepath
path(relpath
);
8375 std::scoped_lock
lock(client_lock
);
8376 int r
= path_walk(path
, &in
, perms
, true, mask
);
8379 r
= _getattr(in
, mask
, perms
);
8381 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
8384 fill_stat(in
, stbuf
, dirstat
);
8385 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
8389 unsigned Client::statx_to_mask(unsigned int flags
, unsigned int want
)
8393 /* The AT_STATX_FORCE_SYNC is always in higher priority than AT_STATX_DONT_SYNC. */
8394 if ((flags
& AT_STATX_SYNC_TYPE
) == AT_STATX_DONT_SYNC
)
8397 /* Always set PIN to distinguish from AT_STATX_DONT_SYNC case */
8398 mask
|= CEPH_CAP_PIN
;
8399 if (want
& (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
8400 mask
|= CEPH_CAP_AUTH_SHARED
;
8401 if (want
& (CEPH_STATX_NLINK
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
8402 mask
|= CEPH_CAP_LINK_SHARED
;
8403 if (want
& (CEPH_STATX_NLINK
|CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|CEPH_STATX_CTIME
|CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
|CEPH_STATX_VERSION
))
8404 mask
|= CEPH_CAP_FILE_SHARED
;
8405 if (want
& (CEPH_STATX_VERSION
|CEPH_STATX_CTIME
))
8406 mask
|= CEPH_CAP_XATTR_SHARED
;
8411 int Client::statx(const char *relpath
, struct ceph_statx
*stx
,
8412 const UserPerm
& perms
,
8413 unsigned int want
, unsigned int flags
)
8415 return statxat(CEPHFS_AT_FDCWD
, relpath
, stx
, perms
, want
, flags
);
8418 int Client::lstat(const char *relpath
, struct stat
*stbuf
,
8419 const UserPerm
& perms
, frag_info_t
*dirstat
, int mask
)
8421 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8422 if (!mref_reader
.is_state_satisfied())
8423 return -CEPHFS_ENOTCONN
;
8425 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
8426 tout(cct
) << __func__
<< std::endl
;
8427 tout(cct
) << relpath
<< std::endl
;
8429 filepath
path(relpath
);
8432 std::scoped_lock
lock(client_lock
);
8433 // don't follow symlinks
8434 int r
= path_walk(path
, &in
, perms
, false, mask
);
8437 r
= _getattr(in
, mask
, perms
);
8439 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
8442 fill_stat(in
, stbuf
, dirstat
);
8443 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
8447 int Client::fill_stat(Inode
*in
, struct stat
*st
, frag_info_t
*dirstat
, nest_info_t
*rstat
)
8449 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
8450 << " mode 0" << oct
<< in
->mode
<< dec
8451 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
8452 memset(st
, 0, sizeof(struct stat
));
8453 if (use_faked_inos())
8454 st
->st_ino
= in
->faked_ino
;
8456 st
->st_ino
= in
->ino
;
8457 st
->st_dev
= in
->snapid
;
8458 st
->st_mode
= in
->mode
;
8459 st
->st_rdev
= in
->rdev
;
8461 switch (in
->nlink
) {
8463 st
->st_nlink
= 0; /* dir is unlinked */
8466 st
->st_nlink
= 1 /* parent dentry */
8468 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
8474 st
->st_nlink
= in
->nlink
;
8476 st
->st_uid
= in
->uid
;
8477 st
->st_gid
= in
->gid
;
8478 if (in
->ctime
> in
->mtime
) {
8479 stat_set_ctime_sec(st
, in
->ctime
.sec());
8480 stat_set_ctime_nsec(st
, in
->ctime
.nsec());
8482 stat_set_ctime_sec(st
, in
->mtime
.sec());
8483 stat_set_ctime_nsec(st
, in
->mtime
.nsec());
8485 stat_set_atime_sec(st
, in
->atime
.sec());
8486 stat_set_atime_nsec(st
, in
->atime
.nsec());
8487 stat_set_mtime_sec(st
, in
->mtime
.sec());
8488 stat_set_mtime_nsec(st
, in
->mtime
.nsec());
8490 if (cct
->_conf
->client_dirsize_rbytes
) {
8491 st
->st_size
= in
->rstat
.rbytes
;
8492 } else if (in
->snapid
== CEPH_SNAPDIR
) {
8493 SnapRealm
*realm
= get_snap_realm_maybe(in
->vino().ino
);
8495 st
->st_size
= realm
->my_snaps
.size();
8496 put_snap_realm(realm
);
8499 st
->st_size
= in
->dirstat
.size();
8501 // The Windows "stat" structure provides just a subset of the fields that are
8502 // available on Linux.
8507 st
->st_size
= in
->size
;
8509 st
->st_blocks
= (in
->size
+ 511) >> 9;
8513 st
->st_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
8517 *dirstat
= in
->dirstat
;
8521 return in
->caps_issued();
8524 void Client::fill_statx(Inode
*in
, unsigned int mask
, struct ceph_statx
*stx
)
8526 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
8527 << " mode 0" << oct
<< in
->mode
<< dec
8528 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< " change_attr " << in
->change_attr
<< dendl
;
8529 memset(stx
, 0, sizeof(struct ceph_statx
));
8532 * If mask is 0, then the caller set AT_STATX_DONT_SYNC. Reset the mask
8533 * so that all bits are set.
8538 /* These are always considered to be available */
8539 stx
->stx_dev
= in
->snapid
;
8540 stx
->stx_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
8542 /* Type bits are always set, even when CEPH_STATX_MODE is not */
8543 stx
->stx_mode
= S_IFMT
& in
->mode
;
8544 stx
->stx_ino
= use_faked_inos() ? in
->faked_ino
: (uint64_t)in
->ino
;
8545 stx
->stx_rdev
= in
->rdev
;
8546 stx
->stx_mask
|= (CEPH_STATX_INO
|CEPH_STATX_RDEV
);
8548 if (mask
& CEPH_CAP_AUTH_SHARED
) {
8549 stx
->stx_uid
= in
->uid
;
8550 stx
->stx_gid
= in
->gid
;
8551 stx
->stx_mode
= in
->mode
;
8552 in
->btime
.to_timespec(&stx
->stx_btime
);
8553 stx
->stx_mask
|= (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
);
8556 if (mask
& CEPH_CAP_LINK_SHARED
) {
8558 switch (in
->nlink
) {
8560 stx
->stx_nlink
= 0; /* dir is unlinked */
8563 stx
->stx_nlink
= 1 /* parent dentry */
8565 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
8571 stx
->stx_nlink
= in
->nlink
;
8573 stx
->stx_mask
|= CEPH_STATX_NLINK
;
8576 if (mask
& CEPH_CAP_FILE_SHARED
) {
8578 in
->atime
.to_timespec(&stx
->stx_atime
);
8579 in
->mtime
.to_timespec(&stx
->stx_mtime
);
8582 if (cct
->_conf
->client_dirsize_rbytes
) {
8583 stx
->stx_size
= in
->rstat
.rbytes
;
8584 } else if (in
->snapid
== CEPH_SNAPDIR
) {
8585 SnapRealm
*realm
= get_snap_realm_maybe(in
->vino().ino
);
8587 stx
->stx_size
= realm
->my_snaps
.size();
8588 put_snap_realm(realm
);
8591 stx
->stx_size
= in
->dirstat
.size();
8593 stx
->stx_blocks
= 1;
8595 stx
->stx_size
= in
->size
;
8596 stx
->stx_blocks
= (in
->size
+ 511) >> 9;
8598 stx
->stx_mask
|= (CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|
8599 CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
);
8602 /* Change time and change_attr both require all shared caps to view */
8603 if ((mask
& CEPH_STAT_CAP_INODE_ALL
) == CEPH_STAT_CAP_INODE_ALL
) {
8604 stx
->stx_version
= in
->change_attr
;
8605 if (in
->ctime
> in
->mtime
)
8606 in
->ctime
.to_timespec(&stx
->stx_ctime
);
8608 in
->mtime
.to_timespec(&stx
->stx_ctime
);
8609 stx
->stx_mask
|= (CEPH_STATX_CTIME
|CEPH_STATX_VERSION
);
/* Touch a dentry. Body lines are not visible in this extraction —
 * presumably promotes the dentry in the client LRU; verify against
 * upstream Client.cc before relying on this. */
8614 void Client::touch_dn(Dentry
*dn
)
8619 int Client::chmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
8621 return chmodat(CEPHFS_AT_FDCWD
, relpath
, mode
, 0, perms
);
8624 int Client::fchmod(int fd
, mode_t mode
, const UserPerm
& perms
)
8626 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8627 if (!mref_reader
.is_state_satisfied())
8628 return -CEPHFS_ENOTCONN
;
8630 tout(cct
) << __func__
<< std::endl
;
8631 tout(cct
) << fd
<< std::endl
;
8632 tout(cct
) << mode
<< std::endl
;
8634 std::scoped_lock
lock(client_lock
);
8635 Fh
*f
= get_filehandle(fd
);
8637 return -CEPHFS_EBADF
;
8638 #if defined(__linux__) && defined(O_PATH)
8639 if (f
->flags
& O_PATH
)
8640 return -CEPHFS_EBADF
;
8643 attr
.st_mode
= mode
;
8644 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MODE
, perms
);
8647 int Client::chmodat(int dirfd
, const char *relpath
, mode_t mode
, int flags
,
8648 const UserPerm
& perms
) {
8649 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8650 if (!mref_reader
.is_state_satisfied()) {
8651 return -CEPHFS_ENOTCONN
;
8654 tout(cct
) << __func__
<< std::endl
;
8655 tout(cct
) << dirfd
<< std::endl
;
8656 tout(cct
) << relpath
<< std::endl
;
8657 tout(cct
) << mode
<< std::endl
;
8658 tout(cct
) << flags
<< std::endl
;
8660 filepath
path(relpath
);
8664 std::scoped_lock
lock(client_lock
);
8665 int r
= get_fd_inode(dirfd
, &dirinode
);
8670 r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), 0, dirinode
);
8675 attr
.st_mode
= mode
;
8676 return _setattr(in
, &attr
, CEPH_SETATTR_MODE
, perms
);
8679 int Client::lchmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
8681 return chmodat(CEPHFS_AT_FDCWD
, relpath
, mode
, AT_SYMLINK_NOFOLLOW
, perms
);
8684 int Client::chown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
8685 const UserPerm
& perms
)
8687 return chownat(CEPHFS_AT_FDCWD
, relpath
, new_uid
, new_gid
, 0, perms
);
8690 int Client::fchown(int fd
, uid_t new_uid
, gid_t new_gid
, const UserPerm
& perms
)
8692 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8693 if (!mref_reader
.is_state_satisfied())
8694 return -CEPHFS_ENOTCONN
;
8696 tout(cct
) << __func__
<< std::endl
;
8697 tout(cct
) << fd
<< std::endl
;
8698 tout(cct
) << new_uid
<< std::endl
;
8699 tout(cct
) << new_gid
<< std::endl
;
8701 std::scoped_lock
lock(client_lock
);
8702 Fh
*f
= get_filehandle(fd
);
8704 return -CEPHFS_EBADF
;
8705 #if defined(__linux__) && defined(O_PATH)
8706 if (f
->flags
& O_PATH
)
8707 return -CEPHFS_EBADF
;
8710 attr
.st_uid
= new_uid
;
8711 attr
.st_gid
= new_gid
;
8713 if (new_uid
!= static_cast<uid_t
>(-1)) mask
|= CEPH_SETATTR_UID
;
8714 if (new_gid
!= static_cast<gid_t
>(-1)) mask
|= CEPH_SETATTR_GID
;
8715 return _setattr(f
->inode
, &attr
, mask
, perms
);
8718 int Client::lchown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
8719 const UserPerm
& perms
)
8721 return chownat(CEPHFS_AT_FDCWD
, relpath
, new_uid
, new_gid
, AT_SYMLINK_NOFOLLOW
, perms
);
8724 int Client::chownat(int dirfd
, const char *relpath
, uid_t new_uid
, gid_t new_gid
,
8725 int flags
, const UserPerm
& perms
) {
8726 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8727 if (!mref_reader
.is_state_satisfied()) {
8728 return -CEPHFS_ENOTCONN
;
8731 tout(cct
) << __func__
<< std::endl
;
8732 tout(cct
) << dirfd
<< std::endl
;
8733 tout(cct
) << relpath
<< std::endl
;
8734 tout(cct
) << new_uid
<< std::endl
;
8735 tout(cct
) << new_gid
<< std::endl
;
8736 tout(cct
) << flags
<< std::endl
;
8738 filepath
path(relpath
);
8742 std::scoped_lock
lock(client_lock
);
8743 int r
= get_fd_inode(dirfd
, &dirinode
);
8748 r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), 0, dirinode
);
8753 attr
.st_uid
= new_uid
;
8754 attr
.st_gid
= new_gid
;
8755 return _setattr(in
, &attr
, CEPH_SETATTR_UID
|CEPH_SETATTR_GID
, perms
);
8758 static void attr_set_atime_and_mtime(struct stat
*attr
,
8759 const utime_t
&atime
,
8760 const utime_t
&mtime
)
8762 stat_set_atime_sec(attr
, atime
.tv
.tv_sec
);
8763 stat_set_atime_nsec(attr
, atime
.tv
.tv_nsec
);
8764 stat_set_mtime_sec(attr
, mtime
.tv
.tv_sec
);
8765 stat_set_mtime_nsec(attr
, mtime
.tv
.tv_nsec
);
8768 // for [l]utime() invoke the timeval variant as the timespec
8769 // variant are not yet implemented. for futime[s](), invoke
8770 // the timespec variant.
8771 int Client::utime(const char *relpath
, struct utimbuf
*buf
,
8772 const UserPerm
& perms
)
8774 struct timeval tv
[2];
8775 tv
[0].tv_sec
= buf
->actime
;
8777 tv
[1].tv_sec
= buf
->modtime
;
8780 return utimes(relpath
, tv
, perms
);
8783 int Client::lutime(const char *relpath
, struct utimbuf
*buf
,
8784 const UserPerm
& perms
)
8786 struct timeval tv
[2];
8787 tv
[0].tv_sec
= buf
->actime
;
8789 tv
[1].tv_sec
= buf
->modtime
;
8792 return lutimes(relpath
, tv
, perms
);
8795 int Client::futime(int fd
, struct utimbuf
*buf
, const UserPerm
& perms
)
8797 struct timespec ts
[2];
8798 ts
[0].tv_sec
= buf
->actime
;
8800 ts
[1].tv_sec
= buf
->modtime
;
8803 return futimens(fd
, ts
, perms
);
8806 int Client::utimes(const char *relpath
, struct timeval times
[2],
8807 const UserPerm
& perms
)
8809 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8810 if (!mref_reader
.is_state_satisfied())
8811 return -CEPHFS_ENOTCONN
;
8813 tout(cct
) << __func__
<< std::endl
;
8814 tout(cct
) << relpath
<< std::endl
;
8815 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
8817 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
8820 filepath
path(relpath
);
8823 std::scoped_lock
lock(client_lock
);
8824 int r
= path_walk(path
, &in
, perms
);
8827 struct ceph_statx attr
;
8828 utime_t(times
[0]).to_timespec(&attr
.stx_atime
);
8829 utime_t(times
[1]).to_timespec(&attr
.stx_mtime
);
8831 return _setattrx(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
8834 int Client::lutimes(const char *relpath
, struct timeval times
[2],
8835 const UserPerm
& perms
)
8837 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8838 if (!mref_reader
.is_state_satisfied())
8839 return -CEPHFS_ENOTCONN
;
8841 tout(cct
) << __func__
<< std::endl
;
8842 tout(cct
) << relpath
<< std::endl
;
8843 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
8845 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
8848 filepath
path(relpath
);
8851 std::scoped_lock
lock(client_lock
);
8852 int r
= path_walk(path
, &in
, perms
, false);
8855 struct ceph_statx attr
;
8856 utime_t(times
[0]).to_timespec(&attr
.stx_atime
);
8857 utime_t(times
[1]).to_timespec(&attr
.stx_mtime
);
8859 return _setattrx(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
8862 int Client::futimes(int fd
, struct timeval times
[2], const UserPerm
& perms
)
8864 struct timespec ts
[2];
8865 ts
[0].tv_sec
= times
[0].tv_sec
;
8866 ts
[0].tv_nsec
= times
[0].tv_usec
* 1000;
8867 ts
[1].tv_sec
= times
[1].tv_sec
;
8868 ts
[1].tv_nsec
= times
[1].tv_usec
* 1000;
8870 return futimens(fd
, ts
, perms
);
8873 int Client::futimens(int fd
, struct timespec times
[2], const UserPerm
& perms
)
8875 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8876 if (!mref_reader
.is_state_satisfied())
8877 return -CEPHFS_ENOTCONN
;
8879 tout(cct
) << __func__
<< std::endl
;
8880 tout(cct
) << fd
<< std::endl
;
8881 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_nsec
8883 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_nsec
8886 std::scoped_lock
lock(client_lock
);
8887 Fh
*f
= get_filehandle(fd
);
8889 return -CEPHFS_EBADF
;
8890 #if defined(__linux__) && defined(O_PATH)
8891 if (f
->flags
& O_PATH
)
8892 return -CEPHFS_EBADF
;
8894 struct ceph_statx attr
;
8895 utime_t(times
[0]).to_timespec(&attr
.stx_atime
);
8896 utime_t(times
[1]).to_timespec(&attr
.stx_mtime
);
8898 return _setattrx(f
->inode
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
8901 int Client::utimensat(int dirfd
, const char *relpath
, struct timespec times
[2], int flags
,
8902 const UserPerm
& perms
) {
8903 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8904 if (!mref_reader
.is_state_satisfied()) {
8905 return -CEPHFS_ENOTCONN
;
8908 tout(cct
) << __func__
<< std::endl
;
8909 tout(cct
) << dirfd
<< std::endl
;
8910 tout(cct
) << relpath
<< std::endl
;
8911 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_nsec
8913 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_nsec
8915 tout(cct
) << flags
<< std::endl
;
8917 filepath
path(relpath
);
8921 std::scoped_lock
lock(client_lock
);
8922 int r
= get_fd_inode(dirfd
, &dirinode
);
8927 #if defined(__linux__) && defined(O_PATH)
8928 if (flags
& O_PATH
) {
8929 return -CEPHFS_EBADF
;
8933 r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), 0, dirinode
);
8937 struct ceph_statx attr
;
8938 utime_t(times
[0]).to_timespec(&attr
.stx_atime
);
8939 utime_t(times
[1]).to_timespec(&attr
.stx_mtime
);
8941 return _setattrx(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
8944 int Client::flock(int fd
, int operation
, uint64_t owner
)
8946 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8947 if (!mref_reader
.is_state_satisfied())
8948 return -CEPHFS_ENOTCONN
;
8950 tout(cct
) << __func__
<< std::endl
;
8951 tout(cct
) << fd
<< std::endl
;
8952 tout(cct
) << operation
<< std::endl
;
8953 tout(cct
) << owner
<< std::endl
;
8955 std::scoped_lock
lock(client_lock
);
8956 Fh
*f
= get_filehandle(fd
);
8958 return -CEPHFS_EBADF
;
8960 return _flock(f
, operation
, owner
);
8963 int Client::opendir(const char *relpath
, dir_result_t
**dirpp
, const UserPerm
& perms
)
8965 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8966 if (!mref_reader
.is_state_satisfied())
8967 return -CEPHFS_ENOTCONN
;
8969 tout(cct
) << __func__
<< std::endl
;
8970 tout(cct
) << relpath
<< std::endl
;
8972 filepath
path(relpath
);
8975 std::scoped_lock
lock(client_lock
);
8976 int r
= path_walk(path
, &in
, perms
, true);
8979 if (cct
->_conf
->client_permissions
) {
8980 int r
= may_open(in
.get(), O_RDONLY
, perms
);
8984 r
= _opendir(in
.get(), dirpp
, perms
);
8985 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
8986 if (r
!= -CEPHFS_ENOTDIR
)
8987 tout(cct
) << (uintptr_t)*dirpp
<< std::endl
;
8991 int Client::fdopendir(int dirfd
, dir_result_t
**dirpp
, const UserPerm
&perms
) {
8992 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
8993 if (!mref_reader
.is_state_satisfied()) {
8994 return -CEPHFS_ENOTCONN
;
8997 tout(cct
) << __func__
<< std::endl
;
8998 tout(cct
) << dirfd
<< std::endl
;
9001 std::scoped_lock
locker(client_lock
);
9002 int r
= get_fd_inode(dirfd
, &dirinode
);
9007 if (cct
->_conf
->client_permissions
) {
9008 r
= may_open(dirinode
.get(), O_RDONLY
, perms
);
9013 r
= _opendir(dirinode
.get(), dirpp
, perms
);
9014 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
9015 if (r
!= -CEPHFS_ENOTDIR
) {
9016 tout(cct
) << (uintptr_t)*dirpp
<< std::endl
;
9021 int Client::_opendir(Inode
*in
, dir_result_t
**dirpp
, const UserPerm
& perms
)
9024 return -CEPHFS_ENOTDIR
;
9025 *dirpp
= new dir_result_t(in
, perms
);
9026 opened_dirs
.insert(*dirpp
);
9027 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ") = " << 0 << " (" << *dirpp
<< ")" << dendl
;
9032 int Client::closedir(dir_result_t
*dir
)
9034 tout(cct
) << __func__
<< std::endl
;
9035 tout(cct
) << (uintptr_t)dir
<< std::endl
;
9037 ldout(cct
, 3) << __func__
<< "(" << dir
<< ") = 0" << dendl
;
9038 std::scoped_lock
lock(client_lock
);
9043 void Client::_closedir(dir_result_t
*dirp
)
9045 ldout(cct
, 10) << __func__
<< "(" << dirp
<< ")" << dendl
;
9048 ldout(cct
, 10) << __func__
<< " detaching inode " << dirp
->inode
<< dendl
;
9049 dirp
->inode
.reset();
9051 _readdir_drop_dirp_buffer(dirp
);
9052 opened_dirs
.erase(dirp
);
9056 void Client::rewinddir(dir_result_t
*dirp
)
9058 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ")" << dendl
;
9060 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9061 if (!mref_reader
.is_state_satisfied())
9064 std::scoped_lock
lock(client_lock
);
9065 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
9066 _readdir_drop_dirp_buffer(d
);
9070 loff_t
Client::telldir(dir_result_t
*dirp
)
9072 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
9073 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ") = " << d
->offset
<< dendl
;
9077 void Client::seekdir(dir_result_t
*dirp
, loff_t offset
)
9079 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ", " << offset
<< ")" << dendl
;
9081 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9082 if (!mref_reader
.is_state_satisfied())
9085 std::scoped_lock
lock(client_lock
);
9087 if (offset
== dirp
->offset
)
9090 if (offset
> dirp
->offset
)
9091 dirp
->release_count
= 0; // bump if we do a forward seek
9093 dirp
->ordered_count
= 0; // disable filling readdir cache
9095 if (dirp
->hash_order()) {
9096 if (dirp
->offset
> offset
) {
9097 _readdir_drop_dirp_buffer(dirp
);
9102 dirp
->buffer_frag
!= frag_t(dir_result_t::fpos_high(offset
)) ||
9103 dirp
->offset_low() > dir_result_t::fpos_low(offset
)) {
9104 _readdir_drop_dirp_buffer(dirp
);
9109 dirp
->offset
= offset
;
9114 // ino_t d_ino; /* inode number */
9115 // off_t d_off; /* offset to the next dirent */
9116 // unsigned short d_reclen; /* length of this record */
9117 // unsigned char d_type; /* type of file */
9118 // char d_name[256]; /* filename */
9120 void Client::fill_dirent(struct dirent
*de
, const char *name
, int type
, uint64_t ino
, loff_t next_off
)
9122 strncpy(de
->d_name
, name
, 255);
9123 de
->d_name
[255] = '\0';
9124 #if !defined(__CYGWIN__) && !(defined(_WIN32))
9126 #if !defined(__APPLE__) && !defined(__FreeBSD__)
9127 de
->d_off
= next_off
;
9130 de
->d_type
= IFTODT(type
);
9131 ldout(cct
, 10) << __func__
<< " '" << de
->d_name
<< "' -> " << inodeno_t(de
->d_ino
)
9132 << " type " << (int)de
->d_type
<< " w/ next_off " << hex
<< next_off
<< dec
<< dendl
;
9136 void Client::_readdir_next_frag(dir_result_t
*dirp
)
9138 frag_t fg
= dirp
->buffer_frag
;
9140 if (fg
.is_rightmost()) {
9141 ldout(cct
, 10) << __func__
<< " advance from " << fg
<< " to END" << dendl
;
9148 ldout(cct
, 10) << __func__
<< " advance from " << dirp
->buffer_frag
<< " to " << fg
<< dendl
;
9150 if (dirp
->hash_order()) {
9152 int64_t new_offset
= dir_result_t::make_fpos(fg
.value(), 2, true);
9153 if (dirp
->offset
< new_offset
) // don't decrease offset
9154 dirp
->offset
= new_offset
;
9156 dirp
->last_name
.clear();
9157 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
9158 _readdir_rechoose_frag(dirp
);
9162 void Client::_readdir_rechoose_frag(dir_result_t
*dirp
)
9164 ceph_assert(dirp
->inode
);
9166 if (dirp
->hash_order())
9169 frag_t cur
= frag_t(dirp
->offset_high());
9170 frag_t fg
= dirp
->inode
->dirfragtree
[cur
.value()];
9172 ldout(cct
, 10) << __func__
<< " frag " << cur
<< " maps to " << fg
<< dendl
;
9173 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
9174 dirp
->last_name
.clear();
9175 dirp
->next_offset
= 2;
9179 void Client::_readdir_drop_dirp_buffer(dir_result_t
*dirp
)
9181 ldout(cct
, 10) << __func__
<< " " << dirp
<< dendl
;
9182 dirp
->buffer
.clear();
9185 int Client::_readdir_get_frag(int op
, dir_result_t
* dirp
,
9186 fill_readdir_args_cb_t fill_req_cb
)
9189 ceph_assert(dirp
->inode
);
9191 // get the current frag.
9193 if (dirp
->hash_order())
9194 fg
= dirp
->inode
->dirfragtree
[dirp
->offset_high()];
9196 fg
= frag_t(dirp
->offset_high());
9198 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " fg " << fg
9199 << " offset " << hex
<< dirp
->offset
<< dec
<< dendl
;
9201 InodeRef
& diri
= dirp
->inode
;
9203 MetaRequest
*req
= new MetaRequest(op
);
9204 fill_req_cb(dirp
, req
, diri
, fg
);
9207 int res
= make_request(req
, dirp
->perms
, NULL
, NULL
, -1, &dirbl
);
9209 if (res
== -CEPHFS_EAGAIN
) {
9210 ldout(cct
, 10) << __func__
<< " got EAGAIN, retrying" << dendl
;
9211 _readdir_rechoose_frag(dirp
);
9212 return _readdir_get_frag(op
, dirp
, fill_req_cb
);
9216 ldout(cct
, 10) << __func__
<< " " << dirp
<< " got frag " << dirp
->buffer_frag
9217 << " size " << dirp
->buffer
.size() << dendl
;
9219 ldout(cct
, 10) << __func__
<< " got error " << res
<< ", setting end flag" << dendl
;
9226 struct dentry_off_lt
{
9227 bool operator()(const Dentry
* dn
, int64_t off
) const {
9228 return dir_result_t::fpos_cmp(dn
->offset
, off
) < 0;
9232 int Client::_readdir_cache_cb(dir_result_t
*dirp
, add_dirent_cb_t cb
, void *p
,
9233 int caps
, bool getref
)
9235 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
9236 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
9237 << " last_name " << dirp
->last_name
9238 << " offset " << hex
<< dirp
->offset
<< dec
9240 Dir
*dir
= dirp
->inode
->dir
;
9243 ldout(cct
, 10) << " dir is empty" << dendl
;
9248 vector
<Dentry
*>::iterator pd
= std::lower_bound(dir
->readdir_cache
.begin(),
9249 dir
->readdir_cache
.end(),
9250 dirp
->offset
, dentry_off_lt());
9255 if (!dirp
->inode
->is_complete_and_ordered())
9256 return -CEPHFS_EAGAIN
;
9257 if (pd
== dir
->readdir_cache
.end())
9260 if (dn
->inode
== NULL
) {
9261 ldout(cct
, 15) << " skipping null '" << dn
->name
<< "'" << dendl
;
9265 if (dn
->cap_shared_gen
!= dir
->parent_inode
->shared_gen
) {
9266 ldout(cct
, 15) << " skipping mismatch shared gen '" << dn
->name
<< "'" << dendl
;
9271 int idx
= pd
- dir
->readdir_cache
.begin();
9272 if (dn
->inode
->is_dir()) {
9273 mask
|= CEPH_STAT_RSTAT
;
9275 int r
= _getattr(dn
->inode
, mask
, dirp
->perms
);
9279 // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
9280 pd
= dir
->readdir_cache
.begin() + idx
;
9281 if (pd
>= dir
->readdir_cache
.end() || *pd
!= dn
)
9282 return -CEPHFS_EAGAIN
;
9284 struct ceph_statx stx
;
9286 fill_statx(dn
->inode
, caps
, &stx
);
9288 uint64_t next_off
= dn
->offset
+ 1;
9289 fill_dirent(&de
, dn
->name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
9291 if (pd
== dir
->readdir_cache
.end())
9292 next_off
= dir_result_t::END
;
9296 in
= dn
->inode
.get();
9300 dn_name
= dn
->name
; // fill in name while we have lock
9302 client_lock
.unlock();
9303 r
= cb(p
, &de
, &stx
, next_off
, in
); // _next_ offset
9305 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< dn
->offset
<< dec
9306 << " = " << r
<< dendl
;
9311 dirp
->offset
= next_off
;
9313 dirp
->next_offset
= 2;
9315 dirp
->next_offset
= dirp
->offset_low();
9316 dirp
->last_name
= dn_name
; // we successfully returned this one; update!
9317 dirp
->release_count
= 0; // last_name no longer match cache index
9322 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " at end" << dendl
;
9327 int Client::readdir_r_cb(dir_result_t
* d
,
9334 auto fill_readdir_cb
= [](dir_result_t
* dirp
,
9339 diri
->make_nosnap_relative_path(path
);
9340 req
->set_filepath(path
);
9341 req
->set_inode(diri
.get());
9342 req
->head
.args
.readdir
.frag
= fg
;
9343 req
->head
.args
.readdir
.flags
= CEPH_READDIR_REPLY_BITFLAGS
;
9344 if (dirp
->last_name
.length()) {
9345 req
->path2
.set_path(dirp
->last_name
);
9346 } else if (dirp
->hash_order()) {
9347 req
->head
.args
.readdir
.offset_hash
= dirp
->offset_high();
9351 int op
= CEPH_MDS_OP_READDIR
;
9352 if (d
->inode
&& d
->inode
->snapid
== CEPH_SNAPDIR
)
9353 op
= CEPH_MDS_OP_LSSNAP
;
9354 return _readdir_r_cb(op
,
9366 // NB: this is used for both readdir and readdir_snapdiff results processing
9367 // hence it should be request type agnostic
9369 int Client::_readdir_r_cb(int op
,
9372 fill_readdir_args_cb_t fill_cb
,
9379 int caps
= statx_to_mask(flags
, want
);
9381 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9382 if (!mref_reader
.is_state_satisfied())
9383 return -CEPHFS_ENOTCONN
;
9385 std::unique_lock
cl(client_lock
);
9387 dir_result_t
*dirp
= static_cast<dir_result_t
*>(d
);
9389 ldout(cct
, 10) << __func__
<< " " << *dirp
->inode
<< " offset " << hex
<< dirp
->offset
9390 << dec
<< " at_end=" << dirp
->at_end()
9391 << " hash_order=" << dirp
->hash_order() << dendl
;
9394 struct ceph_statx stx
;
9395 memset(&de
, 0, sizeof(de
));
9396 memset(&stx
, 0, sizeof(stx
));
9398 InodeRef
& diri
= dirp
->inode
;
9403 if (dirp
->offset
== 0) {
9404 ldout(cct
, 15) << " including ." << dendl
;
9405 ceph_assert(diri
->dentries
.size() < 2); // can't have multiple hard-links to a dir
9406 uint64_t next_off
= 1;
9409 r
= _getattr(diri
, caps
| CEPH_STAT_RSTAT
, dirp
->perms
);
9413 fill_statx(diri
, caps
, &stx
);
9414 fill_dirent(&de
, ".", S_IFDIR
, stx
.stx_ino
, next_off
);
9416 Inode
*inode
= NULL
;
9423 r
= cb(p
, &de
, &stx
, next_off
, inode
);
9428 dirp
->offset
= next_off
;
9432 if (dirp
->offset
== 1) {
9433 ldout(cct
, 15) << " including .." << dendl
;
9434 uint64_t next_off
= 2;
9436 if (diri
->dentries
.empty())
9439 in
= diri
->get_first_parent()->dir
->parent_inode
;
9442 r
= _getattr(in
, caps
| CEPH_STAT_RSTAT
, dirp
->perms
);
9446 fill_statx(in
, caps
, &stx
);
9447 fill_dirent(&de
, "..", S_IFDIR
, stx
.stx_ino
, next_off
);
9449 Inode
*inode
= NULL
;
9456 r
= cb(p
, &de
, &stx
, next_off
, inode
);
9461 dirp
->offset
= next_off
;
9466 // can we read from our cache?
9467 ldout(cct
, 10) << __func__
9468 << " offset " << hex
<< dirp
->offset
<< dec
9469 << " snapid " << dirp
->inode
->snapid
<< " (complete && ordered) "
9470 << dirp
->inode
->is_complete_and_ordered()
9471 << " issued " << ccap_string(dirp
->inode
->caps_issued())
9473 if (!bypass_cache
&&
9474 dirp
->inode
->snapid
!= CEPH_SNAPDIR
&&
9475 dirp
->inode
->is_complete_and_ordered() &&
9476 dirp
->inode
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
9477 int err
= _readdir_cache_cb(dirp
, cb
, p
, caps
, getref
);
9478 if (err
!= -CEPHFS_EAGAIN
)
9486 bool check_caps
= true;
9487 if (!dirp
->is_cached()) {
9488 int r
= _readdir_get_frag(op
, dirp
, fill_cb
);
9491 // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
9492 // different than the requested one. (our dirfragtree was outdated)
9495 frag_t fg
= dirp
->buffer_frag
;
9497 ldout(cct
, 10) << __func__
9498 << " frag " << fg
<< " buffer size " << dirp
->buffer
.size()
9499 << " offset " << hex
<< dirp
->offset
<< dendl
;
9501 for (auto it
= std::lower_bound(dirp
->buffer
.begin(), dirp
->buffer
.end(),
9502 dirp
->offset
, dir_result_t::dentry_off_lt());
9503 it
!= dirp
->buffer
.end();
9505 dir_result_t::dentry
&entry
= *it
;
9507 uint64_t next_off
= entry
.offset
+ 1;
9512 if(entry
.inode
->is_dir()){
9513 mask
|= CEPH_STAT_RSTAT
;
9515 r
= _getattr(entry
.inode
, mask
, dirp
->perms
);
9520 fill_statx(entry
.inode
, caps
, &stx
);
9521 fill_dirent(&de
, entry
.name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
9523 Inode
*inode
= NULL
;
9525 inode
= entry
.inode
.get();
9530 r
= cb(p
, &de
, &stx
, next_off
, inode
); // _next_ offset
9533 ldout(cct
, 15) << __func__
9534 << " de " << de
.d_name
<< " off " << hex
<< next_off
- 1 << dec
9535 << " snap " << entry
.inode
->snapid
9536 << " = " << r
<< dendl
;
9540 dirp
->offset
= next_off
;
9545 if (dirp
->next_offset
> 2) {
9546 ldout(cct
, 10) << " fetching next chunk of this frag" << dendl
;
9547 _readdir_drop_dirp_buffer(dirp
);
9551 if (!fg
.is_rightmost()) {
9553 _readdir_next_frag(dirp
);
9557 if (!bypass_cache
&&
9558 diri
->shared_gen
== dirp
->start_shared_gen
&&
9559 diri
->dir_release_count
== dirp
->release_count
) {
9560 if (diri
->dir_ordered_count
== dirp
->ordered_count
) {
9561 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
9563 ceph_assert(diri
->dir
->readdir_cache
.size() >= dirp
->cache_index
);
9564 diri
->dir
->readdir_cache
.resize(dirp
->cache_index
);
9566 diri
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
9568 ldout(cct
, 10) << " marking I_COMPLETE on " << *diri
<< dendl
;
9569 diri
->flags
|= I_COMPLETE
;
9581 int Client::readdir_r(dir_result_t
*d
, struct dirent
*de
)
9583 return readdirplus_r(d
, de
, 0, 0, 0, NULL
);
9590 * 1 if we got a dirent
9591 * 0 for end of directory
9595 struct single_readdir
{
9597 struct ceph_statx
*stx
;
9602 static int _readdir_single_dirent_cb(void *p
, struct dirent
*de
,
9603 struct ceph_statx
*stx
, off_t off
,
9606 single_readdir
*c
= static_cast<single_readdir
*>(p
);
9609 return -1; // already filled this dirent
9619 struct dirent
*Client::readdir(dir_result_t
*d
)
9629 // our callback fills the dirent and sets sr.full=true on first
9630 // call, and returns -1 the second time around.
9631 ret
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
);
9633 errno
= -ret
; // this sucks.
9634 return (dirent
*) NULL
;
9639 return (dirent
*) NULL
;
9642 int Client::readdirplus_r(dir_result_t
*d
, struct dirent
*de
,
9643 struct ceph_statx
*stx
, unsigned want
,
9644 unsigned flags
, Inode
**out
)
9652 // our callback fills the dirent and sets sr.full=true on first
9653 // call, and returns -1 the second time around.
9654 int r
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
, want
, flags
, out
);
9664 int Client::readdir_snapdiff(dir_result_t
* d1
, snapid_t snap2
,
9665 struct dirent
* out_de
,
9668 if (!d1
|| !d1
->inode
|| d1
->inode
->snapid
== snap2
) {
9669 lderr(cct
) << __func__
<< " invalid parameters: "
9671 << " d1->inode:" << (d1
? d1
->inode
: nullptr)
9672 << " snap2 id :" << snap2
9686 auto fill_snapdiff_cb
= [&](dir_result_t
* dirp
,
9691 diri
->make_nosnap_relative_path(path
);
9692 req
->set_filepath(path
);
9693 req
->set_inode(diri
.get());
9694 req
->head
.args
.snapdiff
.snap_other
= snap2
;
9695 req
->head
.args
.snapdiff
.frag
= fg
;
9696 req
->head
.args
.snapdiff
.flags
= CEPH_READDIR_REPLY_BITFLAGS
;
9697 if (dirp
->last_name
.length()) {
9698 req
->path2
.set_path(dirp
->last_name
);
9699 } else if (dirp
->hash_order()) {
9700 req
->head
.args
.snapdiff
.offset_hash
= dirp
->offset_high();
9705 // our callback fills the dirent and sets sr.full=true on first
9706 // call, and returns -1 the second time around.
9707 int ret
= _readdir_r_cb(CEPH_MDS_OP_READDIR_SNAPDIFF
,
9709 _readdir_single_dirent_cb
,
9717 lderr(cct
) << __func__
<< " error: "
9718 << cpp_strerror(ret
)
9720 errno
= -ret
; // this sucks.
9724 ldout(cct
, 15) << __func__
<< " " << ret
9725 << " " << sr
.de
->d_name
9726 << " " << stx
.stx_dev
9733 *out_snap
= stx
.stx_dev
;
9741 struct getdents_result
{
9748 static int _readdir_getdent_cb(void *p
, struct dirent
*de
,
9749 struct ceph_statx
*stx
, off_t off
, Inode
*in
)
9751 struct getdents_result
*c
= static_cast<getdents_result
*>(p
);
9757 dlen
= strlen(de
->d_name
) + 1;
9759 if (c
->pos
+ dlen
> c
->buflen
)
9760 return -1; // doesn't fit
9763 memcpy(c
->buf
+ c
->pos
, de
, sizeof(*de
));
9765 memcpy(c
->buf
+ c
->pos
, de
->d_name
, dlen
);
9771 int Client::_getdents(dir_result_t
*dir
, char *buf
, int buflen
, bool fullent
)
9776 gr
.fullent
= fullent
;
9779 int r
= readdir_r_cb(dir
, _readdir_getdent_cb
, (void *)&gr
);
9781 if (r
< 0) { // some error
9782 if (r
== -1) { // buffer ran out of space
9783 if (gr
.pos
) { // but we got some entries already!
9785 } // or we need a larger buffer
9786 return -CEPHFS_ERANGE
;
9787 } else { // actual error, return it
9796 struct getdir_result
{
9797 list
<string
> *contents
;
9801 static int _getdir_cb(void *p
, struct dirent
*de
, struct ceph_statx
*stx
, off_t off
, Inode
*in
)
9803 getdir_result
*r
= static_cast<getdir_result
*>(p
);
9805 r
->contents
->push_back(de
->d_name
);
9810 int Client::getdir(const char *relpath
, list
<string
>& contents
,
9811 const UserPerm
& perms
)
9813 ldout(cct
, 3) << "getdir(" << relpath
<< ")" << dendl
;
9814 tout(cct
) << "getdir" << std::endl
;
9815 tout(cct
) << relpath
<< std::endl
;
9818 int r
= opendir(relpath
, &d
, perms
);
9823 gr
.contents
= &contents
;
9825 r
= readdir_r_cb(d
, _getdir_cb
, (void *)&gr
);
9835 /****** file i/o **********/
9837 // common parts for open and openat. call with client_lock locked.
9838 int Client::create_and_open(int dirfd
, const char *relpath
, int flags
,
9839 const UserPerm
& perms
, mode_t mode
, int stripe_unit
,
9840 int stripe_count
, int object_size
, const char *data_pool
,
9841 std::string alternate_name
) {
9842 ceph_assert(ceph_mutex_is_locked(client_lock
));
9843 int cflags
= ceph_flags_sys2wire(flags
);
9844 tout(cct
) << cflags
<< std::endl
;
9848 #if defined(__linux__) && defined(O_PATH)
9849 /* When the O_PATH is being specified, others flags than O_DIRECTORY
9850 * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
9851 * in kernel (fs/open.c). */
9853 flags
&= O_DIRECTORY
| O_NOFOLLOW
| O_PATH
;
9856 filepath
path(relpath
);
9858 bool created
= false;
9859 /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
9860 bool followsym
= !((flags
& O_NOFOLLOW
) || ((flags
& O_CREAT
) && (flags
& O_EXCL
)));
9861 int mask
= ceph_caps_for_mode(ceph_flags_to_mode(cflags
));
9863 InodeRef dirinode
= nullptr;
9864 int r
= get_fd_inode(dirfd
, &dirinode
);
9869 r
= path_walk(path
, &in
, perms
, followsym
, mask
, dirinode
);
9870 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
9871 return -CEPHFS_EEXIST
;
9873 #if defined(__linux__) && defined(O_PATH)
9874 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
) && !(flags
& O_PATH
))
9876 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
))
9878 return -CEPHFS_ELOOP
;
9880 if (r
== -CEPHFS_ENOENT
&& (flags
& O_CREAT
)) {
9881 filepath dirpath
= path
;
9882 string dname
= dirpath
.last_dentry();
9883 dirpath
.pop_dentry();
9885 r
= path_walk(dirpath
, &dir
, perms
, true,
9886 cct
->_conf
->client_permissions
? CEPH_CAP_AUTH_SHARED
: 0, dirinode
);
9890 if (cct
->_conf
->client_permissions
) {
9891 r
= may_create(dir
.get(), perms
);
9895 r
= _create(dir
.get(), dname
.c_str(), flags
, mode
, &in
, &fh
, stripe_unit
,
9896 stripe_count
, object_size
, data_pool
, &created
, perms
,
9897 std::move(alternate_name
));
9903 // posix says we can only check permissions of existing files
9904 if (cct
->_conf
->client_permissions
) {
9905 r
= may_open(in
.get(), flags
, perms
);
9912 r
= _open(in
.get(), flags
, mode
, &fh
, perms
);
9914 // allocate a integer file descriptor
9917 ceph_assert(fd_map
.count(r
) == 0);
9925 int Client::open(const char *relpath
, int flags
, const UserPerm
& perms
,
9926 mode_t mode
, int stripe_unit
, int stripe_count
,
9927 int object_size
, const char *data_pool
, std::string alternate_name
)
9929 return openat(CEPHFS_AT_FDCWD
, relpath
, flags
, perms
, mode
, stripe_unit
,
9930 stripe_count
, object_size
, data_pool
, alternate_name
);
9933 int Client::openat(int dirfd
, const char *relpath
, int flags
, const UserPerm
& perms
,
9934 mode_t mode
, int stripe_unit
, int stripe_count
, int object_size
,
9935 const char *data_pool
, std::string alternate_name
) {
9936 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9937 if (!mref_reader
.is_state_satisfied()) {
9938 return -CEPHFS_ENOTCONN
;
9941 ldout(cct
, 3) << "openat enter(" << relpath
<< ")" << dendl
;
9942 tout(cct
) << dirfd
<< std::endl
;
9943 tout(cct
) << relpath
<< std::endl
;
9944 tout(cct
) << flags
<< std::endl
;
9945 tout(cct
) << mode
<< std::endl
;
9947 std::scoped_lock
locker(client_lock
);
9948 int r
= create_and_open(dirfd
, relpath
, flags
, perms
, mode
, stripe_unit
, stripe_count
,
9949 object_size
, data_pool
, alternate_name
);
9951 tout(cct
) << r
<< std::endl
;
9952 ldout(cct
, 3) << "openat exit(" << relpath
<< ")" << dendl
;
9956 int Client::lookup_hash(inodeno_t ino
, inodeno_t dirino
, const char *name
,
9957 const UserPerm
& perms
)
9959 ldout(cct
, 3) << __func__
<< " enter(" << ino
<< ", #" << dirino
<< "/" << name
<< ")" << dendl
;
9961 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9962 if (!mref_reader
.is_state_satisfied())
9963 return -CEPHFS_ENOTCONN
;
9965 std::scoped_lock
lock(client_lock
);
9966 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPHASH
);
9968 req
->set_filepath(path
);
9970 uint32_t h
= ceph_str_hash(CEPH_STR_HASH_RJENKINS
, name
, strlen(name
));
9972 sprintf(f
, "%u", h
);
9973 filepath
path2(dirino
);
9974 path2
.push_dentry(string(f
));
9975 req
->set_filepath2(path2
);
9977 int r
= make_request(req
, perms
, NULL
, NULL
,
9978 rand() % mdsmap
->get_num_in_mds());
9979 ldout(cct
, 3) << __func__
<< " exit(" << ino
<< ", #" << dirino
<< "/" << name
<< ") = " << r
<< dendl
;
9985 * Load inode into local cache.
9987 * If inode pointer is non-NULL, and take a reference on
9988 * the resulting Inode object in one operation, so that caller
9989 * can safely assume inode will still be there after return.
9991 int Client::_lookup_vino(vinodeno_t vino
, const UserPerm
& perms
, Inode
**inode
)
9993 ldout(cct
, 8) << __func__
<< " enter(" << vino
<< ")" << dendl
;
9995 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
9996 if (!mref_reader
.is_state_satisfied())
9997 return -CEPHFS_ENOTCONN
;
9999 if (is_reserved_vino(vino
))
10000 return -CEPHFS_ESTALE
;
10002 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPINO
);
10003 filepath
path(vino
.ino
);
10004 req
->set_filepath(path
);
10007 * The MDS expects either a "real" snapid here or 0. The special value
10008 * carveouts for the snapid are all at the end of the range so we can
10009 * just look for any snapid below this value.
10011 if (vino
.snapid
< CEPH_NOSNAP
)
10012 req
->head
.args
.lookupino
.snapid
= vino
.snapid
;
10014 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
10015 if (r
== 0 && inode
!= NULL
) {
10016 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
10017 ceph_assert(p
!= inode_map
.end());
10018 *inode
= p
->second
;
10021 ldout(cct
, 8) << __func__
<< " exit(" << vino
<< ") = " << r
<< dendl
;
10025 int Client::lookup_ino(inodeno_t ino
, const UserPerm
& perms
, Inode
**inode
)
10027 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
10028 std::scoped_lock
lock(client_lock
);
10029 return _lookup_vino(vino
, perms
, inode
);
10033 * Find the parent inode of `ino` and insert it into
10034 * our cache. Conditionally also set `parent` to a referenced
10035 * Inode* if caller provides non-NULL value.
10037 int Client::_lookup_parent(Inode
*ino
, const UserPerm
& perms
, Inode
**parent
)
10039 ldout(cct
, 8) << __func__
<< " enter(" << ino
->ino
<< ")" << dendl
;
10041 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
10042 filepath
path(ino
->ino
);
10043 req
->set_filepath(path
);
10046 int r
= make_request(req
, perms
, &target
, NULL
, rand() % mdsmap
->get_num_in_mds());
10047 // Give caller a reference to the parent ino if they provided a pointer.
10048 if (parent
!= NULL
) {
10050 *parent
= target
.get();
10052 ldout(cct
, 8) << __func__
<< " found parent " << (*parent
)->ino
<< dendl
;
10057 ldout(cct
, 8) << __func__
<< " exit(" << ino
->ino
<< ") = " << r
<< dendl
;
10062 * Populate the parent dentry for `ino`, provided it is
10063 * a child of `parent`.
10065 int Client::_lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
10067 ceph_assert(parent
->is_dir());
10068 ldout(cct
, 3) << __func__
<< " enter(" << ino
->ino
<< ")" << dendl
;
10070 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10071 if (!mref_reader
.is_state_satisfied())
10072 return -CEPHFS_ENOTCONN
;
10074 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
10075 req
->set_filepath2(filepath(parent
->ino
));
10076 req
->set_filepath(filepath(ino
->ino
));
10077 req
->set_inode(ino
);
10079 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
10080 ldout(cct
, 3) << __func__
<< " exit(" << ino
->ino
<< ") = " << r
<< dendl
;
10084 int Client::lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
10086 std::scoped_lock
lock(client_lock
);
10087 return _lookup_name(ino
, parent
, perms
);
10090 Fh
*Client::_create_fh(Inode
*in
, int flags
, int cmode
, const UserPerm
& perms
)
10093 Fh
*f
= new Fh(in
, flags
, cmode
, fd_gen
, perms
);
10095 ldout(cct
, 10) << __func__
<< " " << in
->ino
<< " mode " << cmode
<< dendl
;
10097 if (in
->snapid
!= CEPH_NOSNAP
) {
10098 in
->snap_cap_refs
++;
10099 ldout(cct
, 5) << "open success, fh is " << f
<< " combined IMMUTABLE SNAP caps "
10100 << ccap_string(in
->caps_issued()) << dendl
;
10103 const auto& conf
= cct
->_conf
;
10104 f
->readahead
.set_trigger_requests(1);
10105 f
->readahead
.set_min_readahead_size(conf
->client_readahead_min
);
10106 uint64_t max_readahead
= Readahead::NO_LIMIT
;
10107 if (conf
->client_readahead_max_bytes
) {
10108 max_readahead
= std::min(max_readahead
, (uint64_t)conf
->client_readahead_max_bytes
);
10110 if (conf
->client_readahead_max_periods
) {
10111 max_readahead
= std::min(max_readahead
, in
->layout
.get_period()*(uint64_t)conf
->client_readahead_max_periods
);
10113 f
->readahead
.set_max_readahead_size(max_readahead
);
10114 vector
<uint64_t> alignments
;
10115 alignments
.push_back(in
->layout
.get_period());
10116 alignments
.push_back(in
->layout
.stripe_unit
);
10117 f
->readahead
.set_alignments(alignments
);
10122 int Client::_release_fh(Fh
*f
)
10124 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
10125 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
10126 Inode
*in
= f
->inode
.get();
10127 ldout(cct
, 8) << __func__
<< " " << f
<< " mode " << f
->mode
<< " on " << *in
<< dendl
;
10129 in
->unset_deleg(f
);
10131 if (in
->snapid
== CEPH_NOSNAP
) {
10132 if (in
->put_open_ref(f
->mode
)) {
10133 _flush(in
, new C_Client_FlushComplete(this, in
));
10137 ceph_assert(in
->snap_cap_refs
> 0);
10138 in
->snap_cap_refs
--;
10141 _release_filelocks(f
);
10143 // Finally, read any async err (i.e. from flushes)
10144 int err
= f
->take_async_err();
10146 ldout(cct
, 1) << __func__
<< " " << f
<< " on inode " << *in
<< " caught async_err = "
10147 << cpp_strerror(err
) << dendl
;
10149 ldout(cct
, 10) << __func__
<< " " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
10157 void Client::_put_fh(Fh
*f
)
10159 int left
= f
->put();
10165 int Client::_open(Inode
*in
, int flags
, mode_t mode
, Fh
**fhp
,
10166 const UserPerm
& perms
)
10168 if (in
->snapid
!= CEPH_NOSNAP
&&
10169 (flags
& (O_WRONLY
| O_RDWR
| O_CREAT
| O_TRUNC
| O_APPEND
))) {
10170 return -CEPHFS_EROFS
;
10173 // use normalized flags to generate cmode
10174 int cflags
= ceph_flags_sys2wire(flags
);
10175 if (cct
->_conf
.get_val
<bool>("client_force_lazyio"))
10176 cflags
|= CEPH_O_LAZY
;
10178 int cmode
= ceph_flags_to_mode(cflags
);
10179 int want
= ceph_caps_for_mode(cmode
);
10182 in
->get_open_ref(cmode
); // make note of pending open, since it effects _wanted_ caps.
10184 if ((flags
& O_TRUNC
) == 0 && in
->caps_issued_mask(want
)) {
10186 check_caps(in
, CHECK_CAPS_NODELAY
);
10189 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
10191 in
->make_nosnap_relative_path(path
);
10192 req
->set_filepath(path
);
10193 req
->head
.args
.open
.flags
= cflags
& ~CEPH_O_CREAT
;
10194 req
->head
.args
.open
.mode
= mode
;
10195 req
->head
.args
.open
.pool
= -1;
10196 if (cct
->_conf
->client_debug_getattr_caps
)
10197 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
10199 req
->head
.args
.open
.mask
= 0;
10200 req
->head
.args
.open
.old_size
= in
->size
; // for O_TRUNC
10201 req
->set_inode(in
);
10202 result
= make_request(req
, perms
);
10205 * NFS expects that delegations will be broken on a conflicting open,
10206 * not just when there is actual conflicting access to the file. SMB leases
10207 * and oplocks also have similar semantics.
10209 * Ensure that clients that have delegations enabled will wait on minimal
10210 * caps during open, just to ensure that other clients holding delegations
10211 * return theirs first.
10213 if (deleg_timeout
&& result
== 0) {
10214 int need
= 0, have
;
10216 if (cmode
& CEPH_FILE_MODE_WR
)
10217 need
|= CEPH_CAP_FILE_WR
;
10218 if (cmode
& CEPH_FILE_MODE_RD
)
10219 need
|= CEPH_CAP_FILE_RD
;
10221 Fh
fh(in
, flags
, cmode
, fd_gen
, perms
);
10222 result
= get_caps(&fh
, need
, want
, &have
, -1);
10224 ldout(cct
, 8) << "Unable to get caps after open of inode " << *in
<<
10225 " . Denying open: " <<
10226 cpp_strerror(result
) << dendl
;
10228 put_cap_ref(in
, need
);
10236 *fhp
= _create_fh(in
, flags
, cmode
, perms
);
10238 in
->put_open_ref(cmode
);
10246 int Client::_renew_caps(Inode
*in
)
10248 int wanted
= in
->caps_file_wanted();
10249 if (in
->is_any_caps() &&
10250 ((wanted
& CEPH_CAP_ANY_WR
) == 0 || in
->auth_cap
)) {
10251 check_caps(in
, CHECK_CAPS_NODELAY
);
10256 if ((wanted
& CEPH_CAP_FILE_RD
) && (wanted
& CEPH_CAP_FILE_WR
))
10258 else if (wanted
& CEPH_CAP_FILE_RD
)
10260 else if (wanted
& CEPH_CAP_FILE_WR
)
10263 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
10265 in
->make_nosnap_relative_path(path
);
10266 req
->set_filepath(path
);
10267 req
->head
.args
.open
.flags
= flags
;
10268 req
->head
.args
.open
.pool
= -1;
10269 if (cct
->_conf
->client_debug_getattr_caps
)
10270 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
10272 req
->head
.args
.open
.mask
= 0;
10273 req
->set_inode(in
);
10275 // duplicate in case Cap goes away; not sure if that race is a concern?
10276 const UserPerm
*pperm
= in
->get_best_perms();
10280 int ret
= make_request(req
, perms
);
10284 int Client::_close(int fd
)
10286 ldout(cct
, 3) << "close enter(" << fd
<< ")" << dendl
;
10287 tout(cct
) << "close" << std::endl
;
10288 tout(cct
) << fd
<< std::endl
;
10290 Fh
*fh
= get_filehandle(fd
);
10292 return -CEPHFS_EBADF
;
10293 int err
= _release_fh(fh
);
10296 ldout(cct
, 3) << "close exit(" << fd
<< ")" << dendl
;
10300 int Client::close(int fd
) {
10301 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10302 if (!mref_reader
.is_state_satisfied())
10303 return -CEPHFS_ENOTCONN
;
10305 std::scoped_lock
lock(client_lock
);
10312 loff_t
Client::lseek(int fd
, loff_t offset
, int whence
)
10314 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10315 if (!mref_reader
.is_state_satisfied())
10316 return -CEPHFS_ENOTCONN
;
10318 tout(cct
) << "lseek" << std::endl
;
10319 tout(cct
) << fd
<< std::endl
;
10320 tout(cct
) << offset
<< std::endl
;
10321 tout(cct
) << whence
<< std::endl
;
10323 std::scoped_lock
lock(client_lock
);
10324 Fh
*f
= get_filehandle(fd
);
10326 return -CEPHFS_EBADF
;
10327 #if defined(__linux__) && defined(O_PATH)
10328 if (f
->flags
& O_PATH
)
10329 return -CEPHFS_EBADF
;
10331 return _lseek(f
, offset
, whence
);
10334 loff_t
Client::_lseek(Fh
*f
, loff_t offset
, int whence
)
10336 Inode
*in
= f
->inode
.get();
10337 bool whence_check
= false;
10342 whence_check
= true;
10347 whence_check
= true;
10353 whence_check
= true;
10358 if (whence_check
) {
10359 int r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
10370 pos
= f
->pos
+ offset
;
10374 pos
= in
->size
+ offset
;
10379 if (offset
< 0 || static_cast<uint64_t>(offset
) >= in
->size
)
10380 return -CEPHFS_ENXIO
;
10387 if (offset
< 0 || static_cast<uint64_t>(offset
) >= in
->size
)
10388 return -CEPHFS_ENXIO
;
10394 ldout(cct
, 1) << __func__
<< ": invalid whence value " << whence
<< dendl
;
10395 return -CEPHFS_EINVAL
;
10399 return -CEPHFS_EINVAL
;
10404 ldout(cct
, 8) << "_lseek(" << f
<< ", " << offset
<< ", " << whence
<< ") = " << f
->pos
<< dendl
;
10409 void Client::lock_fh_pos(Fh
*f
)
10411 ldout(cct
, 10) << __func__
<< " " << f
<< dendl
;
10413 if (f
->pos_locked
|| !f
->pos_waiters
.empty()) {
10414 ceph::condition_variable cond
;
10415 f
->pos_waiters
.push_back(&cond
);
10416 ldout(cct
, 10) << __func__
<< " BLOCKING on " << f
<< dendl
;
10417 std::unique_lock l
{client_lock
, std::adopt_lock
};
10418 cond
.wait(l
, [f
, me
=&cond
] {
10419 return !f
->pos_locked
&& f
->pos_waiters
.front() == me
;
10422 ldout(cct
, 10) << __func__
<< " UNBLOCKING on " << f
<< dendl
;
10423 ceph_assert(f
->pos_waiters
.front() == &cond
);
10424 f
->pos_waiters
.pop_front();
10427 f
->pos_locked
= true;
10430 void Client::unlock_fh_pos(Fh
*f
)
10432 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
10434 ldout(cct
, 10) << __func__
<< " " << f
<< dendl
;
10435 f
->pos_locked
= false;
10436 if (!f
->pos_waiters
.empty()) {
10437 // only wake up the oldest waiter
10438 auto cond
= f
->pos_waiters
.front();
10439 cond
->notify_one();
10443 int Client::uninline_data(Inode
*in
, Context
*onfinish
)
10445 if (!in
->inline_data
.length()) {
10446 onfinish
->complete(0);
10451 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (long long unsigned)in
->ino
);
10452 object_t oid
= oid_buf
;
10454 ObjectOperation create_ops
;
10455 create_ops
.create(false);
10457 objecter
->mutate(oid
,
10458 OSDMap::file_to_object_locator(in
->layout
),
10460 in
->snaprealm
->get_snap_context(),
10461 ceph::real_clock::now(),
10465 bufferlist inline_version_bl
;
10466 encode(in
->inline_version
, inline_version_bl
);
10468 ObjectOperation uninline_ops
;
10469 uninline_ops
.cmpxattr("inline_version",
10470 CEPH_OSD_CMPXATTR_OP_GT
,
10471 CEPH_OSD_CMPXATTR_MODE_U64
,
10472 inline_version_bl
);
10473 bufferlist inline_data
= in
->inline_data
;
10474 uninline_ops
.write(0, inline_data
, in
->truncate_size
, in
->truncate_seq
);
10475 uninline_ops
.setxattr("inline_version", stringify(in
->inline_version
));
10477 objecter
->mutate(oid
,
10478 OSDMap::file_to_object_locator(in
->layout
),
10480 in
->snaprealm
->get_snap_context(),
10481 ceph::real_clock::now(),
10490 // blocking osd interface
10492 int Client::read(int fd
, char *buf
, loff_t size
, loff_t offset
)
10494 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10495 if (!mref_reader
.is_state_satisfied())
10496 return -CEPHFS_ENOTCONN
;
10498 tout(cct
) << "read" << std::endl
;
10499 tout(cct
) << fd
<< std::endl
;
10500 tout(cct
) << size
<< std::endl
;
10501 tout(cct
) << offset
<< std::endl
;
10503 std::unique_lock
lock(client_lock
);
10504 Fh
*f
= get_filehandle(fd
);
10506 return -CEPHFS_EBADF
;
10507 #if defined(__linux__) && defined(O_PATH)
10508 if (f
->flags
& O_PATH
)
10509 return -CEPHFS_EBADF
;
10512 /* We can't return bytes written larger than INT_MAX, clamp size to that */
10513 size
= std::min(size
, (loff_t
)INT_MAX
);
10514 int r
= _read(f
, offset
, size
, &bl
);
10515 ldout(cct
, 3) << "read(" << fd
<< ", " << (void*)buf
<< ", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
10518 bl
.begin().copy(bl
.length(), buf
);
10524 int Client::preadv(int fd
, const struct iovec
*iov
, int iovcnt
, loff_t offset
)
10527 return -CEPHFS_EINVAL
;
10528 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, false);
10531 int64_t Client::_read(Fh
*f
, int64_t offset
, uint64_t size
, bufferlist
*bl
)
10533 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
10535 int want
, have
= 0;
10536 bool movepos
= false;
10538 const auto& conf
= cct
->_conf
;
10539 Inode
*in
= f
->inode
.get();
10541 utime_t start
= ceph_clock_now();
10543 if ((f
->mode
& CEPH_FILE_MODE_RD
) == 0)
10544 return -CEPHFS_EBADF
;
10545 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
10552 loff_t start_pos
= offset
;
10554 if (in
->inline_version
== 0) {
10555 auto r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
10560 ceph_assert(in
->inline_version
> 0);
10564 if (f
->mode
& CEPH_FILE_MODE_LAZY
)
10565 want
= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
;
10567 want
= CEPH_CAP_FILE_CACHE
;
10569 auto r
= get_caps(f
, CEPH_CAP_FILE_RD
, want
, &have
, -1);
10575 if (f
->flags
& O_DIRECT
)
10576 have
&= ~(CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
);
10578 if (in
->inline_version
< CEPH_INLINE_NONE
) {
10579 uint32_t len
= in
->inline_data
.length();
10580 uint64_t endoff
= offset
+ size
;
10581 if (endoff
> in
->size
)
10584 if (offset
< len
) {
10585 if (endoff
<= len
) {
10586 bl
->substr_of(in
->inline_data
, offset
, endoff
- offset
);
10588 bl
->substr_of(in
->inline_data
, offset
, len
- offset
);
10589 bl
->append_zero(endoff
- len
);
10591 rc
= endoff
- offset
;
10592 } else if ((uint64_t)offset
< endoff
) {
10593 bl
->append_zero(endoff
- offset
);
10594 rc
= endoff
- offset
;
10601 if (!conf
->client_debug_force_sync_read
&&
10603 (have
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
))) {
10605 if (f
->flags
& O_RSYNC
) {
10606 _flush_range(in
, offset
, size
);
10608 rc
= _read_async(f
, offset
, size
, bl
);
10612 if (f
->flags
& O_DIRECT
)
10613 _flush_range(in
, offset
, size
);
10615 bool checkeof
= false;
10616 rc
= _read_sync(f
, offset
, size
, bl
, &checkeof
);
10623 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
10627 auto r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
10634 // eof? short read.
10635 if ((uint64_t)offset
< in
->size
)
10641 ceph_assert(rc
>= 0);
10642 update_read_io_size(bl
->length());
10645 f
->pos
= start_pos
+ rc
;
10648 lat
= ceph_clock_now();
10652 update_io_stat_read(lat
);
10657 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
10665 Client::C_Readahead::C_Readahead(Client
*c
, Fh
*f
) :
10668 f
->readahead
.inc_pending();
10671 Client::C_Readahead::~C_Readahead() {
10672 f
->readahead
.dec_pending();
10673 client
->_put_fh(f
);
10676 void Client::C_Readahead::finish(int r
) {
10677 lgeneric_subdout(client
->cct
, client
, 20) << "client." << client
->get_nodeid() << " " << "C_Readahead on " << f
->inode
<< dendl
;
10678 client
->put_cap_ref(f
->inode
.get(), CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
10680 client
->update_read_io_size(r
);
10684 int Client::_read_async(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
)
10686 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
10688 const auto& conf
= cct
->_conf
;
10689 Inode
*in
= f
->inode
.get();
10691 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
10693 // trim read based on file size?
10694 if (off
>= in
->size
)
10698 if (off
+ len
> in
->size
) {
10699 len
= in
->size
- off
;
10702 ldout(cct
, 10) << " min_bytes=" << f
->readahead
.get_min_readahead_size()
10703 << " max_bytes=" << f
->readahead
.get_max_readahead_size()
10704 << " max_periods=" << conf
->client_readahead_max_periods
<< dendl
;
10706 // read (and possibly block)
10708 C_SaferCond
onfinish("Client::_read_async flock");
10709 r
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
10710 off
, len
, bl
, 0, &onfinish
);
10712 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
10713 client_lock
.unlock();
10714 r
= onfinish
.wait();
10715 client_lock
.lock();
10716 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
10717 update_read_io_size(bl
->length());
10720 if(f
->readahead
.get_min_readahead_size() > 0) {
10721 pair
<uint64_t, uint64_t> readahead_extent
= f
->readahead
.update(off
, len
, in
->size
);
10722 if (readahead_extent
.second
> 0) {
10723 ldout(cct
, 20) << "readahead " << readahead_extent
.first
<< "~" << readahead_extent
.second
10724 << " (caller wants " << off
<< "~" << len
<< ")" << dendl
;
10725 Context
*onfinish2
= new C_Readahead(this, f
);
10726 int r2
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
10727 readahead_extent
.first
, readahead_extent
.second
,
10728 NULL
, 0, onfinish2
);
10730 ldout(cct
, 20) << "readahead initiated, c " << onfinish2
<< dendl
;
10731 get_cap_ref(in
, CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
10733 ldout(cct
, 20) << "readahead was no-op, already cached" << dendl
;
10742 int Client::_read_sync(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
,
10745 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
10747 Inode
*in
= f
->inode
.get();
10748 uint64_t pos
= off
;
10752 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
10754 // 0 success, 1 continue and < 0 error happen.
10755 auto wait_and_copy
= [&](C_SaferCond
&onfinish
, bufferlist
&tbl
, int wanted
) {
10756 int r
= onfinish
.wait();
10758 // if we get ENOENT from OSD, assume 0 bytes returned
10759 if (r
== -CEPHFS_ENOENT
)
10764 if (tbl
.length()) {
10770 bl
->claim_append(tbl
);
10773 if (r
>= 0 && r
< wanted
) {
10774 if (pos
< in
->size
) {
10775 // zero up to known EOF
10776 int64_t some
= in
->size
- pos
;
10779 auto z
= buffer::ptr_node::create(some
);
10781 bl
->push_back(std::move(z
));
10796 C_SaferCond
onfinish("Client::_read_sync flock");
10800 filer
->read_trunc(in
->ino
, &in
->layout
, in
->snapid
,
10801 pos
, left
, &tbl
, 0,
10802 in
->truncate_size
, in
->truncate_seq
,
10804 client_lock
.unlock();
10805 int r
= wait_and_copy(onfinish
, tbl
, wanted
);
10806 client_lock
.lock();
10815 int Client::write(int fd
, const char *buf
, loff_t size
, loff_t offset
)
10817 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10818 if (!mref_reader
.is_state_satisfied())
10819 return -CEPHFS_ENOTCONN
;
10821 tout(cct
) << "write" << std::endl
;
10822 tout(cct
) << fd
<< std::endl
;
10823 tout(cct
) << size
<< std::endl
;
10824 tout(cct
) << offset
<< std::endl
;
10826 std::scoped_lock
lock(client_lock
);
10827 Fh
*fh
= get_filehandle(fd
);
10829 return -CEPHFS_EBADF
;
10830 #if defined(__linux__) && defined(O_PATH)
10831 if (fh
->flags
& O_PATH
)
10832 return -CEPHFS_EBADF
;
10834 /* We can't return bytes written larger than INT_MAX, clamp size to that */
10835 size
= std::min(size
, (loff_t
)INT_MAX
);
10836 int r
= _write(fh
, offset
, size
, buf
, NULL
, false);
10837 ldout(cct
, 3) << "write(" << fd
<< ", \"...\", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
10841 int Client::pwritev(int fd
, const struct iovec
*iov
, int iovcnt
, int64_t offset
)
10844 return -CEPHFS_EINVAL
;
10845 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, true);
10848 int64_t Client::_preadv_pwritev_locked(Fh
*fh
, const struct iovec
*iov
,
10849 unsigned iovcnt
, int64_t offset
,
10850 bool write
, bool clamp_to_int
)
10852 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
10854 #if defined(__linux__) && defined(O_PATH)
10855 if (fh
->flags
& O_PATH
)
10856 return -CEPHFS_EBADF
;
10858 loff_t totallen
= 0;
10859 for (unsigned i
= 0; i
< iovcnt
; i
++) {
10860 totallen
+= iov
[i
].iov_len
;
10864 * Some of the API functions take 64-bit size values, but only return
10865 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
10866 * we don't do I/Os larger than the values we can return.
10868 if (clamp_to_int
) {
10869 totallen
= std::min(totallen
, (loff_t
)INT_MAX
);
10872 int64_t w
= _write(fh
, offset
, totallen
, NULL
, iov
, iovcnt
);
10873 ldout(cct
, 3) << "pwritev(" << fh
<< ", \"...\", " << totallen
<< ", " << offset
<< ") = " << w
<< dendl
;
10877 int64_t r
= _read(fh
, offset
, totallen
, &bl
);
10878 ldout(cct
, 3) << "preadv(" << fh
<< ", " << offset
<< ") = " << r
<< dendl
;
10882 client_lock
.unlock();
10883 auto iter
= bl
.cbegin();
10884 for (unsigned j
= 0, resid
= r
; j
< iovcnt
&& resid
> 0; j
++) {
10886 * This piece of code aims to handle the case that bufferlist
10887 * does not have enough data to fill in the iov
10889 const auto round_size
= std::min
<unsigned>(resid
, iov
[j
].iov_len
);
10890 iter
.copy(round_size
, reinterpret_cast<char*>(iov
[j
].iov_base
));
10891 resid
-= round_size
;
10892 /* iter is self-updating */
10894 client_lock
.lock();
10899 int Client::_preadv_pwritev(int fd
, const struct iovec
*iov
, unsigned iovcnt
, int64_t offset
, bool write
)
10901 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
10902 if (!mref_reader
.is_state_satisfied())
10903 return -CEPHFS_ENOTCONN
;
10905 tout(cct
) << fd
<< std::endl
;
10906 tout(cct
) << offset
<< std::endl
;
10908 std::scoped_lock
cl(client_lock
);
10909 Fh
*fh
= get_filehandle(fd
);
10911 return -CEPHFS_EBADF
;
10912 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, offset
, write
, true);
10915 int64_t Client::_write(Fh
*f
, int64_t offset
, uint64_t size
, const char *buf
,
10916 const struct iovec
*iov
, int iovcnt
)
10918 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
10921 Inode
*in
= f
->inode
.get();
10923 if ( (uint64_t)(offset
+size
) > mdsmap
->get_max_filesize() && //exceeds config
10924 (uint64_t)(offset
+size
) > in
->size
) { //exceeds filesize
10925 return -CEPHFS_EFBIG
;
10927 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
10929 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
10930 return -CEPHFS_ENOSPC
;
10933 ceph_assert(in
->snapid
== CEPH_NOSNAP
);
10935 // was Fh opened as writeable?
10936 if ((f
->mode
& CEPH_FILE_MODE_WR
) == 0)
10937 return -CEPHFS_EBADF
;
10939 // use/adjust fd pos?
10943 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
10944 * change out from under us.
10946 if (f
->flags
& O_APPEND
) {
10947 auto r
= _lseek(f
, 0, SEEK_END
);
10954 fpos
= offset
+size
;
10959 uint64_t endoff
= offset
+ size
;
10960 if (endoff
> in
->size
&& is_quota_bytes_exceeded(in
, endoff
- in
->size
,
10962 return -CEPHFS_EDQUOT
;
10965 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
10967 ldout(cct
, 10) << "cur file size is " << in
->size
<< dendl
;
10970 utime_t start
= ceph_clock_now();
10972 if (in
->inline_version
== 0) {
10973 int r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
10976 ceph_assert(in
->inline_version
> 0);
10979 // copy into fresh buffer (since our write may be resub, async)
10983 bl
.append(buf
, size
);
10985 for (int i
= 0; i
< iovcnt
; i
++) {
10986 if (iov
[i
].iov_len
> 0) {
10987 bl
.append((const char *)iov
[i
].iov_base
, iov
[i
].iov_len
);
10993 uint64_t totalwritten
;
10995 if (f
->mode
& CEPH_FILE_MODE_LAZY
)
10996 want
= CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
;
10998 want
= CEPH_CAP_FILE_BUFFER
;
10999 int r
= get_caps(f
, CEPH_CAP_FILE_WR
|CEPH_CAP_AUTH_SHARED
, want
, &have
, endoff
);
11003 put_cap_ref(in
, CEPH_CAP_AUTH_SHARED
);
11005 r
= clear_suid_sgid(in
, f
->actor_perms
);
11007 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
11012 if (f
->flags
& O_DIRECT
)
11013 have
&= ~(CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
);
11015 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
11017 std::unique_ptr
<C_SaferCond
> onuninline
= nullptr;
11019 if (in
->inline_version
< CEPH_INLINE_NONE
) {
11020 if (endoff
> cct
->_conf
->client_max_inline_size
||
11021 endoff
> CEPH_INLINE_MAX_SIZE
||
11022 !(have
& CEPH_CAP_FILE_BUFFER
)) {
11023 onuninline
.reset(new C_SaferCond("Client::_write_uninline_data flock"));
11024 uninline_data(in
, onuninline
.get());
11026 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
11028 uint32_t len
= in
->inline_data
.length();
11031 in
->inline_data
.begin(endoff
).copy(len
- endoff
, bl
); // XXX
11034 in
->inline_data
.splice(offset
, len
- offset
);
11035 else if (offset
> len
)
11036 in
->inline_data
.append_zero(offset
- len
);
11038 in
->inline_data
.append(bl
);
11039 in
->inline_version
++;
11041 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
11047 if (cct
->_conf
->client_oc
&&
11048 (have
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
))) {
11049 // do buffered write
11050 if (!in
->oset
.dirty_or_tx
)
11051 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
11053 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
11055 // async, caching, non-blocking.
11056 r
= objectcacher
->file_write(&in
->oset
, &in
->layout
,
11057 in
->snaprealm
->get_snap_context(),
11058 offset
, size
, bl
, ceph::real_clock::now(),
11060 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
11065 // flush cached write if O_SYNC is set on file fh
11066 // O_DSYNC == O_SYNC on linux < 2.6.33
11067 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
11068 if ((f
->flags
& O_SYNC
) || (f
->flags
& O_DSYNC
)) {
11069 _flush_range(in
, offset
, size
);
11072 if (f
->flags
& O_DIRECT
)
11073 _flush_range(in
, offset
, size
);
11075 // simple, non-atomic sync write
11076 C_SaferCond
onfinish("Client::_write flock");
11077 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
11079 filer
->write_trunc(in
->ino
, &in
->layout
, in
->snaprealm
->get_snap_context(),
11080 offset
, size
, bl
, ceph::real_clock::now(), 0,
11081 in
->truncate_size
, in
->truncate_seq
,
11083 client_lock
.unlock();
11084 r
= onfinish
.wait();
11085 client_lock
.lock();
11086 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
11091 // if we get here, write was successful, update client metadata
11093 update_write_io_size(size
);
11095 lat
= ceph_clock_now();
11098 ++nr_write_request
;
11099 update_io_stat_write(lat
);
11106 totalwritten
= size
;
11107 r
= (int64_t)totalwritten
;
11110 if (totalwritten
+ offset
> in
->size
) {
11111 in
->size
= totalwritten
+ offset
;
11112 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
11114 if (is_quota_bytes_approaching(in
, f
->actor_perms
)) {
11115 check_caps(in
, CHECK_CAPS_NODELAY
);
11116 } else if (is_max_size_approaching(in
)) {
11120 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", extending file size" << dendl
;
11122 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", leaving file size at " << in
->size
<< dendl
;
11126 in
->mtime
= in
->ctime
= ceph_clock_now();
11128 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
11132 if (nullptr != onuninline
) {
11133 client_lock
.unlock();
11134 int uninline_ret
= onuninline
->wait();
11135 client_lock
.lock();
11137 if (uninline_ret
>= 0 || uninline_ret
== -CEPHFS_ECANCELED
) {
11138 in
->inline_data
.clear();
11139 in
->inline_version
= CEPH_INLINE_NONE
;
11140 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
11146 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
11150 int Client::_flush(Fh
*f
)
11152 Inode
*in
= f
->inode
.get();
11153 int err
= f
->take_async_err();
11155 ldout(cct
, 1) << __func__
<< ": " << f
<< " on inode " << *in
<< " caught async_err = "
11156 << cpp_strerror(err
) << dendl
;
11158 ldout(cct
, 10) << __func__
<< ": " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
11164 int Client::truncate(const char *relpath
, loff_t length
, const UserPerm
& perms
)
11166 struct ceph_statx stx
;
11167 stx
.stx_size
= length
;
11168 return setattrx(relpath
, &stx
, CEPH_SETATTR_SIZE
, perms
);
11171 int Client::ftruncate(int fd
, loff_t length
, const UserPerm
& perms
)
11173 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11174 if (!mref_reader
.is_state_satisfied())
11175 return -CEPHFS_ENOTCONN
;
11177 tout(cct
) << __func__
<< std::endl
;
11178 tout(cct
) << fd
<< std::endl
;
11179 tout(cct
) << length
<< std::endl
;
11181 std::scoped_lock
lock(client_lock
);
11182 Fh
*f
= get_filehandle(fd
);
11184 return -CEPHFS_EBADF
;
11185 #if defined(__linux__) && defined(O_PATH)
11186 if (f
->flags
& O_PATH
)
11187 return -CEPHFS_EBADF
;
11189 if ((f
->mode
& CEPH_FILE_MODE_WR
) == 0)
11190 return -CEPHFS_EBADF
;
11192 attr
.st_size
= length
;
11193 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_SIZE
, perms
);
11196 int Client::fsync(int fd
, bool syncdataonly
)
11198 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11199 if (!mref_reader
.is_state_satisfied())
11200 return -CEPHFS_ENOTCONN
;
11202 tout(cct
) << "fsync" << std::endl
;
11203 tout(cct
) << fd
<< std::endl
;
11204 tout(cct
) << syncdataonly
<< std::endl
;
11206 std::scoped_lock
lock(client_lock
);
11207 Fh
*f
= get_filehandle(fd
);
11209 return -CEPHFS_EBADF
;
11210 #if defined(__linux__) && defined(O_PATH)
11211 if (f
->flags
& O_PATH
)
11212 return -CEPHFS_EBADF
;
11214 int r
= _fsync(f
, syncdataonly
);
11216 // The IOs in this fsync were okay, but maybe something happened
11217 // in the background that we shoudl be reporting?
11218 r
= f
->take_async_err();
11219 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
11220 << ") = 0, async_err = " << r
<< dendl
;
11222 // Assume that an error we encountered during fsync, even reported
11223 // synchronously, would also have applied the error to the Fh, and we
11224 // should clear it here to avoid returning the same error again on next
11226 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
<< ") = "
11228 f
->take_async_err();
11233 int Client::_fsync(Inode
*in
, bool syncdataonly
)
11235 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
11238 std::unique_ptr
<C_SaferCond
> object_cacher_completion
= nullptr;
11239 ceph_tid_t flush_tid
= 0;
11242 utime_t start
= ceph_clock_now();
11244 ldout(cct
, 8) << "_fsync on " << *in
<< " " << (syncdataonly
? "(dataonly)":"(data+metadata)") << dendl
;
11246 if (cct
->_conf
->client_oc
) {
11247 object_cacher_completion
.reset(new C_SaferCond("Client::_fsync::lock"));
11248 tmp_ref
= in
; // take a reference; C_SaferCond doesn't and _flush won't either
11249 _flush(in
, object_cacher_completion
.get());
11250 ldout(cct
, 15) << "using return-valued form of _fsync" << dendl
;
11253 if (!syncdataonly
&& in
->dirty_caps
) {
11254 check_caps(in
, CHECK_CAPS_NODELAY
|CHECK_CAPS_SYNCHRONOUS
);
11255 if (in
->flushing_caps
)
11256 flush_tid
= last_flush_tid
;
11257 } else ldout(cct
, 10) << "no metadata needs to commit" << dendl
;
11259 if (!syncdataonly
&& !in
->unsafe_ops
.empty()) {
11260 flush_mdlog_sync(in
);
11262 MetaRequest
*req
= in
->unsafe_ops
.back();
11263 ldout(cct
, 15) << "waiting on unsafe requests, last tid " << req
->get_tid() << dendl
;
11266 wait_on_list(req
->waitfor_safe
);
11270 if (nullptr != object_cacher_completion
) { // wait on a real reply instead of guessing
11271 client_lock
.unlock();
11272 ldout(cct
, 15) << "waiting on data to flush" << dendl
;
11273 r
= object_cacher_completion
->wait();
11274 client_lock
.lock();
11275 ldout(cct
, 15) << "got " << r
<< " from flush writeback" << dendl
;
11277 // FIXME: this can starve
11278 while (in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] > 0) {
11279 ldout(cct
, 10) << "ino " << in
->ino
<< " has " << in
->cap_refs
[CEPH_CAP_FILE_BUFFER
]
11280 << " uncommitted, waiting" << dendl
;
11281 wait_on_list(in
->waitfor_commit
);
11287 wait_sync_caps(in
, flush_tid
);
11289 ldout(cct
, 10) << "ino " << in
->ino
<< " has no uncommitted writes" << dendl
;
11291 ldout(cct
, 8) << "ino " << in
->ino
<< " failed to commit to disk! "
11292 << cpp_strerror(-r
) << dendl
;
11295 lat
= ceph_clock_now();
11297 logger
->tinc(l_c_fsync
, lat
);
11302 int Client::_fsync(Fh
*f
, bool syncdataonly
)
11304 ldout(cct
, 8) << "_fsync(" << f
<< ", " << (syncdataonly
? "dataonly)":"data+metadata)") << dendl
;
11305 return _fsync(f
->inode
.get(), syncdataonly
);
11308 int Client::fstat(int fd
, struct stat
*stbuf
, const UserPerm
& perms
, int mask
)
11310 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11311 if (!mref_reader
.is_state_satisfied())
11312 return -CEPHFS_ENOTCONN
;
11314 tout(cct
) << "fstat mask " << hex
<< mask
<< dec
<< std::endl
;
11315 tout(cct
) << fd
<< std::endl
;
11317 std::scoped_lock
lock(client_lock
);
11318 Fh
*f
= get_filehandle(fd
);
11320 return -CEPHFS_EBADF
;
11321 int r
= _getattr(f
->inode
, mask
, perms
);
11324 fill_stat(f
->inode
, stbuf
, NULL
);
11325 ldout(cct
, 5) << "fstat(" << fd
<< ", " << stbuf
<< ") = " << r
<< dendl
;
11329 int Client::fstatx(int fd
, struct ceph_statx
*stx
, const UserPerm
& perms
,
11330 unsigned int want
, unsigned int flags
)
11332 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11333 if (!mref_reader
.is_state_satisfied())
11334 return -CEPHFS_ENOTCONN
;
11336 tout(cct
) << "fstatx flags " << hex
<< flags
<< " want " << want
<< dec
<< std::endl
;
11337 tout(cct
) << fd
<< std::endl
;
11339 std::scoped_lock
lock(client_lock
);
11340 Fh
*f
= get_filehandle(fd
);
11342 return -CEPHFS_EBADF
;
11344 unsigned mask
= statx_to_mask(flags
, want
);
11348 r
= _getattr(f
->inode
, mask
, perms
);
11350 ldout(cct
, 3) << "fstatx exit on error!" << dendl
;
11355 fill_statx(f
->inode
, mask
, stx
);
11356 ldout(cct
, 3) << "fstatx(" << fd
<< ", " << stx
<< ") = " << r
<< dendl
;
11360 int Client::statxat(int dirfd
, const char *relpath
,
11361 struct ceph_statx
*stx
, const UserPerm
& perms
,
11362 unsigned int want
, unsigned int flags
) {
11363 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11364 if (!mref_reader
.is_state_satisfied()) {
11365 return -CEPHFS_ENOTCONN
;
11368 tout(cct
) << __func__
<< " flags " << hex
<< flags
<< " want " << want
<< dec
<< std::endl
;
11369 tout(cct
) << dirfd
<< std::endl
;
11370 tout(cct
) << relpath
<< std::endl
;
11372 unsigned mask
= statx_to_mask(flags
, want
);
11375 std::scoped_lock
lock(client_lock
);
11376 int r
= get_fd_inode(dirfd
, &dirinode
);
11382 filepath
path(relpath
);
11383 r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
, dirinode
);
11387 r
= _getattr(in
, mask
, perms
);
11389 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
11393 fill_statx(in
, mask
, stx
);
11394 ldout(cct
, 3) << __func__
<< " dirfd" << dirfd
<< ", r= " << r
<< dendl
;
11398 // not written yet, but i want to link!
11400 int Client::chdir(const char *relpath
, std::string
&new_cwd
,
11401 const UserPerm
& perms
)
11403 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11404 if (!mref_reader
.is_state_satisfied())
11405 return -CEPHFS_ENOTCONN
;
11407 tout(cct
) << "chdir" << std::endl
;
11408 tout(cct
) << relpath
<< std::endl
;
11410 filepath
path(relpath
);
11413 std::scoped_lock
lock(client_lock
);
11414 int r
= path_walk(path
, &in
, perms
);
11418 if (!(in
.get()->is_dir()))
11419 return -CEPHFS_ENOTDIR
;
11423 ldout(cct
, 3) << "chdir(" << relpath
<< ") cwd now " << cwd
->ino
<< dendl
;
11425 _getcwd(new_cwd
, perms
);
11429 void Client::_getcwd(string
& dir
, const UserPerm
& perms
)
11432 ldout(cct
, 10) << __func__
<< " " << *cwd
<< dendl
;
11434 Inode
*in
= cwd
.get();
11435 while (in
!= root
.get()) {
11436 ceph_assert(in
->dentries
.size() < 2); // dirs can't be hard-linked
11438 // A cwd or ancester is unlinked
11439 if (in
->dentries
.empty()) {
11443 Dentry
*dn
= in
->get_first_parent();
11448 ldout(cct
, 10) << __func__
<< " looking up parent for " << *in
<< dendl
;
11449 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
11450 filepath
path(in
->ino
);
11451 req
->set_filepath(path
);
11452 req
->set_inode(in
);
11453 int res
= make_request(req
, perms
);
11462 path
.push_front_dentry(dn
->name
);
11463 in
= dn
->dir
->parent_inode
;
11466 dir
+= path
.get_path();
11469 void Client::getcwd(string
& dir
, const UserPerm
& perms
)
11471 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11472 if (!mref_reader
.is_state_satisfied())
11475 std::scoped_lock
l(client_lock
);
11477 _getcwd(dir
, perms
);
11480 int Client::statfs(const char *path
, struct statvfs
*stbuf
,
11481 const UserPerm
& perms
)
11483 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11484 if (!mref_reader
.is_state_satisfied())
11485 return -CEPHFS_ENOTCONN
;
11487 tout(cct
) << __func__
<< std::endl
;
11488 unsigned long int total_files_on_fs
;
11493 std::unique_lock
lock(client_lock
);
11494 const vector
<int64_t> &data_pools
= mdsmap
->get_data_pools();
11495 if (data_pools
.size() == 1) {
11496 objecter
->get_fs_stats(stats
, data_pools
[0], &cond
);
11498 objecter
->get_fs_stats(stats
, std::optional
<int64_t>(), &cond
);
11502 int rval
= cond
.wait();
11506 total_files_on_fs
= root
->rstat
.rfiles
+ root
->rstat
.rsubdirs
;
11509 ldout(cct
, 1) << "underlying call to statfs returned error: "
11510 << cpp_strerror(rval
)
11515 memset(stbuf
, 0, sizeof(*stbuf
));
11518 * we're going to set a block size of 4MB so we can represent larger
11519 * FSes without overflowing. Additionally convert the space
11520 * measurements from KB to bytes while making them in terms of
11521 * blocks. We use 4MB only because it is big enough, and because it
11522 * actually *is* the (ceph) default block size.
11524 const int CEPH_BLOCK_SHIFT
= 22;
11525 stbuf
->f_frsize
= 1 << CEPH_BLOCK_SHIFT
;
11526 stbuf
->f_bsize
= 1 << CEPH_BLOCK_SHIFT
;
11527 stbuf
->f_files
= total_files_on_fs
;
11528 stbuf
->f_ffree
= -1;
11529 stbuf
->f_favail
= -1;
11530 stbuf
->f_fsid
= -1; // ??
11531 stbuf
->f_flag
= 0; // ??
11532 stbuf
->f_namemax
= NAME_MAX
;
11534 // Usually quota_root will == root_ancestor, but if the mount root has no
11535 // quota but we can see a parent of it that does have a quota, we'll
11536 // respect that one instead.
11537 ceph_assert(root
!= nullptr);
11538 InodeRef quota_root
= root
->quota
.is_enabled(QUOTA_MAX_BYTES
) ? root
: get_quota_root(root
.get(), perms
, QUOTA_MAX_BYTES
);
11540 // get_quota_root should always give us something if client quotas are
11542 ceph_assert(cct
->_conf
.get_val
<bool>("client_quota") == false || quota_root
!= nullptr);
11544 /* If bytes quota is set on a directory and conf option "client quota df"
11545 * is also set, available space = quota limit - used space. Else,
11546 * available space = total space - used space. */
11547 if (quota_root
&& cct
->_conf
->client_quota_df
&& quota_root
->quota
.max_bytes
) {
11549 // Skip the getattr if any sessions are stale, as we don't want to
11550 // block `df` if this client has e.g. been evicted, or if the MDS cluster
11552 if (!_any_stale_sessions()) {
11553 int r
= _getattr(quota_root
, 0, perms
, true);
11555 // Ignore return value: error getting latest inode metadata is not a good
11556 // reason to break "df".
11557 lderr(cct
) << "Error in getattr on quota root 0x"
11558 << std::hex
<< quota_root
->ino
<< std::dec
11559 << " statfs result may be outdated" << dendl
;
11563 // Special case: if there is a size quota set on the Inode acting
11564 // as the root for this client mount, then report the quota status
11565 // as the filesystem statistics.
11566 const fsblkcnt_t total
= quota_root
->quota
.max_bytes
>> CEPH_BLOCK_SHIFT
;
11567 const fsblkcnt_t used
= quota_root
->rstat
.rbytes
>> CEPH_BLOCK_SHIFT
;
11568 // It is possible for a quota to be exceeded: arithmetic here must
11569 // handle case where used > total.
11570 const fsblkcnt_t free
= total
> used
? total
- used
: 0;
11572 stbuf
->f_blocks
= total
;
11573 stbuf
->f_bfree
= free
;
11574 stbuf
->f_bavail
= free
;
11576 // General case: report the cluster statistics returned from RADOS. Because
11577 // multiple pools may be used without one filesystem namespace via
11578 // layouts, this is the most correct thing we can do.
11579 stbuf
->f_blocks
= stats
.kb
>> (CEPH_BLOCK_SHIFT
- 10);
11580 stbuf
->f_bfree
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
11581 stbuf
->f_bavail
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
11587 int Client::_do_filelock(Inode
*in
, Fh
*fh
, int lock_type
, int op
, int sleep
,
11588 struct flock
*fl
, uint64_t owner
, bool removing
)
11590 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
11591 << (lock_type
== CEPH_LOCK_FCNTL
? " fcntl" : " flock")
11592 << " type " << fl
->l_type
<< " owner " << owner
11593 << " " << fl
->l_start
<< "~" << fl
->l_len
<< dendl
;
11595 if (in
->flags
& I_ERROR_FILELOCK
)
11596 return -CEPHFS_EIO
;
11599 if (F_RDLCK
== fl
->l_type
)
11600 lock_cmd
= CEPH_LOCK_SHARED
;
11601 else if (F_WRLCK
== fl
->l_type
)
11602 lock_cmd
= CEPH_LOCK_EXCL
;
11603 else if (F_UNLCK
== fl
->l_type
)
11604 lock_cmd
= CEPH_LOCK_UNLOCK
;
11606 return -CEPHFS_EIO
;
11608 if (op
!= CEPH_MDS_OP_SETFILELOCK
|| lock_cmd
== CEPH_LOCK_UNLOCK
)
11612 * Set the most significant bit, so that MDS knows the 'owner'
11613 * is sufficient to identify the owner of lock. (old code uses
11614 * both 'owner' and 'pid')
11616 owner
|= (1ULL << 63);
11618 MetaRequest
*req
= new MetaRequest(op
);
11620 in
->make_nosnap_relative_path(path
);
11621 req
->set_filepath(path
);
11622 req
->set_inode(in
);
11624 req
->head
.args
.filelock_change
.rule
= lock_type
;
11625 req
->head
.args
.filelock_change
.type
= lock_cmd
;
11626 req
->head
.args
.filelock_change
.owner
= owner
;
11627 req
->head
.args
.filelock_change
.pid
= fl
->l_pid
;
11628 req
->head
.args
.filelock_change
.start
= fl
->l_start
;
11629 req
->head
.args
.filelock_change
.length
= fl
->l_len
;
11630 req
->head
.args
.filelock_change
.wait
= sleep
;
11635 if (sleep
&& switch_interrupt_cb
) {
11636 // enable interrupt
11637 switch_interrupt_cb(callback_handle
, req
->get());
11638 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
11639 // disable interrupt
11640 switch_interrupt_cb(callback_handle
, NULL
);
11641 if (ret
== 0 && req
->aborted()) {
11642 // effect of this lock request has been revoked by the 'lock intr' request
11643 ret
= req
->get_abort_code();
11647 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
11651 if (op
== CEPH_MDS_OP_GETFILELOCK
) {
11652 ceph_filelock filelock
;
11653 auto p
= bl
.cbegin();
11654 decode(filelock
, p
);
11656 if (CEPH_LOCK_SHARED
== filelock
.type
)
11657 fl
->l_type
= F_RDLCK
;
11658 else if (CEPH_LOCK_EXCL
== filelock
.type
)
11659 fl
->l_type
= F_WRLCK
;
11661 fl
->l_type
= F_UNLCK
;
11663 fl
->l_whence
= SEEK_SET
;
11664 fl
->l_start
= filelock
.start
;
11665 fl
->l_len
= filelock
.length
;
11666 fl
->l_pid
= filelock
.pid
;
11667 } else if (op
== CEPH_MDS_OP_SETFILELOCK
) {
11668 ceph_lock_state_t
*lock_state
;
11669 if (lock_type
== CEPH_LOCK_FCNTL
) {
11670 if (!in
->fcntl_locks
)
11671 in
->fcntl_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
));
11672 lock_state
= in
->fcntl_locks
.get();
11673 } else if (lock_type
== CEPH_LOCK_FLOCK
) {
11674 if (!in
->flock_locks
)
11675 in
->flock_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
));
11676 lock_state
= in
->flock_locks
.get();
11679 return -CEPHFS_EINVAL
;
11681 _update_lock_state(fl
, owner
, lock_state
);
11684 if (lock_type
== CEPH_LOCK_FCNTL
) {
11685 if (!fh
->fcntl_locks
)
11686 fh
->fcntl_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
));
11687 lock_state
= fh
->fcntl_locks
.get();
11689 if (!fh
->flock_locks
)
11690 fh
->flock_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
));
11691 lock_state
= fh
->flock_locks
.get();
11693 _update_lock_state(fl
, owner
, lock_state
);
11701 int Client::_interrupt_filelock(MetaRequest
*req
)
11703 // Set abort code, but do not kick. The abort code prevents the request
11704 // from being re-sent.
11705 req
->abort(-CEPHFS_EINTR
);
11707 return 0; // haven't sent the request
11709 Inode
*in
= req
->inode();
11712 if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FLOCK
)
11713 lock_type
= CEPH_LOCK_FLOCK_INTR
;
11714 else if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FCNTL
)
11715 lock_type
= CEPH_LOCK_FCNTL_INTR
;
11718 return -CEPHFS_EINVAL
;
11721 MetaRequest
*intr_req
= new MetaRequest(CEPH_MDS_OP_SETFILELOCK
);
11723 in
->make_nosnap_relative_path(path
);
11724 intr_req
->set_filepath(path
);
11725 intr_req
->set_inode(in
);
11726 intr_req
->head
.args
.filelock_change
= req
->head
.args
.filelock_change
;
11727 intr_req
->head
.args
.filelock_change
.rule
= lock_type
;
11728 intr_req
->head
.args
.filelock_change
.type
= CEPH_LOCK_UNLOCK
;
11730 UserPerm
perms(req
->get_uid(), req
->get_gid());
11731 return make_request(intr_req
, perms
, NULL
, NULL
, -1);
11734 void Client::_encode_filelocks(Inode
*in
, bufferlist
& bl
)
11736 if (!in
->fcntl_locks
&& !in
->flock_locks
)
11739 unsigned nr_fcntl_locks
= in
->fcntl_locks
? in
->fcntl_locks
->held_locks
.size() : 0;
11740 encode(nr_fcntl_locks
, bl
);
11741 if (nr_fcntl_locks
) {
11742 auto &lock_state
= in
->fcntl_locks
;
11743 for(auto p
= lock_state
->held_locks
.begin();
11744 p
!= lock_state
->held_locks
.end();
11746 encode(p
->second
, bl
);
11749 unsigned nr_flock_locks
= in
->flock_locks
? in
->flock_locks
->held_locks
.size() : 0;
11750 encode(nr_flock_locks
, bl
);
11751 if (nr_flock_locks
) {
11752 auto &lock_state
= in
->flock_locks
;
11753 for(auto p
= lock_state
->held_locks
.begin();
11754 p
!= lock_state
->held_locks
.end();
11756 encode(p
->second
, bl
);
11759 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< ", " << nr_fcntl_locks
11760 << " fcntl locks, " << nr_flock_locks
<< " flock locks" << dendl
;
11763 void Client::_release_filelocks(Fh
*fh
)
11765 if (!fh
->fcntl_locks
&& !fh
->flock_locks
)
11768 Inode
*in
= fh
->inode
.get();
11769 ldout(cct
, 10) << __func__
<< " " << fh
<< " ino " << in
->ino
<< dendl
;
11771 list
<ceph_filelock
> activated_locks
;
11773 list
<pair
<int, ceph_filelock
> > to_release
;
11775 if (fh
->fcntl_locks
) {
11776 auto &lock_state
= fh
->fcntl_locks
;
11777 for(auto p
= lock_state
->held_locks
.begin(); p
!= lock_state
->held_locks
.end(); ) {
11779 if (in
->flags
& I_ERROR_FILELOCK
) {
11780 lock_state
->remove_lock(q
->second
, activated_locks
);
11782 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FCNTL
, q
->second
));
11785 lock_state
.reset();
11787 if (fh
->flock_locks
) {
11788 auto &lock_state
= fh
->flock_locks
;
11789 for(auto p
= lock_state
->held_locks
.begin(); p
!= lock_state
->held_locks
.end(); ) {
11791 if (in
->flags
& I_ERROR_FILELOCK
) {
11792 lock_state
->remove_lock(q
->second
, activated_locks
);
11794 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FLOCK
, q
->second
));
11797 lock_state
.reset();
11800 if ((in
->flags
& I_ERROR_FILELOCK
) && !in
->has_any_filelocks())
11801 in
->flags
&= ~I_ERROR_FILELOCK
;
11803 if (to_release
.empty())
11807 memset(&fl
, 0, sizeof(fl
));
11808 fl
.l_whence
= SEEK_SET
;
11809 fl
.l_type
= F_UNLCK
;
11811 for (list
<pair
<int, ceph_filelock
> >::iterator p
= to_release
.begin();
11812 p
!= to_release
.end();
11814 fl
.l_start
= p
->second
.start
;
11815 fl
.l_len
= p
->second
.length
;
11816 fl
.l_pid
= p
->second
.pid
;
11817 _do_filelock(in
, fh
, p
->first
, CEPH_MDS_OP_SETFILELOCK
, 0, &fl
,
11818 p
->second
.owner
, true);
11822 void Client::_update_lock_state(struct flock
*fl
, uint64_t owner
,
11823 ceph_lock_state_t
*lock_state
)
11826 if (F_RDLCK
== fl
->l_type
)
11827 lock_cmd
= CEPH_LOCK_SHARED
;
11828 else if (F_WRLCK
== fl
->l_type
)
11829 lock_cmd
= CEPH_LOCK_EXCL
;
11831 lock_cmd
= CEPH_LOCK_UNLOCK
;;
11833 ceph_filelock filelock
;
11834 filelock
.start
= fl
->l_start
;
11835 filelock
.length
= fl
->l_len
;
11836 filelock
.client
= 0;
11837 // see comment in _do_filelock()
11838 filelock
.owner
= owner
| (1ULL << 63);
11839 filelock
.pid
= fl
->l_pid
;
11840 filelock
.type
= lock_cmd
;
11842 if (filelock
.type
== CEPH_LOCK_UNLOCK
) {
11843 list
<ceph_filelock
> activated_locks
;
11844 lock_state
->remove_lock(filelock
, activated_locks
);
11846 bool r
= lock_state
->add_lock(filelock
, false, false, NULL
);
11851 int Client::_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
11853 Inode
*in
= fh
->inode
.get();
11854 ldout(cct
, 10) << "_getlk " << fh
<< " ino " << in
->ino
<< dendl
;
11855 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_GETFILELOCK
, 0, fl
, owner
);
11859 int Client::_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
11861 Inode
*in
= fh
->inode
.get();
11862 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< dendl
;
11863 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_SETFILELOCK
, sleep
, fl
, owner
);
11864 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
11868 int Client::_flock(Fh
*fh
, int cmd
, uint64_t owner
)
11870 Inode
*in
= fh
->inode
.get();
11871 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< dendl
;
11873 int sleep
= !(cmd
& LOCK_NB
);
11888 return -CEPHFS_EINVAL
;
11892 memset(&fl
, 0, sizeof(fl
));
11894 fl
.l_whence
= SEEK_SET
;
11896 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FLOCK
, CEPH_MDS_OP_SETFILELOCK
, sleep
, &fl
, owner
);
11897 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
11901 int Client::get_snap_info(const char *path
, const UserPerm
&perms
, SnapInfo
*snap_info
) {
11902 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
11903 if (!mref_reader
.is_state_satisfied()) {
11904 return -CEPHFS_ENOTCONN
;
11907 std::scoped_lock
lock(client_lock
);
11909 int r
= Client::path_walk(path
, &in
, perms
, true);
11914 if (in
->snapid
== CEPH_NOSNAP
) {
11915 return -CEPHFS_EINVAL
;
11918 snap_info
->id
= in
->snapid
;
11919 snap_info
->metadata
= in
->snap_metadata
;
11923 int Client::ll_statfs(Inode
*in
, struct statvfs
*stbuf
, const UserPerm
& perms
)
11925 /* Since the only thing this does is wrap a call to statfs, and
11926 statfs takes a lock, it doesn't seem we have a need to split it
11928 return statfs(0, stbuf
, perms
);
11931 void Client::_ll_register_callbacks(struct ceph_client_callback_args
*args
)
11936 ldout(cct
, 10) << __func__
<< " cb " << args
->handle
11937 << " invalidate_ino_cb " << args
->ino_cb
11938 << " invalidate_dentry_cb " << args
->dentry_cb
11939 << " switch_interrupt_cb " << args
->switch_intr_cb
11940 << " remount_cb " << args
->remount_cb
11942 callback_handle
= args
->handle
;
11943 if (args
->ino_cb
) {
11944 ino_invalidate_cb
= args
->ino_cb
;
11945 async_ino_invalidator
.start();
11947 if (args
->dentry_cb
) {
11948 dentry_invalidate_cb
= args
->dentry_cb
;
11949 async_dentry_invalidator
.start();
11951 if (args
->switch_intr_cb
) {
11952 switch_interrupt_cb
= args
->switch_intr_cb
;
11953 interrupt_finisher
.start();
11955 if (args
->remount_cb
) {
11956 remount_cb
= args
->remount_cb
;
11957 remount_finisher
.start();
11959 if (args
->ino_release_cb
) {
11960 ino_release_cb
= args
->ino_release_cb
;
11961 async_ino_releasor
.start();
11963 if (args
->umask_cb
)
11964 umask_cb
= args
->umask_cb
;
11967 // This is deprecated, use ll_register_callbacks2() instead.
11968 void Client::ll_register_callbacks(struct ceph_client_callback_args
*args
)
11970 ceph_assert(!is_mounting() && !is_mounted() && !is_unmounting());
11972 _ll_register_callbacks(args
);
11975 int Client::ll_register_callbacks2(struct ceph_client_callback_args
*args
)
11977 if (is_mounting() || is_mounted() || is_unmounting())
11978 return -CEPHFS_EBUSY
;
11980 _ll_register_callbacks(args
);
11984 std::pair
<int, bool> Client::test_dentry_handling(bool can_invalidate
)
11986 std::pair
<int, bool> r(0, false);
11988 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
11989 if (!iref_reader
.is_state_satisfied())
11990 return std::make_pair(-CEPHFS_ENOTCONN
, false);
11992 can_invalidate_dentries
= can_invalidate
;
11995 * Force to use the old and slow method to invalidate the dcache
11996 * if the euid is non-root, or the remount may fail with return
11999 uid_t euid
= geteuid();
12000 ldout(cct
, 10) << "euid: " << euid
<< dendl
;
12002 can_invalidate_dentries
= true;
12005 if (can_invalidate_dentries
) {
12006 ceph_assert(dentry_invalidate_cb
);
12007 ldout(cct
, 1) << "using dentry_invalidate_cb" << dendl
;
12009 ceph_assert(remount_cb
);
12010 ldout(cct
, 1) << "using remount_cb" << dendl
;
12011 r
= _do_remount(false);
12017 int Client::_sync_fs()
12019 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
12021 ldout(cct
, 10) << __func__
<< dendl
;
12024 std::unique_ptr
<C_SaferCond
> cond
= nullptr;
12025 if (cct
->_conf
->client_oc
) {
12026 cond
.reset(new C_SaferCond("Client::_sync_fs:lock"));
12027 objectcacher
->flush_all(cond
.get());
12032 ceph_tid_t flush_tid
= last_flush_tid
;
12034 // flush the mdlog before waiting for unsafe requests.
12035 flush_mdlog_sync();
12037 // wait for unsafe mds requests
12038 wait_unsafe_requests();
12040 wait_sync_caps(flush_tid
);
12042 if (nullptr != cond
) {
12043 client_lock
.unlock();
12044 ldout(cct
, 15) << __func__
<< " waiting on data to flush" << dendl
;
12046 ldout(cct
, 15) << __func__
<< " flush finished" << dendl
;
12047 client_lock
.lock();
12053 int Client::sync_fs()
12055 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12056 if (!mref_reader
.is_state_satisfied())
12057 return -CEPHFS_ENOTCONN
;
12059 std::scoped_lock
l(client_lock
);
12064 int64_t Client::drop_caches()
12066 std::scoped_lock
l(client_lock
);
12067 return objectcacher
->release_all();
12070 int Client::_lazyio(Fh
*fh
, int enable
)
12072 Inode
*in
= fh
->inode
.get();
12073 ldout(cct
, 20) << __func__
<< " " << *in
<< " " << !!enable
<< dendl
;
12075 if (!!(fh
->mode
& CEPH_FILE_MODE_LAZY
) == !!enable
)
12078 int orig_mode
= fh
->mode
;
12080 fh
->mode
|= CEPH_FILE_MODE_LAZY
;
12081 in
->get_open_ref(fh
->mode
);
12082 in
->put_open_ref(orig_mode
);
12083 check_caps(in
, CHECK_CAPS_NODELAY
);
12085 fh
->mode
&= ~CEPH_FILE_MODE_LAZY
;
12086 in
->get_open_ref(fh
->mode
);
12087 in
->put_open_ref(orig_mode
);
12094 int Client::lazyio(int fd
, int enable
)
12096 std::scoped_lock
l(client_lock
);
12097 Fh
*f
= get_filehandle(fd
);
12099 return -CEPHFS_EBADF
;
12101 return _lazyio(f
, enable
);
12104 int Client::ll_lazyio(Fh
*fh
, int enable
)
12106 ldout(cct
, 3) << __func__
<< " " << fh
<< " " << fh
->inode
->ino
<< " " << !!enable
<< dendl
;
12107 tout(cct
) << __func__
<< std::endl
;
12109 std::scoped_lock
lock(client_lock
);
12110 return _lazyio(fh
, enable
);
12113 int Client::lazyio_propagate(int fd
, loff_t offset
, size_t count
)
12115 std::scoped_lock
l(client_lock
);
12116 ldout(cct
, 3) << "op: client->lazyio_propagate(" << fd
12117 << ", " << offset
<< ", " << count
<< ")" << dendl
;
12119 Fh
*f
= get_filehandle(fd
);
12121 return -CEPHFS_EBADF
;
12129 int Client::lazyio_synchronize(int fd
, loff_t offset
, size_t count
)
12131 std::scoped_lock
l(client_lock
);
12132 ldout(cct
, 3) << "op: client->lazyio_synchronize(" << fd
12133 << ", " << offset
<< ", " << count
<< ")" << dendl
;
12135 Fh
*f
= get_filehandle(fd
);
12137 return -CEPHFS_EBADF
;
12138 Inode
*in
= f
->inode
.get();
12141 if (_release(in
)) {
12142 int r
=_getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
12150 // =============================
12153 int Client::mksnap(const char *relpath
, const char *name
, const UserPerm
& perm
,
12154 mode_t mode
, const std::map
<std::string
, std::string
> &metadata
)
12156 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12157 if (!mref_reader
.is_state_satisfied())
12158 return -CEPHFS_ENOTCONN
;
12160 std::scoped_lock
l(client_lock
);
12162 filepath
path(relpath
);
12164 int r
= path_walk(path
, &in
, perm
);
12167 if (cct
->_conf
->client_permissions
) {
12168 r
= may_create(in
.get(), perm
);
12172 Inode
*snapdir
= open_snapdir(in
.get());
12173 return _mkdir(snapdir
, name
, mode
, perm
, nullptr, metadata
);
12176 int Client::rmsnap(const char *relpath
, const char *name
, const UserPerm
& perms
, bool check_perms
)
12178 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12179 if (!mref_reader
.is_state_satisfied())
12180 return -CEPHFS_ENOTCONN
;
12182 std::scoped_lock
l(client_lock
);
12184 filepath
path(relpath
);
12186 int r
= path_walk(path
, &in
, perms
);
12189 Inode
*snapdir
= open_snapdir(in
.get());
12190 if (cct
->_conf
->client_permissions
) {
12191 r
= may_delete(snapdir
, check_perms
? name
: NULL
, perms
);
12195 return _rmdir(snapdir
, name
, perms
);
12198 // =============================
12201 int Client::get_caps_issued(int fd
)
12203 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12204 if (!mref_reader
.is_state_satisfied())
12205 return -CEPHFS_ENOTCONN
;
12207 std::scoped_lock
lock(client_lock
);
12209 Fh
*f
= get_filehandle(fd
);
12211 return -CEPHFS_EBADF
;
12213 return f
->inode
->caps_issued();
12216 int Client::get_caps_issued(const char *path
, const UserPerm
& perms
)
12218 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12219 if (!mref_reader
.is_state_satisfied())
12220 return -CEPHFS_ENOTCONN
;
12222 std::scoped_lock
lock(client_lock
);
12226 int r
= path_walk(p
, &in
, perms
, true);
12229 return in
->caps_issued();
12232 // =========================================
12235 void Client::refresh_snapdir_attrs(Inode
*in
, Inode
*diri
) {
12236 ldout(cct
, 10) << __func__
<< ": snapdir inode=" << *in
12237 << ", inode=" << *diri
<< dendl
;
12238 in
->ino
= diri
->ino
;
12239 in
->snapid
= CEPH_SNAPDIR
;
12240 in
->mode
= diri
->mode
;
12241 in
->uid
= diri
->uid
;
12242 in
->gid
= diri
->gid
;
12244 in
->mtime
= diri
->snaprealm
->last_modified
;
12245 in
->ctime
= in
->mtime
;
12246 in
->change_attr
= diri
->snaprealm
->change_attr
;
12247 in
->btime
= diri
->btime
;
12248 in
->atime
= diri
->atime
;
12249 in
->size
= diri
->size
;
12251 in
->dirfragtree
.clear();
12252 in
->snapdir_parent
= diri
;
12253 // copy posix acls to snapshotted inode
12254 in
->xattrs
.clear();
12255 for (auto &[xattr_key
, xattr_value
] : diri
->xattrs
) {
12256 if (xattr_key
.rfind("system.", 0) == 0) {
12257 in
->xattrs
[xattr_key
] = xattr_value
;
12262 Inode
*Client::open_snapdir(Inode
*diri
)
12265 vinodeno_t
vino(diri
->ino
, CEPH_SNAPDIR
);
12266 if (!inode_map
.count(vino
)) {
12267 in
= new Inode(this, vino
, &diri
->layout
);
12268 refresh_snapdir_attrs(in
, diri
);
12269 diri
->flags
|= I_SNAPDIR_OPEN
;
12270 inode_map
[vino
] = in
;
12271 if (use_faked_inos())
12272 _assign_faked_ino(in
);
12273 ldout(cct
, 10) << "open_snapdir created snapshot inode " << *in
<< dendl
;
12275 in
= inode_map
[vino
];
12276 ldout(cct
, 10) << "open_snapdir had snapshot inode " << *in
<< dendl
;
12281 int Client::ll_lookup(Inode
*parent
, const char *name
, struct stat
*attr
,
12282 Inode
**out
, const UserPerm
& perms
)
12284 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12285 if (!mref_reader
.is_state_satisfied())
12286 return -CEPHFS_ENOTCONN
;
12288 vinodeno_t vparent
= _get_vino(parent
);
12289 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
<< dendl
;
12290 tout(cct
) << __func__
<< std::endl
;
12291 tout(cct
) << name
<< std::endl
;
12293 std::scoped_lock
lock(client_lock
);
12296 if (!fuse_default_permissions
) {
12297 if (strcmp(name
, ".") && strcmp(name
, "..")) {
12298 r
= may_lookup(parent
, perms
);
12304 string
dname(name
);
12307 r
= _lookup(parent
, dname
, CEPH_STAT_CAP_INODE_ALL
, &in
, perms
);
12314 fill_stat(in
, attr
);
12318 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
12319 << " -> " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
12320 tout(cct
) << attr
->st_ino
<< std::endl
;
12325 int Client::ll_lookup_vino(
12327 const UserPerm
& perms
,
12330 ceph_assert(inode
!= NULL
);
12331 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12332 if (!mref_reader
.is_state_satisfied())
12333 return -CEPHFS_ENOTCONN
;
12335 if (is_reserved_vino(vino
))
12336 return -CEPHFS_ESTALE
;
12338 std::scoped_lock
lock(client_lock
);
12339 ldout(cct
, 3) << __func__
<< " " << vino
<< dendl
;
12341 // Check the cache first
12342 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
12343 if (p
!= inode_map
.end()) {
12344 *inode
= p
->second
;
12349 uint64_t snapid
= vino
.snapid
;
12351 // for snapdir, find the non-snapped dir inode
12352 if (snapid
== CEPH_SNAPDIR
)
12353 vino
.snapid
= CEPH_NOSNAP
;
12355 int r
= _lookup_vino(vino
, perms
, inode
);
12358 ceph_assert(*inode
!= NULL
);
12360 if (snapid
== CEPH_SNAPDIR
) {
12361 Inode
*tmp
= *inode
;
12363 // open the snapdir and put the inode ref
12364 *inode
= open_snapdir(tmp
);
12365 _ll_forget(tmp
, 1);
12371 int Client::ll_lookup_inode(
12372 struct inodeno_t ino
,
12373 const UserPerm
& perms
,
12376 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
12377 return ll_lookup_vino(vino
, perms
, inode
);
12380 int Client::ll_lookupx(Inode
*parent
, const char *name
, Inode
**out
,
12381 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
12382 const UserPerm
& perms
)
12384 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12385 if (!mref_reader
.is_state_satisfied())
12386 return -CEPHFS_ENOTCONN
;
12388 vinodeno_t vparent
= _get_vino(parent
);
12389 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
<< dendl
;
12390 tout(cct
) << "ll_lookupx" << std::endl
;
12391 tout(cct
) << name
<< std::endl
;
12393 std::scoped_lock
lock(client_lock
);
12396 if (!fuse_default_permissions
) {
12397 r
= may_lookup(parent
, perms
);
12402 string
dname(name
);
12405 unsigned mask
= statx_to_mask(flags
, want
);
12406 r
= _lookup(parent
, dname
, mask
, &in
, perms
);
12412 fill_statx(in
, mask
, stx
);
12416 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
12417 << " -> " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12418 tout(cct
) << stx
->stx_ino
<< std::endl
;
12423 int Client::ll_walk(const char* name
, Inode
**out
, struct ceph_statx
*stx
,
12424 unsigned int want
, unsigned int flags
, const UserPerm
& perms
)
12426 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12427 if (!mref_reader
.is_state_satisfied())
12428 return -CEPHFS_ENOTCONN
;
12430 filepath
fp(name
, 0);
12433 unsigned mask
= statx_to_mask(flags
, want
);
12435 ldout(cct
, 3) << __func__
<< " " << name
<< dendl
;
12436 tout(cct
) << __func__
<< std::endl
;
12437 tout(cct
) << name
<< std::endl
;
12439 std::scoped_lock
lock(client_lock
);
12440 rc
= path_walk(fp
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
);
12442 /* zero out mask, just in case... */
12449 fill_statx(in
, mask
, stx
);
12456 void Client::_ll_get(Inode
*in
)
12458 if (in
->ll_ref
== 0) {
12460 if (in
->is_dir() && !in
->dentries
.empty()) {
12461 ceph_assert(in
->dentries
.size() == 1); // dirs can't be hard-linked
12462 in
->get_first_parent()->get(); // pin dentry
12464 if (in
->snapid
!= CEPH_NOSNAP
)
12465 ll_snap_ref
[in
->snapid
]++;
12468 ldout(cct
, 20) << __func__
<< " " << in
<< " " << in
->ino
<< " -> " << in
->ll_ref
<< dendl
;
12471 int Client::_ll_put(Inode
*in
, uint64_t num
)
12474 ldout(cct
, 20) << __func__
<< " " << in
<< " " << in
->ino
<< " " << num
<< " -> " << in
->ll_ref
<< dendl
;
12475 if (in
->ll_ref
== 0) {
12476 if (in
->is_dir() && !in
->dentries
.empty()) {
12477 ceph_assert(in
->dentries
.size() == 1); // dirs can't be hard-linked
12478 in
->get_first_parent()->put(); // unpin dentry
12480 if (in
->snapid
!= CEPH_NOSNAP
) {
12481 auto p
= ll_snap_ref
.find(in
->snapid
);
12482 ceph_assert(p
!= ll_snap_ref
.end());
12483 ceph_assert(p
->second
> 0);
12484 if (--p
->second
== 0)
12485 ll_snap_ref
.erase(p
);
12494 void Client::_ll_drop_pins()
12496 ldout(cct
, 10) << __func__
<< dendl
;
12497 std::set
<InodeRef
> to_be_put
; //this set will be deconstructed item by item when exit
12498 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator next
;
12499 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
12500 it
!= inode_map
.end();
12502 Inode
*in
= it
->second
;
12506 to_be_put
.insert(in
);
12507 _ll_put(in
, in
->ll_ref
);
12512 bool Client::_ll_forget(Inode
*in
, uint64_t count
)
12514 inodeno_t ino
= in
->ino
;
12516 ldout(cct
, 8) << __func__
<< " " << ino
<< " " << count
<< dendl
;
12517 tout(cct
) << __func__
<< std::endl
;
12518 tout(cct
) << ino
.val
<< std::endl
;
12519 tout(cct
) << count
<< std::endl
;
12521 // Ignore forget if we're no longer mounted
12522 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12523 if (!mref_reader
.is_state_satisfied())
12526 if (ino
== 1) return true; // ignore forget on root.
12529 if (in
->ll_ref
< count
) {
12530 ldout(cct
, 1) << "WARNING: ll_forget on " << ino
<< " " << count
12531 << ", which only has ll_ref=" << in
->ll_ref
<< dendl
;
12532 _ll_put(in
, in
->ll_ref
);
12535 if (_ll_put(in
, count
) == 0)
12542 bool Client::ll_forget(Inode
*in
, uint64_t count
)
12544 std::scoped_lock
lock(client_lock
);
12545 return _ll_forget(in
, count
);
12548 bool Client::ll_put(Inode
*in
)
12550 /* ll_forget already takes the lock */
12551 return ll_forget(in
, 1);
12554 int Client::ll_get_snap_ref(snapid_t snap
)
12556 std::scoped_lock
lock(client_lock
);
12557 auto p
= ll_snap_ref
.find(snap
);
12558 if (p
!= ll_snap_ref
.end())
12563 snapid_t
Client::ll_get_snapid(Inode
*in
)
12565 std::scoped_lock
lock(client_lock
);
12569 Inode
*Client::ll_get_inode(ino_t ino
)
12571 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12572 if (!mref_reader
.is_state_satisfied())
12575 std::scoped_lock
lock(client_lock
);
12577 vinodeno_t vino
= _map_faked_ino(ino
);
12578 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
12579 if (p
== inode_map
.end())
12581 Inode
*in
= p
->second
;
12586 Inode
*Client::ll_get_inode(vinodeno_t vino
)
12588 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12589 if (!mref_reader
.is_state_satisfied())
12592 if (is_reserved_vino(vino
))
12595 std::scoped_lock
lock(client_lock
);
12597 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
12598 if (p
== inode_map
.end())
12600 Inode
*in
= p
->second
;
12605 int Client::_ll_getattr(Inode
*in
, int caps
, const UserPerm
& perms
)
12607 vinodeno_t vino
= _get_vino(in
);
12609 ldout(cct
, 8) << __func__
<< " " << vino
<< dendl
;
12610 tout(cct
) << __func__
<< std::endl
;
12611 tout(cct
) << vino
.ino
.val
<< std::endl
;
12613 if (vino
.snapid
< CEPH_NOSNAP
)
12616 return _getattr(in
, caps
, perms
);
12619 int Client::ll_getattr(Inode
*in
, struct stat
*attr
, const UserPerm
& perms
)
12621 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12622 if (!mref_reader
.is_state_satisfied())
12623 return -CEPHFS_ENOTCONN
;
12625 std::scoped_lock
lock(client_lock
);
12627 int res
= _ll_getattr(in
, CEPH_STAT_CAP_INODE_ALL
, perms
);
12630 fill_stat(in
, attr
);
12631 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
12635 int Client::ll_getattrx(Inode
*in
, struct ceph_statx
*stx
, unsigned int want
,
12636 unsigned int flags
, const UserPerm
& perms
)
12638 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12639 if (!mref_reader
.is_state_satisfied())
12640 return -CEPHFS_ENOTCONN
;
12642 std::scoped_lock
lock(client_lock
);
12645 unsigned mask
= statx_to_mask(flags
, want
);
12647 if (mask
&& !in
->caps_issued_mask(mask
, true))
12648 res
= _ll_getattr(in
, mask
, perms
);
12651 fill_statx(in
, mask
, stx
);
12652 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
12656 int Client::_ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
12657 const UserPerm
& perms
, InodeRef
*inp
)
12659 vinodeno_t vino
= _get_vino(in
);
12661 ldout(cct
, 8) << __func__
<< " " << vino
<< " mask " << hex
<< mask
<< dec
12663 tout(cct
) << __func__
<< std::endl
;
12664 tout(cct
) << vino
.ino
.val
<< std::endl
;
12665 tout(cct
) << stx
->stx_mode
<< std::endl
;
12666 tout(cct
) << stx
->stx_uid
<< std::endl
;
12667 tout(cct
) << stx
->stx_gid
<< std::endl
;
12668 tout(cct
) << stx
->stx_size
<< std::endl
;
12669 tout(cct
) << stx
->stx_mtime
<< std::endl
;
12670 tout(cct
) << stx
->stx_atime
<< std::endl
;
12671 tout(cct
) << stx
->stx_btime
<< std::endl
;
12672 tout(cct
) << mask
<< std::endl
;
12674 if (!fuse_default_permissions
) {
12675 int res
= may_setattr(in
, stx
, mask
, perms
);
12680 mask
&= ~(CEPH_SETATTR_MTIME_NOW
| CEPH_SETATTR_ATIME_NOW
);
12682 return __setattrx(in
, stx
, mask
, perms
, inp
);
12685 int Client::ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
12686 const UserPerm
& perms
)
12688 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12689 if (!mref_reader
.is_state_satisfied())
12690 return -CEPHFS_ENOTCONN
;
12692 std::scoped_lock
lock(client_lock
);
12694 InodeRef
target(in
);
12695 int res
= _ll_setattrx(in
, stx
, mask
, perms
, &target
);
12697 ceph_assert(in
== target
.get());
12698 fill_statx(in
, in
->caps_issued(), stx
);
12701 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
12705 int Client::ll_setattr(Inode
*in
, struct stat
*attr
, int mask
,
12706 const UserPerm
& perms
)
12708 struct ceph_statx stx
;
12709 stat_to_statx(attr
, &stx
);
12711 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12712 if (!mref_reader
.is_state_satisfied())
12713 return -CEPHFS_ENOTCONN
;
12715 std::scoped_lock
lock(client_lock
);
12717 InodeRef
target(in
);
12718 int res
= _ll_setattrx(in
, &stx
, mask
, perms
, &target
);
12720 ceph_assert(in
== target
.get());
12721 fill_stat(in
, attr
);
12724 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
12732 int Client::getxattr(const char *path
, const char *name
, void *value
, size_t size
,
12733 const UserPerm
& perms
)
12735 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12736 if (!mref_reader
.is_state_satisfied())
12737 return -CEPHFS_ENOTCONN
;
12739 std::scoped_lock
lock(client_lock
);
12742 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
12745 return _getxattr(in
, name
, value
, size
, perms
);
12748 int Client::lgetxattr(const char *path
, const char *name
, void *value
, size_t size
,
12749 const UserPerm
& perms
)
12751 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12752 if (!mref_reader
.is_state_satisfied())
12753 return -CEPHFS_ENOTCONN
;
12755 std::scoped_lock
lock(client_lock
);
12758 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
12761 return _getxattr(in
, name
, value
, size
, perms
);
12764 int Client::fgetxattr(int fd
, const char *name
, void *value
, size_t size
,
12765 const UserPerm
& perms
)
12767 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12768 if (!mref_reader
.is_state_satisfied())
12769 return -CEPHFS_ENOTCONN
;
12771 std::scoped_lock
lock(client_lock
);
12773 Fh
*f
= get_filehandle(fd
);
12775 return -CEPHFS_EBADF
;
12776 return _getxattr(f
->inode
, name
, value
, size
, perms
);
12779 int Client::listxattr(const char *path
, char *list
, size_t size
,
12780 const UserPerm
& perms
)
12782 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12783 if (!mref_reader
.is_state_satisfied())
12784 return -CEPHFS_ENOTCONN
;
12786 std::scoped_lock
lock(client_lock
);
12789 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
12792 return Client::_listxattr(in
.get(), list
, size
, perms
);
12795 int Client::llistxattr(const char *path
, char *list
, size_t size
,
12796 const UserPerm
& perms
)
12798 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12799 if (!mref_reader
.is_state_satisfied())
12800 return -CEPHFS_ENOTCONN
;
12802 std::scoped_lock
lock(client_lock
);
12805 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
12808 return Client::_listxattr(in
.get(), list
, size
, perms
);
12811 int Client::flistxattr(int fd
, char *list
, size_t size
, const UserPerm
& perms
)
12813 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12814 if (!mref_reader
.is_state_satisfied())
12815 return -CEPHFS_ENOTCONN
;
12817 std::scoped_lock
lock(client_lock
);
12819 Fh
*f
= get_filehandle(fd
);
12821 return -CEPHFS_EBADF
;
12822 return Client::_listxattr(f
->inode
.get(), list
, size
, perms
);
12825 int Client::removexattr(const char *path
, const char *name
,
12826 const UserPerm
& perms
)
12828 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12829 if (!mref_reader
.is_state_satisfied())
12830 return -CEPHFS_ENOTCONN
;
12832 std::scoped_lock
lock(client_lock
);
12835 int r
= Client::path_walk(path
, &in
, perms
, true);
12838 return _removexattr(in
, name
, perms
);
12841 int Client::lremovexattr(const char *path
, const char *name
,
12842 const UserPerm
& perms
)
12844 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12845 if (!mref_reader
.is_state_satisfied())
12846 return -CEPHFS_ENOTCONN
;
12848 std::scoped_lock
lock(client_lock
);
12851 int r
= Client::path_walk(path
, &in
, perms
, false);
12854 return _removexattr(in
, name
, perms
);
12857 int Client::fremovexattr(int fd
, const char *name
, const UserPerm
& perms
)
12859 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12860 if (!mref_reader
.is_state_satisfied())
12861 return -CEPHFS_ENOTCONN
;
12863 std::scoped_lock
lock(client_lock
);
12865 Fh
*f
= get_filehandle(fd
);
12867 return -CEPHFS_EBADF
;
12868 return _removexattr(f
->inode
, name
, perms
);
12871 int Client::setxattr(const char *path
, const char *name
, const void *value
,
12872 size_t size
, int flags
, const UserPerm
& perms
)
12874 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12875 if (!mref_reader
.is_state_satisfied())
12876 return -CEPHFS_ENOTCONN
;
12878 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
12880 std::scoped_lock
lock(client_lock
);
12883 int r
= Client::path_walk(path
, &in
, perms
, true);
12886 return _setxattr(in
, name
, value
, size
, flags
, perms
);
12889 int Client::lsetxattr(const char *path
, const char *name
, const void *value
,
12890 size_t size
, int flags
, const UserPerm
& perms
)
12892 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12893 if (!mref_reader
.is_state_satisfied())
12894 return -CEPHFS_ENOTCONN
;
12896 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
12898 std::scoped_lock
lock(client_lock
);
12901 int r
= Client::path_walk(path
, &in
, perms
, false);
12904 return _setxattr(in
, name
, value
, size
, flags
, perms
);
12907 int Client::fsetxattr(int fd
, const char *name
, const void *value
, size_t size
,
12908 int flags
, const UserPerm
& perms
)
12910 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
12911 if (!mref_reader
.is_state_satisfied())
12912 return -CEPHFS_ENOTCONN
;
12914 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
12916 std::scoped_lock
lock(client_lock
);
12918 Fh
*f
= get_filehandle(fd
);
12920 return -CEPHFS_EBADF
;
12921 return _setxattr(f
->inode
, name
, value
, size
, flags
, perms
);
12924 int Client::_getxattr(Inode
*in
, const char *name
, void *value
, size_t size
,
12925 const UserPerm
& perms
)
12928 const VXattr
*vxattr
= nullptr;
12930 vxattr
= _match_vxattr(in
, name
);
12932 r
= -CEPHFS_ENODATA
;
12934 // Do a force getattr to get the latest quota before returning
12935 // a value to userspace.
12937 if (vxattr
->flags
& VXATTR_RSTAT
) {
12938 flags
|= CEPH_STAT_RSTAT
;
12940 if (vxattr
->flags
& VXATTR_DIRSTAT
) {
12941 flags
|= CEPH_CAP_FILE_SHARED
;
12943 r
= _getattr(in
, flags
| CEPH_STAT_CAP_XATTR
, perms
, true);
12945 // Error from getattr!
12949 // call pointer-to-member function
12951 if (!(vxattr
->exists_cb
&& !(this->*(vxattr
->exists_cb
))(in
))) {
12952 r
= (this->*(vxattr
->getxattr_cb
))(in
, buf
, sizeof(buf
));
12954 r
= -CEPHFS_ENODATA
;
12958 if (r
> (int)size
) {
12959 r
= -CEPHFS_ERANGE
;
12960 } else if (r
> 0) {
12961 memcpy(value
, buf
, r
);
12967 if (!strncmp(name
, "ceph.", 5)) {
12968 r
= _getvxattr(in
, perms
, name
, size
, value
, MDS_RANK_NONE
);
12972 if (acl_type
== NO_ACL
&& !strncmp(name
, "system.", 7)) {
12973 r
= -CEPHFS_EOPNOTSUPP
;
12977 r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
12980 r
= -CEPHFS_ENODATA
;
12981 if (in
->xattrs
.count(n
)) {
12982 r
= in
->xattrs
[n
].length();
12983 if (r
> 0 && size
!= 0) {
12984 if (size
>= (unsigned)r
)
12985 memcpy(value
, in
->xattrs
[n
].c_str(), r
);
12987 r
= -CEPHFS_ERANGE
;
12992 ldout(cct
, 8) << "_getxattr(" << in
->ino
<< ", \"" << name
<< "\", " << size
<< ") = " << r
<< dendl
;
12996 int Client::_getxattr(InodeRef
&in
, const char *name
, void *value
, size_t size
,
12997 const UserPerm
& perms
)
12999 if (cct
->_conf
->client_permissions
) {
13000 int r
= xattr_permission(in
.get(), name
, MAY_READ
, perms
);
13004 return _getxattr(in
.get(), name
, value
, size
, perms
);
13007 int Client::ll_getxattr(Inode
*in
, const char *name
, void *value
,
13008 size_t size
, const UserPerm
& perms
)
13010 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13011 if (!mref_reader
.is_state_satisfied())
13012 return -CEPHFS_ENOTCONN
;
13014 vinodeno_t vino
= _get_vino(in
);
13016 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
13017 tout(cct
) << __func__
<< std::endl
;
13018 tout(cct
) << vino
.ino
.val
<< std::endl
;
13019 tout(cct
) << name
<< std::endl
;
13021 std::scoped_lock
lock(client_lock
);
13022 if (!fuse_default_permissions
) {
13023 int r
= xattr_permission(in
, name
, MAY_READ
, perms
);
13028 return _getxattr(in
, name
, value
, size
, perms
);
13031 int Client::_listxattr(Inode
*in
, char *name
, size_t size
,
13032 const UserPerm
& perms
)
13034 bool len_only
= (size
== 0);
13035 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
13041 for ([[maybe_unused
]] const auto &[xattr_name
, xattr_value_bl
] : in
->xattrs
) {
13042 if (xattr_name
.rfind("ceph.", 0) == 0) {
13046 size_t this_len
= xattr_name
.length() + 1;
13051 if (this_len
> size
) {
13052 r
= -CEPHFS_ERANGE
;
13056 memcpy(name
, xattr_name
.c_str(), this_len
);
13061 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ", " << size
<< ") = " << r
<< dendl
;
13065 int Client::ll_listxattr(Inode
*in
, char *names
, size_t size
,
13066 const UserPerm
& perms
)
13068 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13069 if (!mref_reader
.is_state_satisfied())
13070 return -CEPHFS_ENOTCONN
;
13072 vinodeno_t vino
= _get_vino(in
);
13074 ldout(cct
, 3) << __func__
<< " " << vino
<< " size " << size
<< dendl
;
13075 tout(cct
) << __func__
<< std::endl
;
13076 tout(cct
) << vino
.ino
.val
<< std::endl
;
13077 tout(cct
) << size
<< std::endl
;
13079 std::scoped_lock
lock(client_lock
);
13080 return _listxattr(in
, names
, size
, perms
);
13083 int Client::_do_setxattr(Inode
*in
, const char *name
, const void *value
,
13084 size_t size
, int flags
, const UserPerm
& perms
)
13087 int xattr_flags
= 0;
13089 xattr_flags
|= CEPH_XATTR_REMOVE
;
13090 if (flags
& XATTR_CREATE
)
13091 xattr_flags
|= CEPH_XATTR_CREATE
;
13092 if (flags
& XATTR_REPLACE
)
13093 xattr_flags
|= CEPH_XATTR_REPLACE
;
13095 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETXATTR
);
13097 in
->make_nosnap_relative_path(path
);
13098 req
->set_filepath(path
);
13099 req
->set_string2(name
);
13100 req
->set_inode(in
);
13101 req
->head
.args
.setxattr
.flags
= xattr_flags
;
13104 ceph_assert(value
|| size
== 0);
13105 bl
.append((const char*)value
, size
);
13108 int res
= make_request(req
, perms
);
13111 ldout(cct
, 3) << __func__
<< "(" << in
->ino
<< ", \"" << name
<< "\") = " <<
13116 int Client::_setxattr(Inode
*in
, const char *name
, const void *value
,
13117 size_t size
, int flags
, const UserPerm
& perms
)
13119 if (in
->snapid
!= CEPH_NOSNAP
) {
13120 return -CEPHFS_EROFS
;
13125 } else if (value
== NULL
) {
13126 return -CEPHFS_EINVAL
;
13129 bool posix_acl_xattr
= false;
13130 if (acl_type
== POSIX_ACL
)
13131 posix_acl_xattr
= !strncmp(name
, "system.", 7);
13133 if (strncmp(name
, "user.", 5) &&
13134 strncmp(name
, "security.", 9) &&
13135 strncmp(name
, "trusted.", 8) &&
13136 strncmp(name
, "ceph.", 5) &&
13138 return -CEPHFS_EOPNOTSUPP
;
13140 bool check_realm
= false;
13142 if (posix_acl_xattr
) {
13143 if (!strcmp(name
, ACL_EA_ACCESS
)) {
13144 mode_t new_mode
= in
->mode
;
13146 int ret
= posix_acl_equiv_mode(value
, size
, &new_mode
);
13153 if (new_mode
!= in
->mode
) {
13154 struct ceph_statx stx
;
13155 stx
.stx_mode
= new_mode
;
13156 ret
= _do_setattr(in
, &stx
, CEPH_SETATTR_MODE
, perms
, nullptr);
13161 } else if (!strcmp(name
, ACL_EA_DEFAULT
)) {
13163 if (!S_ISDIR(in
->mode
))
13164 return -CEPHFS_EACCES
;
13165 int ret
= posix_acl_check(value
, size
);
13167 return -CEPHFS_EINVAL
;
13174 return -CEPHFS_EOPNOTSUPP
;
13177 const VXattr
*vxattr
= _match_vxattr(in
, name
);
13179 if (vxattr
->readonly
)
13180 return -CEPHFS_EOPNOTSUPP
;
13181 if (vxattr
->setxattr_cb
)
13182 return (this->*(vxattr
->setxattr_cb
))(in
, value
, size
, perms
);
13183 if (vxattr
->name
.compare(0, 10, "ceph.quota") == 0 && value
)
13184 check_realm
= true;
13188 int ret
= _do_setxattr(in
, name
, value
, size
, flags
, perms
);
13189 if (ret
>= 0 && check_realm
) {
13190 // check if snaprealm was created for quota inode
13191 if (in
->quota
.is_enabled() &&
13192 !(in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
))
13193 ret
= -CEPHFS_EOPNOTSUPP
;
13199 int Client::_setxattr(InodeRef
&in
, const char *name
, const void *value
,
13200 size_t size
, int flags
, const UserPerm
& perms
)
13202 if (cct
->_conf
->client_permissions
) {
13203 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
13207 return _setxattr(in
.get(), name
, value
, size
, flags
, perms
);
13210 int Client::_setxattr_check_data_pool(string
& name
, string
& value
, const OSDMap
*osdmap
)
13213 if (name
== "layout") {
13214 string::iterator begin
= value
.begin();
13215 string::iterator end
= value
.end();
13216 keys_and_values
<string::iterator
> p
; // create instance of parser
13217 std::map
<string
, string
> m
; // map to receive results
13218 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
13219 return -CEPHFS_EINVAL
;
13222 return -CEPHFS_EINVAL
;
13223 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
13224 if (q
->first
== "pool") {
13229 } else if (name
== "layout.pool") {
13233 if (tmp
.length()) {
13236 pool
= boost::lexical_cast
<unsigned>(tmp
);
13237 if (!osdmap
->have_pg_pool(pool
))
13238 return -CEPHFS_ENOENT
;
13239 } catch (boost::bad_lexical_cast
const&) {
13240 pool
= osdmap
->lookup_pg_pool_name(tmp
);
13242 return -CEPHFS_ENOENT
;
13250 void Client::_setxattr_maybe_wait_for_osdmap(const char *name
, const void *value
, size_t size
)
13252 // For setting pool of layout, MetaRequest need osdmap epoch.
13253 // There is a race which create a new data pool but client and mds both don't have.
13254 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
13255 ldout(cct
, 15) << __func__
<< ": name = " << name
<< dendl
;
13256 if (strcmp(name
, "ceph.file.layout.pool") == 0 || strcmp(name
, "ceph.dir.layout.pool") == 0 ||
13257 strcmp(name
, "ceph.file.layout") == 0 || strcmp(name
, "ceph.dir.layout") == 0) {
13258 string
rest(strstr(name
, "layout"));
13259 string
v((const char*)value
, size
);
13260 int r
= objecter
->with_osdmap([&](const OSDMap
& o
) {
13261 return _setxattr_check_data_pool(rest
, v
, &o
);
13264 if (r
== -CEPHFS_ENOENT
) {
13266 ldout(cct
, 20) << __func__
<< ": waiting for latest osdmap" << dendl
;
13267 objecter
->wait_for_latest_osdmap(ca::use_blocked
[ec
]);
13268 ldout(cct
, 20) << __func__
<< ": got latest osdmap: " << ec
<< dendl
;
13273 int Client::ll_setxattr(Inode
*in
, const char *name
, const void *value
,
13274 size_t size
, int flags
, const UserPerm
& perms
)
13276 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13277 if (!mref_reader
.is_state_satisfied())
13278 return -CEPHFS_ENOTCONN
;
13280 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
13282 vinodeno_t vino
= _get_vino(in
);
13284 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
13285 tout(cct
) << __func__
<< std::endl
;
13286 tout(cct
) << vino
.ino
.val
<< std::endl
;
13287 tout(cct
) << name
<< std::endl
;
13289 std::scoped_lock
lock(client_lock
);
13290 if (!fuse_default_permissions
) {
13291 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
13295 return _setxattr(in
, name
, value
, size
, flags
, perms
);
13298 int Client::_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
13300 if (in
->snapid
!= CEPH_NOSNAP
) {
13301 return -CEPHFS_EROFS
;
13304 // same xattrs supported by kernel client
13305 if (strncmp(name
, "user.", 5) &&
13306 strncmp(name
, "system.", 7) &&
13307 strncmp(name
, "security.", 9) &&
13308 strncmp(name
, "trusted.", 8) &&
13309 strncmp(name
, "ceph.", 5))
13310 return -CEPHFS_EOPNOTSUPP
;
13312 const VXattr
*vxattr
= _match_vxattr(in
, name
);
13313 if (vxattr
&& vxattr
->readonly
)
13314 return -CEPHFS_EOPNOTSUPP
;
13316 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_RMXATTR
);
13318 in
->make_nosnap_relative_path(path
);
13319 req
->set_filepath(path
);
13320 req
->set_filepath2(name
);
13321 req
->set_inode(in
);
13323 int res
= make_request(req
, perms
);
13326 ldout(cct
, 8) << "_removexattr(" << in
->ino
<< ", \"" << name
<< "\") = " << res
<< dendl
;
13330 int Client::_removexattr(InodeRef
&in
, const char *name
, const UserPerm
& perms
)
13332 if (cct
->_conf
->client_permissions
) {
13333 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
13337 return _removexattr(in
.get(), name
, perms
);
13340 int Client::ll_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
13342 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13343 if (!mref_reader
.is_state_satisfied())
13344 return -CEPHFS_ENOTCONN
;
13346 vinodeno_t vino
= _get_vino(in
);
13348 ldout(cct
, 3) << "ll_removexattr " << vino
<< " " << name
<< dendl
;
13349 tout(cct
) << "ll_removexattr" << std::endl
;
13350 tout(cct
) << vino
.ino
.val
<< std::endl
;
13351 tout(cct
) << name
<< std::endl
;
13353 std::scoped_lock
lock(client_lock
);
13354 if (!fuse_default_permissions
) {
13355 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
13360 return _removexattr(in
, name
, perms
);
13363 bool Client::_vxattrcb_fscrypt_auth_exists(Inode
*in
)
13365 bool exists
= !in
->fscrypt_auth
.empty();
13367 ldout(cct
, 10) << "fscrypt_auth exists " << exists
<< dendl
;
13371 size_t Client::_vxattrcb_fscrypt_auth(Inode
*in
, char *val
, size_t size
)
13373 size_t count
= in
->fscrypt_auth
.size();
13376 memcpy(val
, in
->fscrypt_auth
.data(), count
);
13380 int Client::_vxattrcb_fscrypt_auth_set(Inode
*in
, const void *val
, size_t size
,
13381 const UserPerm
& perms
)
13383 struct ceph_statx stx
= { 0 };
13384 std::vector
<uint8_t> aux
;
13387 memcpy(aux
.data(), val
, size
);
13389 return _do_setattr(in
, &stx
, CEPH_SETATTR_FSCRYPT_AUTH
, perms
, nullptr, &aux
);
13392 bool Client::_vxattrcb_fscrypt_file_exists(Inode
*in
)
13394 return !in
->fscrypt_file
.empty();
13397 size_t Client::_vxattrcb_fscrypt_file(Inode
*in
, char *val
, size_t size
)
13399 size_t count
= in
->fscrypt_file
.size();
13402 memcpy(val
, in
->fscrypt_file
.data(), count
);
13406 int Client::_vxattrcb_fscrypt_file_set(Inode
*in
, const void *val
, size_t size
,
13407 const UserPerm
& perms
)
13409 struct ceph_statx stx
= { 0 };
13410 std::vector
<uint8_t> aux
;
13413 memcpy(aux
.data(), val
, size
);
13415 return _do_setattr(in
, &stx
, CEPH_SETATTR_FSCRYPT_FILE
, perms
, nullptr, &aux
);
13418 bool Client::_vxattrcb_quota_exists(Inode
*in
)
13420 return in
->quota
.is_enabled() &&
13421 (in
->snapid
!= CEPH_NOSNAP
||
13422 (in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
));
13424 size_t Client::_vxattrcb_quota(Inode
*in
, char *val
, size_t size
)
13426 return snprintf(val
, size
,
13427 "max_bytes=%lld max_files=%lld",
13428 (long long int)in
->quota
.max_bytes
,
13429 (long long int)in
->quota
.max_files
);
13431 size_t Client::_vxattrcb_quota_max_bytes(Inode
*in
, char *val
, size_t size
)
13433 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_bytes
);
13435 size_t Client::_vxattrcb_quota_max_files(Inode
*in
, char *val
, size_t size
)
13437 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_files
);
13440 bool Client::_vxattrcb_layout_exists(Inode
*in
)
13442 return in
->layout
!= file_layout_t();
13444 size_t Client::_vxattrcb_layout(Inode
*in
, char *val
, size_t size
)
13446 int r
= snprintf(val
, size
,
13447 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
13448 (unsigned long long)in
->layout
.stripe_unit
,
13449 (unsigned long long)in
->layout
.stripe_count
,
13450 (unsigned long long)in
->layout
.object_size
);
13451 objecter
->with_osdmap([&](const OSDMap
& o
) {
13452 if (o
.have_pg_pool(in
->layout
.pool_id
))
13453 r
+= snprintf(val
+ r
, size
- r
, "%s",
13454 o
.get_pool_name(in
->layout
.pool_id
).c_str());
13456 r
+= snprintf(val
+ r
, size
- r
, "%" PRIu64
,
13457 (uint64_t)in
->layout
.pool_id
);
13459 if (in
->layout
.pool_ns
.length())
13460 r
+= snprintf(val
+ r
, size
- r
, " pool_namespace=%s",
13461 in
->layout
.pool_ns
.c_str());
13464 size_t Client::_vxattrcb_layout_stripe_unit(Inode
*in
, char *val
, size_t size
)
13466 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_unit
);
13468 size_t Client::_vxattrcb_layout_stripe_count(Inode
*in
, char *val
, size_t size
)
13470 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_count
);
13472 size_t Client::_vxattrcb_layout_object_size(Inode
*in
, char *val
, size_t size
)
13474 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.object_size
);
13476 size_t Client::_vxattrcb_layout_pool(Inode
*in
, char *val
, size_t size
)
13479 objecter
->with_osdmap([&](const OSDMap
& o
) {
13480 if (o
.have_pg_pool(in
->layout
.pool_id
))
13481 r
= snprintf(val
, size
, "%s", o
.get_pool_name(
13482 in
->layout
.pool_id
).c_str());
13484 r
= snprintf(val
, size
, "%" PRIu64
, (uint64_t)in
->layout
.pool_id
);
13488 size_t Client::_vxattrcb_layout_pool_namespace(Inode
*in
, char *val
, size_t size
)
13490 return snprintf(val
, size
, "%s", in
->layout
.pool_ns
.c_str());
13492 size_t Client::_vxattrcb_dir_entries(Inode
*in
, char *val
, size_t size
)
13494 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->dirstat
.nfiles
+ in
->dirstat
.nsubdirs
));
13496 size_t Client::_vxattrcb_dir_files(Inode
*in
, char *val
, size_t size
)
13498 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nfiles
);
13500 size_t Client::_vxattrcb_dir_subdirs(Inode
*in
, char *val
, size_t size
)
13502 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nsubdirs
);
13504 size_t Client::_vxattrcb_dir_rentries(Inode
*in
, char *val
, size_t size
)
13506 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->rstat
.rfiles
+ in
->rstat
.rsubdirs
));
13508 size_t Client::_vxattrcb_dir_rfiles(Inode
*in
, char *val
, size_t size
)
13510 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rfiles
);
13512 size_t Client::_vxattrcb_dir_rsubdirs(Inode
*in
, char *val
, size_t size
)
13514 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rsubdirs
);
13516 size_t Client::_vxattrcb_dir_rsnaps(Inode
*in
, char *val
, size_t size
)
13518 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rsnaps
);
13520 size_t Client::_vxattrcb_dir_rbytes(Inode
*in
, char *val
, size_t size
)
13522 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rbytes
);
13524 size_t Client::_vxattrcb_dir_rctime(Inode
*in
, char *val
, size_t size
)
13526 return snprintf(val
, size
, "%ld.%09ld", (long)in
->rstat
.rctime
.sec(),
13527 (long)in
->rstat
.rctime
.nsec());
13529 bool Client::_vxattrcb_dir_pin_exists(Inode
*in
)
13531 return in
->dir_pin
!= -CEPHFS_ENODATA
;
13533 size_t Client::_vxattrcb_dir_pin(Inode
*in
, char *val
, size_t size
)
13535 return snprintf(val
, size
, "%ld", (long)in
->dir_pin
);
13538 bool Client::_vxattrcb_snap_btime_exists(Inode
*in
)
13540 return !in
->snap_btime
.is_zero();
13543 size_t Client::_vxattrcb_snap_btime(Inode
*in
, char *val
, size_t size
)
13545 return snprintf(val
, size
, "%llu.%09lu",
13546 (long long unsigned)in
->snap_btime
.sec(),
13547 (long unsigned)in
->snap_btime
.nsec());
13550 size_t Client::_vxattrcb_caps(Inode
*in
, char *val
, size_t size
)
13554 in
->caps_issued(&issued
);
13555 return snprintf(val
, size
, "%s/0x%x", ccap_string(issued
).c_str(), issued
);
13558 bool Client::_vxattrcb_mirror_info_exists(Inode
*in
)
13560 // checking one of the xattrs would suffice
13561 return in
->xattrs
.count("ceph.mirror.info.cluster_id") != 0;
13564 size_t Client::_vxattrcb_mirror_info(Inode
*in
, char *val
, size_t size
)
13566 return snprintf(val
, size
, "cluster_id=%.*s fs_id=%.*s",
13567 in
->xattrs
["ceph.mirror.info.cluster_id"].length(),
13568 in
->xattrs
["ceph.mirror.info.cluster_id"].c_str(),
13569 in
->xattrs
["ceph.mirror.info.fs_id"].length(),
13570 in
->xattrs
["ceph.mirror.info.fs_id"].c_str());
13573 size_t Client::_vxattrcb_cluster_fsid(Inode
*in
, char *val
, size_t size
)
13575 return snprintf(val
, size
, "%s", monclient
->get_fsid().to_string().c_str());
13578 size_t Client::_vxattrcb_client_id(Inode
*in
, char *val
, size_t size
)
13580 auto name
= messenger
->get_myname();
13581 return snprintf(val
, size
, "%s%" PRId64
, name
.type_str(), name
.num());
13584 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
13585 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
13587 #define XATTR_NAME_CEPH(_type, _name, _flags) \
13589 name: CEPH_XATTR_NAME(_type, _name), \
13590 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
13595 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
13597 name: CEPH_XATTR_NAME2(_type, _name, _field), \
13598 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
13600 exists_cb: &Client::_vxattrcb_layout_exists, \
13603 #define XATTR_QUOTA_FIELD(_type, _name) \
13605 name: CEPH_XATTR_NAME(_type, _name), \
13606 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
13608 exists_cb: &Client::_vxattrcb_quota_exists, \
13612 const Client::VXattr
Client::_dir_vxattrs
[] = {
13614 name
: "ceph.dir.layout",
13615 getxattr_cb
: &Client::_vxattrcb_layout
,
13617 exists_cb
: &Client::_vxattrcb_layout_exists
,
13621 // Delete the following dir layout field definitions for release "S"
13622 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_unit
),
13623 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_count
),
13624 XATTR_LAYOUT_FIELD(dir
, layout
, object_size
),
13625 XATTR_LAYOUT_FIELD(dir
, layout
, pool
),
13626 XATTR_LAYOUT_FIELD(dir
, layout
, pool_namespace
),
13627 XATTR_NAME_CEPH(dir
, entries
, VXATTR_DIRSTAT
),
13628 XATTR_NAME_CEPH(dir
, files
, VXATTR_DIRSTAT
),
13629 XATTR_NAME_CEPH(dir
, subdirs
, VXATTR_DIRSTAT
),
13630 XATTR_NAME_CEPH(dir
, rentries
, VXATTR_RSTAT
),
13631 XATTR_NAME_CEPH(dir
, rfiles
, VXATTR_RSTAT
),
13632 XATTR_NAME_CEPH(dir
, rsubdirs
, VXATTR_RSTAT
),
13633 XATTR_NAME_CEPH(dir
, rsnaps
, VXATTR_RSTAT
),
13634 XATTR_NAME_CEPH(dir
, rbytes
, VXATTR_RSTAT
),
13635 XATTR_NAME_CEPH(dir
, rctime
, VXATTR_RSTAT
),
13637 name
: "ceph.quota",
13638 getxattr_cb
: &Client::_vxattrcb_quota
,
13640 exists_cb
: &Client::_vxattrcb_quota_exists
,
13643 XATTR_QUOTA_FIELD(quota
, max_bytes
),
13644 XATTR_QUOTA_FIELD(quota
, max_files
),
13646 // Delete the following dir pin field definitions for release "S"
13648 name
: "ceph.dir.pin",
13649 getxattr_cb
: &Client::_vxattrcb_dir_pin
,
13651 exists_cb
: &Client::_vxattrcb_dir_pin_exists
,
13655 name
: "ceph.snap.btime",
13656 getxattr_cb
: &Client::_vxattrcb_snap_btime
,
13658 exists_cb
: &Client::_vxattrcb_snap_btime_exists
,
13662 name
: "ceph.mirror.info",
13663 getxattr_cb
: &Client::_vxattrcb_mirror_info
,
13665 exists_cb
: &Client::_vxattrcb_mirror_info_exists
,
13670 getxattr_cb
: &Client::_vxattrcb_caps
,
13675 { name
: "" } /* Required table terminator */
13678 const Client::VXattr
Client::_file_vxattrs
[] = {
13680 name
: "ceph.file.layout",
13681 getxattr_cb
: &Client::_vxattrcb_layout
,
13683 exists_cb
: &Client::_vxattrcb_layout_exists
,
13686 XATTR_LAYOUT_FIELD(file
, layout
, stripe_unit
),
13687 XATTR_LAYOUT_FIELD(file
, layout
, stripe_count
),
13688 XATTR_LAYOUT_FIELD(file
, layout
, object_size
),
13689 XATTR_LAYOUT_FIELD(file
, layout
, pool
),
13690 XATTR_LAYOUT_FIELD(file
, layout
, pool_namespace
),
13692 name
: "ceph.snap.btime",
13693 getxattr_cb
: &Client::_vxattrcb_snap_btime
,
13695 exists_cb
: &Client::_vxattrcb_snap_btime_exists
,
13700 getxattr_cb
: &Client::_vxattrcb_caps
,
13705 { name
: "" } /* Required table terminator */
13708 const Client::VXattr
Client::_common_vxattrs
[] = {
13710 name
: "ceph.cluster_fsid",
13711 getxattr_cb
: &Client::_vxattrcb_cluster_fsid
,
13713 exists_cb
: nullptr,
13717 name
: "ceph.client_id",
13718 getxattr_cb
: &Client::_vxattrcb_client_id
,
13720 exists_cb
: nullptr,
13724 name
: "ceph.fscrypt.auth",
13725 getxattr_cb
: &Client::_vxattrcb_fscrypt_auth
,
13726 setxattr_cb
: &Client::_vxattrcb_fscrypt_auth_set
,
13728 exists_cb
: &Client::_vxattrcb_fscrypt_auth_exists
,
13732 name
: "ceph.fscrypt.file",
13733 getxattr_cb
: &Client::_vxattrcb_fscrypt_file
,
13734 setxattr_cb
: &Client::_vxattrcb_fscrypt_file_set
,
13736 exists_cb
: &Client::_vxattrcb_fscrypt_file_exists
,
13739 { name
: "" } /* Required table terminator */
13742 const Client::VXattr
*Client::_get_vxattrs(Inode
*in
)
13745 return _dir_vxattrs
;
13746 else if (in
->is_file())
13747 return _file_vxattrs
;
13751 const Client::VXattr
*Client::_match_vxattr(Inode
*in
, const char *name
)
13753 if (strncmp(name
, "ceph.", 5) == 0) {
13754 const VXattr
*vxattr
= _get_vxattrs(in
);
13756 while (!vxattr
->name
.empty()) {
13757 if (vxattr
->name
== name
)
13763 // for common vxattrs
13764 vxattr
= _common_vxattrs
;
13765 while (!vxattr
->name
.empty()) {
13766 if (vxattr
->name
== name
)
13775 int Client::ll_readlink(Inode
*in
, char *buf
, size_t buflen
, const UserPerm
& perms
)
13777 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13778 if (!mref_reader
.is_state_satisfied())
13779 return -CEPHFS_ENOTCONN
;
13781 vinodeno_t vino
= _get_vino(in
);
13783 ldout(cct
, 3) << "ll_readlink " << vino
<< dendl
;
13784 tout(cct
) << "ll_readlink" << std::endl
;
13785 tout(cct
) << vino
.ino
.val
<< std::endl
;
13787 std::scoped_lock
lock(client_lock
);
13788 for (auto dn
: in
->dentries
) {
13792 int r
= _readlink(in
, buf
, buflen
); // FIXME: no permission checking!
13793 ldout(cct
, 3) << "ll_readlink " << vino
<< " = " << r
<< dendl
;
13797 int Client::_mknod(Inode
*dir
, const char *name
, mode_t mode
, dev_t rdev
,
13798 const UserPerm
& perms
, InodeRef
*inp
)
13800 ldout(cct
, 8) << "_mknod(" << dir
->ino
<< " " << name
<< ", 0" << oct
13801 << mode
<< dec
<< ", " << rdev
<< ", uid " << perms
.uid()
13802 << ", gid " << perms
.gid() << ")" << dendl
;
13804 if (strlen(name
) > NAME_MAX
)
13805 return -CEPHFS_ENAMETOOLONG
;
13807 if (dir
->snapid
!= CEPH_NOSNAP
) {
13808 return -CEPHFS_EROFS
;
13810 if (is_quota_files_exceeded(dir
, perms
)) {
13811 return -CEPHFS_EDQUOT
;
13814 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_MKNOD
);
13816 req
->set_inode_owner_uid_gid(perms
.uid(), perms
.gid());
13819 dir
->make_nosnap_relative_path(path
);
13820 path
.push_dentry(name
);
13821 req
->set_filepath(path
);
13822 req
->set_inode(dir
);
13823 req
->head
.args
.mknod
.rdev
= rdev
;
13824 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
13825 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
13827 bufferlist xattrs_bl
;
13828 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
13833 req
->head
.args
.mknod
.mode
= mode
;
13834 if (xattrs_bl
.length() > 0)
13835 req
->set_data(xattrs_bl
);
13837 Dentry
*de
= get_or_create(dir
, name
);
13838 req
->set_dentry(de
);
13840 res
= make_request(req
, perms
, inp
);
13844 ldout(cct
, 8) << "mknod(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
13848 int Client::ll_mknod(Inode
*parent
, const char *name
, mode_t mode
,
13849 dev_t rdev
, struct stat
*attr
, Inode
**out
,
13850 const UserPerm
& perms
)
13852 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13853 if (!mref_reader
.is_state_satisfied())
13854 return -CEPHFS_ENOTCONN
;
13856 vinodeno_t vparent
= _get_vino(parent
);
13858 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
<< dendl
;
13859 tout(cct
) << "ll_mknod" << std::endl
;
13860 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13861 tout(cct
) << name
<< std::endl
;
13862 tout(cct
) << mode
<< std::endl
;
13863 tout(cct
) << rdev
<< std::endl
;
13865 std::scoped_lock
lock(client_lock
);
13866 if (!fuse_default_permissions
) {
13867 int r
= may_create(parent
, perms
);
13873 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
13875 fill_stat(in
, attr
);
13878 tout(cct
) << attr
->st_ino
<< std::endl
;
13879 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
13880 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
13885 int Client::ll_mknodx(Inode
*parent
, const char *name
, mode_t mode
,
13886 dev_t rdev
, Inode
**out
,
13887 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
13888 const UserPerm
& perms
)
13890 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
13891 if (!mref_reader
.is_state_satisfied())
13892 return -CEPHFS_ENOTCONN
;
13894 unsigned caps
= statx_to_mask(flags
, want
);
13896 vinodeno_t vparent
= _get_vino(parent
);
13898 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
<< dendl
;
13899 tout(cct
) << "ll_mknodx" << std::endl
;
13900 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13901 tout(cct
) << name
<< std::endl
;
13902 tout(cct
) << mode
<< std::endl
;
13903 tout(cct
) << rdev
<< std::endl
;
13905 std::scoped_lock
lock(client_lock
);
13907 if (!fuse_default_permissions
) {
13908 int r
= may_create(parent
, perms
);
13914 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
13916 fill_statx(in
, caps
, stx
);
13919 tout(cct
) << stx
->stx_ino
<< std::endl
;
13920 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
13921 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
13926 int Client::_create(Inode
*dir
, const char *name
, int flags
, mode_t mode
,
13927 InodeRef
*inp
, Fh
**fhp
, int stripe_unit
, int stripe_count
,
13928 int object_size
, const char *data_pool
, bool *created
,
13929 const UserPerm
& perms
, std::string alternate_name
)
13931 ldout(cct
, 8) << "_create(" << dir
->ino
<< " " << name
<< ", 0" << oct
<<
13932 mode
<< dec
<< ")" << dendl
;
13934 if (strlen(name
) > NAME_MAX
)
13935 return -CEPHFS_ENAMETOOLONG
;
13936 if (dir
->snapid
!= CEPH_NOSNAP
) {
13937 return -CEPHFS_EROFS
;
13939 if (is_quota_files_exceeded(dir
, perms
)) {
13940 return -CEPHFS_EDQUOT
;
13943 // use normalized flags to generate cmode
13944 int cflags
= ceph_flags_sys2wire(flags
);
13945 if (cct
->_conf
.get_val
<bool>("client_force_lazyio"))
13946 cflags
|= CEPH_O_LAZY
;
13948 int cmode
= ceph_flags_to_mode(cflags
);
13950 int64_t pool_id
= -1;
13951 if (data_pool
&& *data_pool
) {
13952 pool_id
= objecter
->with_osdmap(
13953 std::mem_fn(&OSDMap::lookup_pg_pool_name
), data_pool
);
13955 return -CEPHFS_EINVAL
;
13956 if (pool_id
> 0xffffffffll
)
13957 return -CEPHFS_ERANGE
; // bummer!
13960 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_CREATE
);
13962 req
->set_inode_owner_uid_gid(perms
.uid(), perms
.gid());
13965 dir
->make_nosnap_relative_path(path
);
13966 path
.push_dentry(name
);
13967 req
->set_filepath(path
);
13968 req
->set_alternate_name(std::move(alternate_name
));
13969 req
->set_inode(dir
);
13970 req
->head
.args
.open
.flags
= cflags
| CEPH_O_CREAT
;
13972 req
->head
.args
.open
.stripe_unit
= stripe_unit
;
13973 req
->head
.args
.open
.stripe_count
= stripe_count
;
13974 req
->head
.args
.open
.object_size
= object_size
;
13975 if (cct
->_conf
->client_debug_getattr_caps
)
13976 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
13978 req
->head
.args
.open
.mask
= 0;
13979 req
->head
.args
.open
.pool
= pool_id
;
13980 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
13981 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
13984 bufferlist xattrs_bl
;
13985 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
13990 req
->head
.args
.open
.mode
= mode
;
13991 if (xattrs_bl
.length() > 0)
13992 req
->set_data(xattrs_bl
);
13994 Dentry
*de
= get_or_create(dir
, name
);
13995 req
->set_dentry(de
);
13997 res
= make_request(req
, perms
, inp
, created
);
14002 /* If the caller passed a value in fhp, do the open */
14004 (*inp
)->get_open_ref(cmode
);
14005 *fhp
= _create_fh(inp
->get(), flags
, cmode
, perms
);
14011 ldout(cct
, 8) << "create(" << path
<< ", 0" << oct
<< mode
<< dec
14012 << " layout " << stripe_unit
14013 << ' ' << stripe_count
14014 << ' ' << object_size
14015 <<") = " << res
<< dendl
;
14019 int Client::_mkdir(Inode
*dir
, const char *name
, mode_t mode
, const UserPerm
& perm
,
14020 InodeRef
*inp
, const std::map
<std::string
, std::string
> &metadata
,
14021 std::string alternate_name
)
14023 ldout(cct
, 8) << "_mkdir(" << dir
->ino
<< " " << name
<< ", 0" << oct
14024 << mode
<< dec
<< ", uid " << perm
.uid()
14025 << ", gid " << perm
.gid() << ")" << dendl
;
14027 if (strlen(name
) > NAME_MAX
)
14028 return -CEPHFS_ENAMETOOLONG
;
14030 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
14031 return -CEPHFS_EROFS
;
14033 if (is_quota_files_exceeded(dir
, perm
)) {
14034 return -CEPHFS_EDQUOT
;
14037 bool is_snap_op
= dir
->snapid
== CEPH_SNAPDIR
;
14038 MetaRequest
*req
= new MetaRequest(is_snap_op
?
14039 CEPH_MDS_OP_MKSNAP
: CEPH_MDS_OP_MKDIR
);
14042 req
->set_inode_owner_uid_gid(perm
.uid(), perm
.gid());
14045 dir
->make_nosnap_relative_path(path
);
14046 path
.push_dentry(name
);
14047 req
->set_filepath(path
);
14048 req
->set_inode(dir
);
14049 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
14050 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
14051 req
->set_alternate_name(std::move(alternate_name
));
14055 int res
= _posix_acl_create(dir
, &mode
, bl
, perm
);
14060 req
->head
.args
.mkdir
.mode
= mode
;
14062 SnapPayload payload
;
14063 // clear the bufferlist that may have been populated by the call
14064 // to _posix_acl_create(). MDS mksnap does not make use of it.
14065 // So, reuse it to pass metadata payload.
14067 payload
.metadata
= metadata
;
14068 encode(payload
, bl
);
14070 if (bl
.length() > 0) {
14074 Dentry
*de
= get_or_create(dir
, name
);
14075 req
->set_dentry(de
);
14077 ldout(cct
, 10) << "_mkdir: making request" << dendl
;
14078 res
= make_request(req
, perm
, inp
);
14079 ldout(cct
, 10) << "_mkdir result is " << res
<< dendl
;
14083 ldout(cct
, 8) << "_mkdir(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
14087 int Client::ll_mkdir(Inode
*parent
, const char *name
, mode_t mode
,
14088 struct stat
*attr
, Inode
**out
, const UserPerm
& perm
)
14090 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14091 if (!mref_reader
.is_state_satisfied())
14092 return -CEPHFS_ENOTCONN
;
14094 vinodeno_t vparent
= _get_vino(parent
);
14096 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
<< dendl
;
14097 tout(cct
) << "ll_mkdir" << std::endl
;
14098 tout(cct
) << vparent
.ino
.val
<< std::endl
;
14099 tout(cct
) << name
<< std::endl
;
14100 tout(cct
) << mode
<< std::endl
;
14102 std::scoped_lock
lock(client_lock
);
14104 if (!fuse_default_permissions
) {
14105 int r
= may_create(parent
, perm
);
14111 int r
= _mkdir(parent
, name
, mode
, perm
, &in
);
14113 fill_stat(in
, attr
);
14116 tout(cct
) << attr
->st_ino
<< std::endl
;
14117 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
14118 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
14123 int Client::ll_mkdirx(Inode
*parent
, const char *name
, mode_t mode
, Inode
**out
,
14124 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
14125 const UserPerm
& perms
)
14127 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14128 if (!mref_reader
.is_state_satisfied())
14129 return -CEPHFS_ENOTCONN
;
14131 vinodeno_t vparent
= _get_vino(parent
);
14133 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
<< dendl
;
14134 tout(cct
) << "ll_mkdirx" << std::endl
;
14135 tout(cct
) << vparent
.ino
.val
<< std::endl
;
14136 tout(cct
) << name
<< std::endl
;
14137 tout(cct
) << mode
<< std::endl
;
14139 std::scoped_lock
lock(client_lock
);
14141 if (!fuse_default_permissions
) {
14142 int r
= may_create(parent
, perms
);
14148 int r
= _mkdir(parent
, name
, mode
, perms
, &in
);
14150 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
14156 tout(cct
) << stx
->stx_ino
<< std::endl
;
14157 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
14158 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
14163 int Client::_symlink(Inode
*dir
, const char *name
, const char *target
,
14164 const UserPerm
& perms
, std::string alternate_name
, InodeRef
*inp
)
14166 ldout(cct
, 8) << "_symlink(" << dir
->ino
<< " " << name
<< ", " << target
14167 << ", uid " << perms
.uid() << ", gid " << perms
.gid() << ")"
14170 if (strlen(name
) > NAME_MAX
)
14171 return -CEPHFS_ENAMETOOLONG
;
14173 if (dir
->snapid
!= CEPH_NOSNAP
) {
14174 return -CEPHFS_EROFS
;
14176 if (is_quota_files_exceeded(dir
, perms
)) {
14177 return -CEPHFS_EDQUOT
;
14180 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SYMLINK
);
14182 req
->set_inode_owner_uid_gid(perms
.uid(), perms
.gid());
14185 dir
->make_nosnap_relative_path(path
);
14186 path
.push_dentry(name
);
14187 req
->set_filepath(path
);
14188 req
->set_alternate_name(std::move(alternate_name
));
14189 req
->set_inode(dir
);
14190 req
->set_string2(target
);
14191 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
14192 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
14194 Dentry
*de
= get_or_create(dir
, name
);
14195 req
->set_dentry(de
);
14197 int res
= make_request(req
, perms
, inp
);
14200 ldout(cct
, 8) << "_symlink(\"" << path
<< "\", \"" << target
<< "\") = " <<
14205 int Client::ll_symlink(Inode
*parent
, const char *name
, const char *value
,
14206 struct stat
*attr
, Inode
**out
, const UserPerm
& perms
)
14208 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14209 if (!mref_reader
.is_state_satisfied())
14210 return -CEPHFS_ENOTCONN
;
14212 vinodeno_t vparent
= _get_vino(parent
);
14214 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
<< " -> " << value
14216 tout(cct
) << "ll_symlink" << std::endl
;
14217 tout(cct
) << vparent
.ino
.val
<< std::endl
;
14218 tout(cct
) << name
<< std::endl
;
14219 tout(cct
) << value
<< std::endl
;
14221 std::scoped_lock
lock(client_lock
);
14223 if (!fuse_default_permissions
) {
14224 int r
= may_create(parent
, perms
);
14230 int r
= _symlink(parent
, name
, value
, perms
, "", &in
);
14232 fill_stat(in
, attr
);
14235 tout(cct
) << attr
->st_ino
<< std::endl
;
14236 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
14237 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
14242 int Client::ll_symlinkx(Inode
*parent
, const char *name
, const char *value
,
14243 Inode
**out
, struct ceph_statx
*stx
, unsigned want
,
14244 unsigned flags
, const UserPerm
& perms
)
14246 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14247 if (!mref_reader
.is_state_satisfied())
14248 return -CEPHFS_ENOTCONN
;
14250 vinodeno_t vparent
= _get_vino(parent
);
14252 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
<< " -> " << value
14254 tout(cct
) << "ll_symlinkx" << std::endl
;
14255 tout(cct
) << vparent
.ino
.val
<< std::endl
;
14256 tout(cct
) << name
<< std::endl
;
14257 tout(cct
) << value
<< std::endl
;
14259 std::scoped_lock
lock(client_lock
);
14261 if (!fuse_default_permissions
) {
14262 int r
= may_create(parent
, perms
);
14268 int r
= _symlink(parent
, name
, value
, perms
, "", &in
);
14270 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
14273 tout(cct
) << stx
->stx_ino
<< std::endl
;
14274 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
14275 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
14280 int Client::_unlink(Inode
*dir
, const char *name
, const UserPerm
& perm
)
14282 ldout(cct
, 8) << "_unlink(" << dir
->ino
<< " " << name
14283 << " uid " << perm
.uid() << " gid " << perm
.gid()
14286 if (dir
->snapid
!= CEPH_NOSNAP
) {
14287 return -CEPHFS_EROFS
;
14290 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_UNLINK
);
14293 dir
->make_nosnap_relative_path(path
);
14294 path
.push_dentry(name
);
14295 req
->set_filepath(path
);
14299 Dentry
*de
= get_or_create(dir
, name
);
14300 req
->set_dentry(de
);
14301 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
14302 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
14304 int res
= _lookup(dir
, name
, 0, &otherin
, perm
);
14310 in
= otherin
.get();
14311 req
->set_other_inode(in
);
14312 in
->break_all_delegs();
14313 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
14315 req
->set_inode(dir
);
14317 res
= make_request(req
, perm
);
14320 ldout(cct
, 8) << "unlink(" << path
<< ") = " << res
<< dendl
;
14324 int Client::ll_unlink(Inode
*in
, const char *name
, const UserPerm
& perm
)
14326 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14327 if (!mref_reader
.is_state_satisfied())
14328 return -CEPHFS_ENOTCONN
;
14330 vinodeno_t vino
= _get_vino(in
);
14332 ldout(cct
, 3) << "ll_unlink " << vino
<< " " << name
<< dendl
;
14333 tout(cct
) << "ll_unlink" << std::endl
;
14334 tout(cct
) << vino
.ino
.val
<< std::endl
;
14335 tout(cct
) << name
<< std::endl
;
14337 std::scoped_lock
lock(client_lock
);
14339 if (!fuse_default_permissions
) {
14340 int r
= may_delete(in
, name
, perm
);
14344 return _unlink(in
, name
, perm
);
14347 int Client::_rmdir(Inode
*dir
, const char *name
, const UserPerm
& perms
)
14349 ldout(cct
, 8) << "_rmdir(" << dir
->ino
<< " " << name
<< " uid "
14350 << perms
.uid() << " gid " << perms
.gid() << ")" << dendl
;
14352 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
14353 return -CEPHFS_EROFS
;
14356 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_RMSNAP
: CEPH_MDS_OP_RMDIR
;
14357 MetaRequest
*req
= new MetaRequest(op
);
14359 dir
->make_nosnap_relative_path(path
);
14360 path
.push_dentry(name
);
14361 req
->set_filepath(path
);
14362 req
->set_inode(dir
);
14364 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
14365 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
14366 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
14370 Dentry
*de
= get_or_create(dir
, name
);
14371 if (op
== CEPH_MDS_OP_RMDIR
)
14372 req
->set_dentry(de
);
14376 int res
= _lookup(dir
, name
, 0, &in
, perms
);
14382 if (op
== CEPH_MDS_OP_RMSNAP
) {
14383 unlink(de
, true, true);
14386 req
->set_other_inode(in
.get());
14388 res
= make_request(req
, perms
);
14391 ldout(cct
, 8) << "rmdir(" << path
<< ") = " << res
<< dendl
;
14395 int Client::ll_rmdir(Inode
*in
, const char *name
, const UserPerm
& perms
)
14397 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14398 if (!mref_reader
.is_state_satisfied())
14399 return -CEPHFS_ENOTCONN
;
14401 vinodeno_t vino
= _get_vino(in
);
14403 ldout(cct
, 3) << "ll_rmdir " << vino
<< " " << name
<< dendl
;
14404 tout(cct
) << "ll_rmdir" << std::endl
;
14405 tout(cct
) << vino
.ino
.val
<< std::endl
;
14406 tout(cct
) << name
<< std::endl
;
14408 std::scoped_lock
lock(client_lock
);
14410 if (!fuse_default_permissions
) {
14411 int r
= may_delete(in
, name
, perms
);
14416 return _rmdir(in
, name
, perms
);
14419 int Client::_rename(Inode
*fromdir
, const char *fromname
, Inode
*todir
, const char *toname
, const UserPerm
& perm
, std::string alternate_name
)
14421 ldout(cct
, 8) << "_rename(" << fromdir
->ino
<< " " << fromname
<< " to "
14422 << todir
->ino
<< " " << toname
14423 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")"
14426 if (fromdir
->snapid
!= todir
->snapid
)
14427 return -CEPHFS_EXDEV
;
14429 int op
= CEPH_MDS_OP_RENAME
;
14430 if (fromdir
->snapid
!= CEPH_NOSNAP
) {
14431 if (fromdir
== todir
&& fromdir
->snapid
== CEPH_SNAPDIR
)
14432 op
= CEPH_MDS_OP_RENAMESNAP
;
14434 return -CEPHFS_EROFS
;
14437 // don't allow cross-quota renames
14438 if (cct
->_conf
.get_val
<bool>("client_quota") && fromdir
!= todir
) {
14439 Inode
*fromdir_root
=
14440 fromdir
->quota
.is_enabled() ? fromdir
: get_quota_root(fromdir
, perm
);
14441 Inode
*todir_root
=
14442 todir
->quota
.is_enabled() ? todir
: get_quota_root(todir
, perm
);
14443 if (fromdir_root
!= todir_root
) {
14444 return -CEPHFS_EXDEV
;
14449 MetaRequest
*req
= new MetaRequest(op
);
14452 fromdir
->make_nosnap_relative_path(from
);
14453 from
.push_dentry(fromname
);
14455 todir
->make_nosnap_relative_path(to
);
14456 to
.push_dentry(toname
);
14457 req
->set_filepath(to
);
14458 req
->set_filepath2(from
);
14459 req
->set_alternate_name(std::move(alternate_name
));
14461 Dentry
*oldde
= get_or_create(fromdir
, fromname
);
14462 Dentry
*de
= get_or_create(todir
, toname
);
14465 if (op
== CEPH_MDS_OP_RENAME
) {
14466 req
->set_old_dentry(oldde
);
14467 req
->old_dentry_drop
= CEPH_CAP_FILE_SHARED
;
14468 req
->old_dentry_unless
= CEPH_CAP_FILE_EXCL
;
14470 de
->is_renaming
= true;
14471 req
->set_dentry(de
);
14472 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
14473 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
14475 InodeRef oldin
, otherin
;
14476 res
= _lookup(fromdir
, fromname
, 0, &oldin
, perm
, nullptr, true);
14480 Inode
*oldinode
= oldin
.get();
14481 oldinode
->break_all_delegs();
14482 req
->set_old_inode(oldinode
);
14483 req
->old_inode_drop
= CEPH_CAP_LINK_SHARED
;
14485 res
= _lookup(todir
, toname
, 0, &otherin
, perm
, nullptr, true);
14489 Inode
*in
= otherin
.get();
14490 req
->set_other_inode(in
);
14491 in
->break_all_delegs();
14493 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
14495 case -CEPHFS_ENOENT
:
14501 req
->set_inode(todir
);
14503 // renamesnap reply contains no tracedn, so we need to invalidate
14505 unlink(oldde
, true, true);
14506 unlink(de
, true, true);
14508 req
->set_inode(todir
);
14511 res
= make_request(req
, perm
, &target
);
14512 ldout(cct
, 10) << "rename result is " << res
<< dendl
;
14514 // if rename fails it will miss waking up the waiters
14515 if (op
== CEPH_MDS_OP_RENAME
&& de
->is_renaming
) {
14516 de
->is_renaming
= false;
14517 signal_cond_list(waiting_for_rename
);
14520 // renamed item from our cache
14523 ldout(cct
, 8) << "_rename(" << from
<< ", " << to
<< ") = " << res
<< dendl
;
14531 int Client::ll_rename(Inode
*parent
, const char *name
, Inode
*newparent
,
14532 const char *newname
, const UserPerm
& perm
)
14534 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14535 if (!mref_reader
.is_state_satisfied())
14536 return -CEPHFS_ENOTCONN
;
14538 vinodeno_t vparent
= _get_vino(parent
);
14539 vinodeno_t vnewparent
= _get_vino(newparent
);
14541 ldout(cct
, 3) << "ll_rename " << vparent
<< " " << name
<< " to "
14542 << vnewparent
<< " " << newname
<< dendl
;
14543 tout(cct
) << "ll_rename" << std::endl
;
14544 tout(cct
) << vparent
.ino
.val
<< std::endl
;
14545 tout(cct
) << name
<< std::endl
;
14546 tout(cct
) << vnewparent
.ino
.val
<< std::endl
;
14547 tout(cct
) << newname
<< std::endl
;
14549 std::scoped_lock
lock(client_lock
);
14551 if (!fuse_default_permissions
) {
14552 int r
= may_delete(parent
, name
, perm
);
14555 r
= may_delete(newparent
, newname
, perm
);
14556 if (r
< 0 && r
!= -CEPHFS_ENOENT
)
14560 return _rename(parent
, name
, newparent
, newname
, perm
, "");
14563 int Client::_link(Inode
*in
, Inode
*dir
, const char *newname
, const UserPerm
& perm
, std::string alternate_name
, InodeRef
*inp
)
14565 ldout(cct
, 8) << "_link(" << in
->ino
<< " to " << dir
->ino
<< " " << newname
14566 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")" << dendl
;
14568 if (strlen(newname
) > NAME_MAX
)
14569 return -CEPHFS_ENAMETOOLONG
;
14571 if (in
->snapid
!= CEPH_NOSNAP
|| dir
->snapid
!= CEPH_NOSNAP
) {
14572 return -CEPHFS_EROFS
;
14574 if (is_quota_files_exceeded(dir
, perm
)) {
14575 return -CEPHFS_EDQUOT
;
14578 in
->break_all_delegs();
14579 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LINK
);
14581 filepath
path(newname
, dir
->ino
);
14582 req
->set_filepath(path
);
14583 req
->set_alternate_name(std::move(alternate_name
));
14584 filepath
existing(in
->ino
);
14585 req
->set_filepath2(existing
);
14587 req
->set_inode(dir
);
14588 req
->inode_drop
= CEPH_CAP_FILE_SHARED
;
14589 req
->inode_unless
= CEPH_CAP_FILE_EXCL
;
14591 Dentry
*de
= get_or_create(dir
, newname
);
14592 req
->set_dentry(de
);
14594 int res
= make_request(req
, perm
, inp
);
14595 ldout(cct
, 10) << "link result is " << res
<< dendl
;
14598 ldout(cct
, 8) << "link(" << existing
<< ", " << path
<< ") = " << res
<< dendl
;
14602 int Client::ll_link(Inode
*in
, Inode
*newparent
, const char *newname
,
14603 const UserPerm
& perm
)
14605 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14606 if (!mref_reader
.is_state_satisfied())
14607 return -CEPHFS_ENOTCONN
;
14609 vinodeno_t vino
= _get_vino(in
);
14610 vinodeno_t vnewparent
= _get_vino(newparent
);
14612 ldout(cct
, 3) << "ll_link " << vino
<< " to " << vnewparent
<< " " <<
14614 tout(cct
) << "ll_link" << std::endl
;
14615 tout(cct
) << vino
.ino
.val
<< std::endl
;
14616 tout(cct
) << vnewparent
<< std::endl
;
14617 tout(cct
) << newname
<< std::endl
;
14621 std::scoped_lock
lock(client_lock
);
14623 if (!fuse_default_permissions
) {
14624 if (S_ISDIR(in
->mode
))
14625 return -CEPHFS_EPERM
;
14627 int r
= may_hardlink(in
, perm
);
14631 r
= may_create(newparent
, perm
);
14636 return _link(in
, newparent
, newname
, perm
, "", &target
);
14639 int Client::ll_num_osds(void)
14641 std::scoped_lock
lock(client_lock
);
14642 return objecter
->with_osdmap(std::mem_fn(&OSDMap::get_num_osds
));
14645 int Client::ll_osdaddr(int osd
, uint32_t *addr
)
14647 std::scoped_lock
lock(client_lock
);
14650 bool exists
= objecter
->with_osdmap([&](const OSDMap
& o
) {
14651 if (!o
.exists(osd
))
14653 g
= o
.get_addrs(osd
).front();
14658 uint32_t nb_addr
= (g
.in4_addr()).sin_addr
.s_addr
;
14659 *addr
= ntohl(nb_addr
);
14663 uint32_t Client::ll_stripe_unit(Inode
*in
)
14665 std::scoped_lock
lock(client_lock
);
14666 return in
->layout
.stripe_unit
;
14669 uint64_t Client::ll_snap_seq(Inode
*in
)
14671 std::scoped_lock
lock(client_lock
);
14672 return in
->snaprealm
->seq
;
14675 int Client::ll_file_layout(Inode
*in
, file_layout_t
*layout
)
14677 std::scoped_lock
lock(client_lock
);
14678 *layout
= in
->layout
;
14682 int Client::ll_file_layout(Fh
*fh
, file_layout_t
*layout
)
14684 return ll_file_layout(fh
->inode
.get(), layout
);
14687 /* Currently we cannot take advantage of redundancy in reads, since we
14688 would have to go through all possible placement groups (a
14689 potentially quite large number determined by a hash), and use CRUSH
14690 to calculate the appropriate set of OSDs for each placement group,
14691 then index into that. An array with one entry per OSD is much more
14692 tractable and works for demonstration purposes. */
14694 int Client::ll_get_stripe_osd(Inode
*in
, uint64_t blockno
,
14695 file_layout_t
* layout
)
14697 std::scoped_lock
lock(client_lock
);
14699 inodeno_t ino
= in
->ino
;
14700 uint32_t object_size
= layout
->object_size
;
14701 uint32_t su
= layout
->stripe_unit
;
14702 uint32_t stripe_count
= layout
->stripe_count
;
14703 uint64_t stripes_per_object
= object_size
/ su
;
14704 uint64_t stripeno
= 0, stripepos
= 0;
14707 stripeno
= blockno
/ stripe_count
; // which horizontal stripe (Y)
14708 stripepos
= blockno
% stripe_count
; // which object in the object set (X)
14710 uint64_t objectsetno
= stripeno
/ stripes_per_object
; // which object set
14711 uint64_t objectno
= objectsetno
* stripe_count
+ stripepos
; // object id
14713 object_t oid
= file_object_t(ino
, objectno
);
14714 return objecter
->with_osdmap([&](const OSDMap
& o
) {
14715 ceph_object_layout olayout
=
14716 o
.file_to_object_layout(oid
, *layout
);
14717 pg_t pg
= (pg_t
)olayout
.ol_pgid
;
14720 o
.pg_to_acting_osds(pg
, &osds
, &primary
);
14725 /* Return the offset of the block, internal to the object */
14727 uint64_t Client::ll_get_internal_offset(Inode
*in
, uint64_t blockno
)
14729 std::scoped_lock
lock(client_lock
);
14730 file_layout_t
*layout
=&(in
->layout
);
14731 uint32_t object_size
= layout
->object_size
;
14732 uint32_t su
= layout
->stripe_unit
;
14733 uint64_t stripes_per_object
= object_size
/ su
;
14735 return (blockno
% stripes_per_object
) * su
;
14738 int Client::ll_opendir(Inode
*in
, int flags
, dir_result_t
** dirpp
,
14739 const UserPerm
& perms
)
14741 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14742 if (!mref_reader
.is_state_satisfied())
14743 return -CEPHFS_ENOTCONN
;
14745 vinodeno_t vino
= _get_vino(in
);
14747 ldout(cct
, 3) << "ll_opendir " << vino
<< dendl
;
14748 tout(cct
) << "ll_opendir" << std::endl
;
14749 tout(cct
) << vino
.ino
.val
<< std::endl
;
14751 std::scoped_lock
lock(client_lock
);
14753 if (!fuse_default_permissions
) {
14754 int r
= may_open(in
, flags
, perms
);
14759 int r
= _opendir(in
, dirpp
, perms
);
14760 tout(cct
) << (uintptr_t)*dirpp
<< std::endl
;
14762 ldout(cct
, 3) << "ll_opendir " << vino
<< " = " << r
<< " (" << *dirpp
<< ")"
14767 int Client::ll_releasedir(dir_result_t
*dirp
)
14769 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14770 if (!mref_reader
.is_state_satisfied())
14771 return -CEPHFS_ENOTCONN
;
14773 ldout(cct
, 3) << "ll_releasedir " << dirp
<< dendl
;
14774 tout(cct
) << "ll_releasedir" << std::endl
;
14775 tout(cct
) << (uintptr_t)dirp
<< std::endl
;
14777 std::scoped_lock
lock(client_lock
);
14783 int Client::ll_fsyncdir(dir_result_t
*dirp
)
14785 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14786 if (!mref_reader
.is_state_satisfied())
14787 return -CEPHFS_ENOTCONN
;
14789 ldout(cct
, 3) << "ll_fsyncdir " << dirp
<< dendl
;
14790 tout(cct
) << "ll_fsyncdir" << std::endl
;
14791 tout(cct
) << (uintptr_t)dirp
<< std::endl
;
14793 std::scoped_lock
lock(client_lock
);
14794 return _fsync(dirp
->inode
.get(), false);
14797 int Client::ll_open(Inode
*in
, int flags
, Fh
**fhp
, const UserPerm
& perms
)
14799 ceph_assert(!(flags
& O_CREAT
));
14801 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14802 if (!mref_reader
.is_state_satisfied())
14803 return -CEPHFS_ENOTCONN
;
14805 vinodeno_t vino
= _get_vino(in
);
14807 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) << dendl
;
14808 tout(cct
) << "ll_open" << std::endl
;
14809 tout(cct
) << vino
.ino
.val
<< std::endl
;
14810 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
14812 std::scoped_lock
lock(client_lock
);
14815 if (!fuse_default_permissions
) {
14816 r
= may_open(in
, flags
, perms
);
14821 r
= _open(in
, flags
, 0, fhp
/* may be NULL */, perms
);
14824 Fh
*fhptr
= fhp
? *fhp
: NULL
;
14826 ll_unclosed_fh_set
.insert(fhptr
);
14828 tout(cct
) << (uintptr_t)fhptr
<< std::endl
;
14829 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) <<
14830 " = " << r
<< " (" << fhptr
<< ")" << dendl
;
14834 int Client::_ll_create(Inode
*parent
, const char *name
, mode_t mode
,
14835 int flags
, InodeRef
*in
, int caps
, Fh
**fhp
,
14836 const UserPerm
& perms
)
14840 vinodeno_t vparent
= _get_vino(parent
);
14842 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
14843 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << ", uid " << perms
.uid()
14844 << ", gid " << perms
.gid() << dendl
;
14845 tout(cct
) << "ll_create" << std::endl
;
14846 tout(cct
) << vparent
.ino
.val
<< std::endl
;
14847 tout(cct
) << name
<< std::endl
;
14848 tout(cct
) << mode
<< std::endl
;
14849 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
14851 bool created
= false;
14852 int r
= _lookup(parent
, name
, caps
, in
, perms
);
14854 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
14855 return -CEPHFS_EEXIST
;
14857 if (r
== -CEPHFS_ENOENT
&& (flags
& O_CREAT
)) {
14858 if (!fuse_default_permissions
) {
14859 r
= may_create(parent
, perms
);
14863 r
= _create(parent
, name
, flags
, mode
, in
, fhp
, 0, 0, 0, NULL
, &created
,
14874 ldout(cct
, 20) << "_ll_create created = " << created
<< dendl
;
14876 if (!fuse_default_permissions
) {
14877 r
= may_open(in
->get(), flags
, perms
);
14880 int release_r
= _release_fh(*fhp
);
14881 ceph_assert(release_r
== 0); // during create, no async data ops should have happened
14886 if (*fhp
== NULL
) {
14887 r
= _open(in
->get(), flags
, mode
, fhp
, perms
);
14895 ll_unclosed_fh_set
.insert(*fhp
);
14904 Inode
*inode
= in
->get();
14905 if (use_faked_inos())
14906 ino
= inode
->faked_ino
;
14911 tout(cct
) << (uintptr_t)*fhp
<< std::endl
;
14912 tout(cct
) << ino
<< std::endl
;
14913 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
14914 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << " = " << r
<< " (" <<
14915 *fhp
<< " " << hex
<< ino
<< dec
<< ")" << dendl
;
14920 int Client::ll_create(Inode
*parent
, const char *name
, mode_t mode
,
14921 int flags
, struct stat
*attr
, Inode
**outp
, Fh
**fhp
,
14922 const UserPerm
& perms
)
14924 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14925 if (!mref_reader
.is_state_satisfied())
14926 return -CEPHFS_ENOTCONN
;
14928 std::scoped_lock
lock(client_lock
);
14931 int r
= _ll_create(parent
, name
, mode
, flags
, &in
, CEPH_STAT_CAP_INODE_ALL
,
14936 // passing an Inode in outp requires an additional ref
14941 fill_stat(in
, attr
);
14949 int Client::ll_createx(Inode
*parent
, const char *name
, mode_t mode
,
14950 int oflags
, Inode
**outp
, Fh
**fhp
,
14951 struct ceph_statx
*stx
, unsigned want
, unsigned lflags
,
14952 const UserPerm
& perms
)
14954 unsigned caps
= statx_to_mask(lflags
, want
);
14955 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14956 if (!mref_reader
.is_state_satisfied())
14957 return -CEPHFS_ENOTCONN
;
14959 std::scoped_lock
lock(client_lock
);
14962 int r
= _ll_create(parent
, name
, mode
, oflags
, &in
, caps
, fhp
, perms
);
14966 // passing an Inode in outp requires an additional ref
14971 fill_statx(in
, caps
, stx
);
14980 loff_t
Client::ll_lseek(Fh
*fh
, loff_t offset
, int whence
)
14982 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14983 if (!mref_reader
.is_state_satisfied())
14984 return -CEPHFS_ENOTCONN
;
14986 tout(cct
) << "ll_lseek" << std::endl
;
14987 tout(cct
) << offset
<< std::endl
;
14988 tout(cct
) << whence
<< std::endl
;
14990 std::scoped_lock
lock(client_lock
);
14991 return _lseek(fh
, offset
, whence
);
14994 int Client::ll_read(Fh
*fh
, loff_t off
, loff_t len
, bufferlist
*bl
)
14996 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
14997 if (!mref_reader
.is_state_satisfied())
14998 return -CEPHFS_ENOTCONN
;
15000 ldout(cct
, 3) << "ll_read " << fh
<< " " << fh
->inode
->ino
<< " " << " " << off
<< "~" << len
<< dendl
;
15001 tout(cct
) << "ll_read" << std::endl
;
15002 tout(cct
) << (uintptr_t)fh
<< std::endl
;
15003 tout(cct
) << off
<< std::endl
;
15004 tout(cct
) << len
<< std::endl
;
15006 /* We can't return bytes written larger than INT_MAX, clamp len to that */
15007 len
= std::min(len
, (loff_t
)INT_MAX
);
15008 std::scoped_lock
lock(client_lock
);
15010 int r
= _read(fh
, off
, len
, bl
);
15011 ldout(cct
, 3) << "ll_read " << fh
<< " " << off
<< "~" << len
<< " = " << r
15016 int Client::ll_read_block(Inode
*in
, uint64_t blockid
,
15020 file_layout_t
* layout
)
15022 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15023 if (!mref_reader
.is_state_satisfied())
15024 return -CEPHFS_ENOTCONN
;
15026 vinodeno_t vino
= _get_vino(in
);
15027 object_t oid
= file_object_t(vino
.ino
, blockid
);
15028 C_SaferCond onfinish
;
15031 objecter
->read(oid
,
15032 object_locator_t(layout
->pool_id
),
15037 CEPH_OSD_FLAG_READ
,
15040 int r
= onfinish
.wait();
15042 bl
.begin().copy(bl
.length(), buf
);
15049 /* It appears that the OSD doesn't return success unless the entire
15050 buffer was written, return the write length on success. */
15052 int Client::ll_write_block(Inode
*in
, uint64_t blockid
,
15053 char* buf
, uint64_t offset
,
15054 uint64_t length
, file_layout_t
* layout
,
15055 uint64_t snapseq
, uint32_t sync
)
15057 vinodeno_t vino
= ll_get_vino(in
);
15059 std::unique_ptr
<C_SaferCond
> onsafe
= nullptr;
15061 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15062 if (!mref_reader
.is_state_satisfied())
15063 return -CEPHFS_ENOTCONN
;
15066 return -CEPHFS_EINVAL
;
15068 if (true || sync
) {
15069 /* if write is stable, the epilogue is waiting on
15071 onsafe
.reset(new C_SaferCond("Client::ll_write_block flock"));
15073 object_t oid
= file_object_t(vino
.ino
, blockid
);
15074 SnapContext fakesnap
;
15075 ceph::bufferlist bl
;
15077 bl
.push_back(buffer::copy(buf
, length
));
15080 ldout(cct
, 1) << "ll_block_write for " << vino
.ino
<< "." << blockid
15083 fakesnap
.seq
= snapseq
;
15085 /* lock just in time */
15086 objecter
->write(oid
,
15087 object_locator_t(layout
->pool_id
),
15092 ceph::real_clock::now(),
15096 if (nullptr != onsafe
) {
15097 r
= onsafe
->wait();
15107 int Client::ll_commit_blocks(Inode
*in
,
15112 BarrierContext *bctx;
15113 vinodeno_t vino = _get_vino(in);
15114 uint64_t ino = vino.ino;
15116 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
15117 << offset << " to " << length << dendl;
15120 return -CEPHFS_EINVAL;
15123 std::scoped_lock lock(client_lock);
15124 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
15125 if (p != barriers.end()) {
15126 barrier_interval civ(offset, offset + length);
15127 p->second->commit_barrier(civ);
15133 int Client::ll_write(Fh
*fh
, loff_t off
, loff_t len
, const char *data
)
15135 ldout(cct
, 3) << "ll_write " << fh
<< " " << fh
->inode
->ino
<< " " << off
<<
15136 "~" << len
<< dendl
;
15137 tout(cct
) << "ll_write" << std::endl
;
15138 tout(cct
) << (uintptr_t)fh
<< std::endl
;
15139 tout(cct
) << off
<< std::endl
;
15140 tout(cct
) << len
<< std::endl
;
15142 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15143 if (!mref_reader
.is_state_satisfied())
15144 return -CEPHFS_ENOTCONN
;
15146 /* We can't return bytes written larger than INT_MAX, clamp len to that */
15147 len
= std::min(len
, (loff_t
)INT_MAX
);
15148 std::scoped_lock
lock(client_lock
);
15150 int r
= _write(fh
, off
, len
, data
, NULL
, 0);
15151 ldout(cct
, 3) << "ll_write " << fh
<< " " << off
<< "~" << len
<< " = " << r
15156 int64_t Client::ll_writev(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
15158 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15159 if (!mref_reader
.is_state_satisfied())
15160 return -CEPHFS_ENOTCONN
;
15162 std::scoped_lock
cl(client_lock
);
15163 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, true, false);
15166 int64_t Client::ll_readv(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
15168 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15169 if (!mref_reader
.is_state_satisfied())
15170 return -CEPHFS_ENOTCONN
;
15172 std::scoped_lock
cl(client_lock
);
15173 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, false, false);
15176 int Client::ll_flush(Fh
*fh
)
15178 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15179 if (!mref_reader
.is_state_satisfied())
15180 return -CEPHFS_ENOTCONN
;
15182 ldout(cct
, 3) << "ll_flush " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
15183 tout(cct
) << "ll_flush" << std::endl
;
15184 tout(cct
) << (uintptr_t)fh
<< std::endl
;
15186 std::scoped_lock
lock(client_lock
);
15190 int Client::ll_fsync(Fh
*fh
, bool syncdataonly
)
15192 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15193 if (!mref_reader
.is_state_satisfied())
15194 return -CEPHFS_ENOTCONN
;
15196 ldout(cct
, 3) << "ll_fsync " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
15197 tout(cct
) << "ll_fsync" << std::endl
;
15198 tout(cct
) << (uintptr_t)fh
<< std::endl
;
15200 std::scoped_lock
lock(client_lock
);
15201 int r
= _fsync(fh
, syncdataonly
);
15203 // If we're returning an error, clear it from the FH
15204 fh
->take_async_err();
15209 int Client::ll_sync_inode(Inode
*in
, bool syncdataonly
)
15211 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15212 if (!mref_reader
.is_state_satisfied())
15213 return -CEPHFS_ENOTCONN
;
15215 ldout(cct
, 3) << "ll_sync_inode " << *in
<< " " << dendl
;
15216 tout(cct
) << "ll_sync_inode" << std::endl
;
15217 tout(cct
) << (uintptr_t)in
<< std::endl
;
15219 std::scoped_lock
lock(client_lock
);
15220 return _fsync(in
, syncdataonly
);
15223 int Client::clear_suid_sgid(Inode
*in
, const UserPerm
& perms
, bool defer
)
15225 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< " defer "
15228 if (!in
->is_file()) {
15232 if (likely(!(in
->mode
& (S_ISUID
|S_ISGID
)))) {
15236 if (perms
.uid() == 0 || perms
.uid() == in
->uid
) {
15242 // always drop the suid
15243 if (unlikely(in
->mode
& S_ISUID
)) {
15244 mask
= CEPH_SETATTR_KILL_SUID
;
15247 // remove the sgid if S_IXUGO is set or the inode is
15248 // is not in the caller's group list.
15249 if ((in
->mode
& S_ISGID
) &&
15250 ((in
->mode
& S_IXUGO
) || !perms
.gid_in_groups(in
->gid
))) {
15251 mask
|= CEPH_SETATTR_KILL_SGID
;
15254 ldout(cct
, 20) << __func__
<< " mask " << mask
<< dendl
;
15259 struct ceph_statx stx
= { 0 };
15260 return __setattrx(in
, &stx
, mask
, perms
);
15263 int Client::_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
15265 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
15267 if (offset
< 0 || length
<= 0)
15268 return -CEPHFS_EINVAL
;
15270 if (mode
& ~(FALLOC_FL_KEEP_SIZE
| FALLOC_FL_PUNCH_HOLE
))
15271 return -CEPHFS_EOPNOTSUPP
;
15273 if ((mode
& FALLOC_FL_PUNCH_HOLE
) && !(mode
& FALLOC_FL_KEEP_SIZE
))
15274 return -CEPHFS_EOPNOTSUPP
;
15276 Inode
*in
= fh
->inode
.get();
15278 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
) &&
15279 !(mode
& FALLOC_FL_PUNCH_HOLE
)) {
15280 return -CEPHFS_ENOSPC
;
15283 if (in
->snapid
!= CEPH_NOSNAP
)
15284 return -CEPHFS_EROFS
;
15286 if ((fh
->mode
& CEPH_FILE_MODE_WR
) == 0)
15287 return -CEPHFS_EBADF
;
15289 uint64_t size
= offset
+ length
;
15290 if (!(mode
& (FALLOC_FL_PUNCH_HOLE
| FALLOC_FL_KEEP_SIZE
)) &&
15292 is_quota_bytes_exceeded(in
, size
- in
->size
, fh
->actor_perms
)) {
15293 return -CEPHFS_EDQUOT
;
15297 int r
= get_caps(fh
, CEPH_CAP_FILE_WR
, CEPH_CAP_FILE_BUFFER
, &have
, -1);
15301 r
= clear_suid_sgid(in
, fh
->actor_perms
);
15303 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
15307 std::unique_ptr
<C_SaferCond
> onuninline
= nullptr;
15308 if (mode
& FALLOC_FL_PUNCH_HOLE
) {
15309 if (in
->inline_version
< CEPH_INLINE_NONE
&&
15310 (have
& CEPH_CAP_FILE_BUFFER
)) {
15312 auto inline_iter
= in
->inline_data
.cbegin();
15313 int len
= in
->inline_data
.length();
15314 if (offset
< len
) {
15316 inline_iter
.copy(offset
, bl
);
15318 if (offset
+ size
> len
)
15319 size
= len
- offset
;
15321 bl
.append_zero(size
);
15322 if (offset
+ size
< len
) {
15323 inline_iter
+= size
;
15324 inline_iter
.copy(len
- offset
- size
, bl
);
15326 in
->inline_data
= bl
;
15327 in
->inline_version
++;
15329 in
->mtime
= in
->ctime
= ceph_clock_now();
15331 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
15333 if (in
->inline_version
< CEPH_INLINE_NONE
) {
15334 onuninline
.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
15335 uninline_data(in
, onuninline
.get());
15338 C_SaferCond
onfinish("Client::_punch_hole flock");
15340 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
15342 _invalidate_inode_cache(in
, offset
, length
);
15343 filer
->zero(in
->ino
, &in
->layout
,
15344 in
->snaprealm
->get_snap_context(),
15346 ceph::real_clock::now(),
15347 0, true, &onfinish
);
15348 in
->mtime
= in
->ctime
= ceph_clock_now();
15350 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
15352 client_lock
.unlock();
15354 client_lock
.lock();
15355 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
15357 } else if (!(mode
& FALLOC_FL_KEEP_SIZE
)) {
15358 uint64_t size
= offset
+ length
;
15359 if (size
> in
->size
) {
15361 in
->mtime
= in
->ctime
= ceph_clock_now();
15363 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
15365 if (is_quota_bytes_approaching(in
, fh
->actor_perms
)) {
15366 check_caps(in
, CHECK_CAPS_NODELAY
);
15367 } else if (is_max_size_approaching(in
)) {
15373 if (nullptr != onuninline
) {
15374 client_lock
.unlock();
15375 int ret
= onuninline
->wait();
15376 client_lock
.lock();
15378 if (ret
>= 0 || ret
== -CEPHFS_ECANCELED
) {
15379 in
->inline_data
.clear();
15380 in
->inline_version
= CEPH_INLINE_NONE
;
15381 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
15387 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
15391 int Client::ll_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
15393 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15394 if (!mref_reader
.is_state_satisfied())
15395 return -CEPHFS_ENOTCONN
;
15397 ldout(cct
, 3) << __func__
<< " " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
15398 tout(cct
) << __func__
<< " " << mode
<< " " << offset
<< " " << length
<< std::endl
;
15399 tout(cct
) << (uintptr_t)fh
<< std::endl
;
15401 std::scoped_lock
lock(client_lock
);
15402 return _fallocate(fh
, mode
, offset
, length
);
15405 int Client::fallocate(int fd
, int mode
, loff_t offset
, loff_t length
)
15407 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15408 if (!mref_reader
.is_state_satisfied())
15409 return -CEPHFS_ENOTCONN
;
15411 tout(cct
) << __func__
<< " " << fd
<< mode
<< " " << offset
<< " " << length
<< std::endl
;
15413 std::scoped_lock
lock(client_lock
);
15414 Fh
*fh
= get_filehandle(fd
);
15416 return -CEPHFS_EBADF
;
15417 #if defined(__linux__) && defined(O_PATH)
15418 if (fh
->flags
& O_PATH
)
15419 return -CEPHFS_EBADF
;
15421 return _fallocate(fh
, mode
, offset
, length
);
15424 int Client::ll_release(Fh
*fh
)
15426 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15427 if (!mref_reader
.is_state_satisfied())
15428 return -CEPHFS_ENOTCONN
;
15430 ldout(cct
, 3) << __func__
<< " (fh)" << fh
<< " " << fh
->inode
->ino
<< " " <<
15432 tout(cct
) << __func__
<< " (fh)" << std::endl
;
15433 tout(cct
) << (uintptr_t)fh
<< std::endl
;
15435 std::scoped_lock
lock(client_lock
);
15437 if (ll_unclosed_fh_set
.count(fh
))
15438 ll_unclosed_fh_set
.erase(fh
);
15439 return _release_fh(fh
);
15442 int Client::ll_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
15444 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15445 if (!mref_reader
.is_state_satisfied())
15446 return -CEPHFS_ENOTCONN
;
15448 ldout(cct
, 3) << "ll_getlk (fh)" << fh
<< " " << fh
->inode
->ino
<< dendl
;
15449 tout(cct
) << "ll_getk (fh)" << (uintptr_t)fh
<< std::endl
;
15451 std::scoped_lock
lock(client_lock
);
15452 return _getlk(fh
, fl
, owner
);
15455 int Client::ll_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
15457 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15458 if (!mref_reader
.is_state_satisfied())
15459 return -CEPHFS_ENOTCONN
;
15461 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
15462 tout(cct
) << __func__
<< " (fh)" << (uintptr_t)fh
<< std::endl
;
15464 std::scoped_lock
lock(client_lock
);
15465 return _setlk(fh
, fl
, owner
, sleep
);
15468 int Client::ll_flock(Fh
*fh
, int cmd
, uint64_t owner
)
15470 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15471 if (!mref_reader
.is_state_satisfied())
15472 return -CEPHFS_ENOTCONN
;
15474 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
15475 tout(cct
) << __func__
<< " (fh)" << (uintptr_t)fh
<< std::endl
;
15477 std::scoped_lock
lock(client_lock
);
15478 return _flock(fh
, cmd
, owner
);
15481 int Client::set_deleg_timeout(uint32_t timeout
)
15483 std::scoped_lock
lock(client_lock
);
15486 * The whole point is to prevent blocklisting so we must time out the
15487 * delegation before the session autoclose timeout kicks in.
15489 if (timeout
>= mdsmap
->get_session_autoclose())
15490 return -CEPHFS_EINVAL
;
15492 deleg_timeout
= timeout
;
15496 int Client::ll_delegation(Fh
*fh
, unsigned cmd
, ceph_deleg_cb_t cb
, void *priv
)
15498 int ret
= -CEPHFS_EINVAL
;
15500 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15501 if (!mref_reader
.is_state_satisfied())
15502 return -CEPHFS_ENOTCONN
;
15504 std::scoped_lock
lock(client_lock
);
15506 Inode
*inode
= fh
->inode
.get();
15509 case CEPH_DELEGATION_NONE
:
15510 inode
->unset_deleg(fh
);
15515 ret
= inode
->set_deleg(fh
, cmd
, cb
, priv
);
15516 } catch (std::bad_alloc
&) {
15517 ret
= -CEPHFS_ENOMEM
;
15524 class C_Client_RequestInterrupt
: public Context
{
15529 C_Client_RequestInterrupt(Client
*c
, MetaRequest
*r
) : client(c
), req(r
) {
15532 void finish(int r
) override
{
15533 std::scoped_lock
l(client
->client_lock
);
15534 ceph_assert(req
->head
.op
== CEPH_MDS_OP_SETFILELOCK
);
15535 client
->_interrupt_filelock(req
);
15536 client
->put_request(req
);
15540 void Client::ll_interrupt(void *d
)
15542 MetaRequest
*req
= static_cast<MetaRequest
*>(d
);
15543 ldout(cct
, 3) << __func__
<< " tid " << req
->get_tid() << dendl
;
15544 tout(cct
) << __func__
<< " tid " << req
->get_tid() << std::endl
;
15545 interrupt_finisher
.queue(new C_Client_RequestInterrupt(this, req
));
15548 // =========================================
15551 // expose file layouts
15553 int Client::describe_layout(const char *relpath
, file_layout_t
*lp
,
15554 const UserPerm
& perms
)
15556 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15557 if (!mref_reader
.is_state_satisfied())
15558 return -CEPHFS_ENOTCONN
;
15560 std::scoped_lock
lock(client_lock
);
15562 filepath
path(relpath
);
15564 int r
= path_walk(path
, &in
, perms
);
15570 ldout(cct
, 3) << __func__
<< "(" << relpath
<< ") = 0" << dendl
;
15574 int Client::fdescribe_layout(int fd
, file_layout_t
*lp
)
15576 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15577 if (!mref_reader
.is_state_satisfied())
15578 return -CEPHFS_ENOTCONN
;
15580 std::scoped_lock
lock(client_lock
);
15582 Fh
*f
= get_filehandle(fd
);
15584 return -CEPHFS_EBADF
;
15585 Inode
*in
= f
->inode
.get();
15589 ldout(cct
, 3) << __func__
<< "(" << fd
<< ") = 0" << dendl
;
15593 int64_t Client::get_default_pool_id()
15595 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15596 if (!mref_reader
.is_state_satisfied())
15597 return -CEPHFS_ENOTCONN
;
15599 std::scoped_lock
lock(client_lock
);
15601 /* first data pool is the default */
15602 return mdsmap
->get_first_data_pool();
15607 int64_t Client::get_pool_id(const char *pool_name
)
15609 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15610 if (!mref_reader
.is_state_satisfied())
15611 return -CEPHFS_ENOTCONN
;
15613 std::scoped_lock
lock(client_lock
);
15615 return objecter
->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name
),
15619 string
Client::get_pool_name(int64_t pool
)
15621 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15622 if (!mref_reader
.is_state_satisfied())
15625 std::scoped_lock
lock(client_lock
);
15627 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
15628 return o
.have_pg_pool(pool
) ? o
.get_pool_name(pool
) : string();
15632 int Client::get_pool_replication(int64_t pool
)
15634 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15635 if (!mref_reader
.is_state_satisfied())
15636 return -CEPHFS_ENOTCONN
;
15638 std::scoped_lock
lock(client_lock
);
15640 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
15641 return o
.have_pg_pool(pool
) ? o
.get_pg_pool(pool
)->get_size() : -CEPHFS_ENOENT
;
15645 int Client::get_file_extent_osds(int fd
, loff_t off
, loff_t
*len
, vector
<int>& osds
)
15647 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15648 if (!mref_reader
.is_state_satisfied())
15649 return -CEPHFS_ENOTCONN
;
15651 std::scoped_lock
lock(client_lock
);
15653 Fh
*f
= get_filehandle(fd
);
15655 return -CEPHFS_EBADF
;
15656 Inode
*in
= f
->inode
.get();
15658 vector
<ObjectExtent
> extents
;
15659 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, 1, in
->truncate_size
, extents
);
15660 ceph_assert(extents
.size() == 1);
15662 objecter
->with_osdmap([&](const OSDMap
& o
) {
15663 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
15664 o
.pg_to_acting_osds(pg
, osds
);
15668 return -CEPHFS_EINVAL
;
15671 * Return the remainder of the extent (stripe unit)
15673 * If length = 1 is passed to Striper::file_to_extents we get a single
15674 * extent back, but its length is one so we still need to compute the length
15675 * to the end of the stripe unit.
15677 * If length = su then we may get 1 or 2 objects back in the extents vector
15678 * which would have to be examined. Even then, the offsets are local to the
15679 * object, so matching up to the file offset is extra work.
15681 * It seems simpler to stick with length = 1 and manually compute the
15685 uint64_t su
= in
->layout
.stripe_unit
;
15686 *len
= su
- (off
% su
);
15692 int Client::get_osd_crush_location(int id
, vector
<pair
<string
, string
> >& path
)
15694 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15695 if (!mref_reader
.is_state_satisfied())
15696 return -CEPHFS_ENOTCONN
;
15698 std::scoped_lock
lock(client_lock
);
15701 return -CEPHFS_EINVAL
;
15702 return objecter
->with_osdmap([&](const OSDMap
& o
) {
15703 return o
.crush
->get_full_location_ordered(id
, path
);
15707 int Client::get_file_stripe_address(int fd
, loff_t offset
,
15708 vector
<entity_addr_t
>& address
)
15710 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15711 if (!mref_reader
.is_state_satisfied())
15712 return -CEPHFS_ENOTCONN
;
15714 std::scoped_lock
lock(client_lock
);
15716 Fh
*f
= get_filehandle(fd
);
15718 return -CEPHFS_EBADF
;
15719 Inode
*in
= f
->inode
.get();
15722 vector
<ObjectExtent
> extents
;
15723 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, 1,
15724 in
->truncate_size
, extents
);
15725 ceph_assert(extents
.size() == 1);
15727 // now we have the object and its 'layout'
15728 return objecter
->with_osdmap([&](const OSDMap
& o
) {
15729 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
15731 o
.pg_to_acting_osds(pg
, osds
);
15733 return -CEPHFS_EINVAL
;
15734 for (unsigned i
= 0; i
< osds
.size(); i
++) {
15735 entity_addr_t addr
= o
.get_addrs(osds
[i
]).front();
15736 address
.push_back(addr
);
15742 int Client::get_osd_addr(int osd
, entity_addr_t
& addr
)
15744 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15745 if (!mref_reader
.is_state_satisfied())
15746 return -CEPHFS_ENOTCONN
;
15748 std::scoped_lock
lock(client_lock
);
15750 return objecter
->with_osdmap([&](const OSDMap
& o
) {
15751 if (!o
.exists(osd
))
15752 return -CEPHFS_ENOENT
;
15754 addr
= o
.get_addrs(osd
).front();
15759 int Client::enumerate_layout(int fd
, vector
<ObjectExtent
>& result
,
15760 loff_t length
, loff_t offset
)
15762 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15763 if (!mref_reader
.is_state_satisfied())
15764 return -CEPHFS_ENOTCONN
;
15766 std::scoped_lock
lock(client_lock
);
15768 Fh
*f
= get_filehandle(fd
);
15770 return -CEPHFS_EBADF
;
15771 Inode
*in
= f
->inode
.get();
15773 // map to a list of extents
15774 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, length
, in
->truncate_size
, result
);
15776 ldout(cct
, 3) << __func__
<< "(" << fd
<< ", " << length
<< ", " << offset
<< ") = 0" << dendl
;
15781 /* find an osd with the same ip. -CEPHFS_ENXIO if none. */
15782 int Client::get_local_osd()
15784 RWRef_t
mref_reader(mount_state
, CLIENT_MOUNTING
);
15785 if (!mref_reader
.is_state_satisfied())
15786 return -CEPHFS_ENOTCONN
;
15788 std::scoped_lock
lock(client_lock
);
15790 objecter
->with_osdmap([this](const OSDMap
& o
) {
15791 if (o
.get_epoch() != local_osd_epoch
) {
15792 local_osd
= o
.find_osd_on_ip(messenger
->get_myaddrs().front());
15793 local_osd_epoch
= o
.get_epoch();
15804 // ===============================
15806 void Client::ms_handle_connect(Connection
*con
)
15808 ldout(cct
, 10) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
15811 bool Client::ms_handle_reset(Connection
*con
)
15813 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
15817 void Client::ms_handle_remote_reset(Connection
*con
)
15819 std::scoped_lock
lock(client_lock
);
15820 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
15821 switch (con
->get_peer_type()) {
15822 case CEPH_ENTITY_TYPE_MDS
:
15824 // kludge to figure out which mds this is; fixme with a Connection* state
15825 mds_rank_t mds
= MDS_RANK_NONE
;
15826 MetaSessionRef s
= NULL
;
15827 for (auto &p
: mds_sessions
) {
15828 if (mdsmap
->have_inst(p
.first
) && mdsmap
->get_addrs(p
.first
) == con
->get_peer_addrs()) {
15834 ceph_assert(s
!= NULL
);
15835 switch (s
->state
) {
15836 case MetaSession::STATE_CLOSING
:
15837 ldout(cct
, 1) << "reset from mds we were closing; we'll call that closed" << dendl
;
15838 _closed_mds_session(s
.get());
15841 case MetaSession::STATE_OPENING
:
15843 ldout(cct
, 1) << "reset from mds we were opening; retrying" << dendl
;
15844 list
<Context
*> waiters
;
15845 waiters
.swap(s
->waiting_for_open
);
15846 _closed_mds_session(s
.get());
15847 auto news
= _get_or_open_mds_session(mds
);
15848 news
->waiting_for_open
.swap(waiters
);
15852 case MetaSession::STATE_OPEN
:
15854 objecter
->maybe_request_map(); /* to check if we are blocklisted */
15855 if (cct
->_conf
.get_val
<bool>("client_reconnect_stale")) {
15856 ldout(cct
, 1) << "reset from mds we were open; close mds session for reconnect" << dendl
;
15857 _closed_mds_session(s
.get());
15859 ldout(cct
, 1) << "reset from mds we were open; mark session as stale" << dendl
;
15860 s
->state
= MetaSession::STATE_STALE
;
15865 case MetaSession::STATE_NEW
:
15866 case MetaSession::STATE_CLOSED
:
15876 bool Client::ms_handle_refused(Connection
*con
)
15878 ldout(cct
, 1) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
15882 Inode
*Client::get_quota_root(Inode
*in
, const UserPerm
& perms
, quota_max_t type
)
15884 Inode
*quota_in
= root_ancestor
;
15885 SnapRealm
*realm
= in
->snaprealm
;
15887 if (!cct
->_conf
.get_val
<bool>("client_quota"))
15891 ldout(cct
, 10) << __func__
<< " realm " << realm
->ino
<< dendl
;
15892 if (realm
->ino
!= in
->ino
) {
15893 auto p
= inode_map
.find(vinodeno_t(realm
->ino
, CEPH_NOSNAP
));
15894 if (p
== inode_map
.end())
15897 if (p
->second
->quota
.is_enabled(type
)) {
15898 quota_in
= p
->second
;
15902 realm
= realm
->pparent
;
15904 ldout(cct
, 10) << __func__
<< " " << in
->vino() << " -> " << quota_in
->vino() << dendl
;
15909 * Traverse quota ancestors of the Inode, return true
15910 * if any of them passes the passed function
15912 bool Client::check_quota_condition(Inode
*in
, const UserPerm
& perms
,
15913 std::function
<bool (const Inode
&in
)> test
)
15915 if (!cct
->_conf
.get_val
<bool>("client_quota"))
15919 ceph_assert(in
!= NULL
);
15924 if (in
== root_ancestor
) {
15925 // We're done traversing, drop out
15928 // Continue up the tree
15929 in
= get_quota_root(in
, perms
);
15936 bool Client::is_quota_files_exceeded(Inode
*in
, const UserPerm
& perms
)
15938 return check_quota_condition(in
, perms
,
15939 [](const Inode
&in
) {
15940 return in
.quota
.max_files
&& in
.rstat
.rsize() >= in
.quota
.max_files
;
15944 bool Client::is_quota_bytes_exceeded(Inode
*in
, int64_t new_bytes
,
15945 const UserPerm
& perms
)
15947 return check_quota_condition(in
, perms
,
15948 [&new_bytes
](const Inode
&in
) {
15949 return in
.quota
.max_bytes
&& (in
.rstat
.rbytes
+ new_bytes
)
15950 > in
.quota
.max_bytes
;
15954 bool Client::is_quota_bytes_approaching(Inode
*in
, const UserPerm
& perms
)
15956 ceph_assert(in
->size
>= in
->reported_size
);
15957 const uint64_t size
= in
->size
- in
->reported_size
;
15958 return check_quota_condition(in
, perms
,
15959 [&size
](const Inode
&in
) {
15960 if (in
.quota
.max_bytes
) {
15961 if (in
.rstat
.rbytes
>= in
.quota
.max_bytes
) {
15965 const uint64_t space
= in
.quota
.max_bytes
- in
.rstat
.rbytes
;
15966 return (space
>> 4) < size
;
15980 int Client::check_pool_perm(Inode
*in
, int need
)
15982 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
15984 if (!cct
->_conf
->client_check_pool_perm
)
15987 /* Only need to do this for regular files */
15988 if (!in
->is_file())
15991 int64_t pool_id
= in
->layout
.pool_id
;
15992 std::string pool_ns
= in
->layout
.pool_ns
;
15993 std::pair
<int64_t, std::string
> perm_key(pool_id
, pool_ns
);
15996 auto it
= pool_perms
.find(perm_key
);
15997 if (it
== pool_perms
.end())
15999 if (it
->second
== POOL_CHECKING
) {
16000 // avoid concurrent checkings
16001 wait_on_list(waiting_for_pool_perm
);
16004 ceph_assert(have
& POOL_CHECKED
);
16010 if (in
->snapid
!= CEPH_NOSNAP
) {
16011 // pool permission check needs to write to the first object. But for snapshot,
16012 // head of the first object may have already been deleted. To avoid creating
16013 // orphan object, skip the check for now.
16017 pool_perms
[perm_key
] = POOL_CHECKING
;
16020 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (unsigned long long)in
->ino
);
16021 object_t oid
= oid_buf
;
16023 SnapContext nullsnapc
;
16025 C_SaferCond rd_cond
;
16026 ObjectOperation rd_op
;
16027 rd_op
.stat(nullptr, nullptr, nullptr);
16029 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), rd_op
,
16030 nullsnapc
, ceph::real_clock::now(), 0, &rd_cond
);
16032 C_SaferCond wr_cond
;
16033 ObjectOperation wr_op
;
16034 wr_op
.create(true);
16036 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), wr_op
,
16037 nullsnapc
, ceph::real_clock::now(), 0, &wr_cond
);
16039 client_lock
.unlock();
16040 int rd_ret
= rd_cond
.wait();
16041 int wr_ret
= wr_cond
.wait();
16042 client_lock
.lock();
16044 bool errored
= false;
16046 if (rd_ret
== 0 || rd_ret
== -CEPHFS_ENOENT
)
16048 else if (rd_ret
!= -CEPHFS_EPERM
) {
16049 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
16050 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
16054 if (wr_ret
== 0 || wr_ret
== -CEPHFS_EEXIST
)
16055 have
|= POOL_WRITE
;
16056 else if (wr_ret
!= -CEPHFS_EPERM
) {
16057 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
16058 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
16063 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
16064 // Raise EIO because actual error code might be misleading for
16065 // userspace filesystem user.
16066 pool_perms
.erase(perm_key
);
16067 signal_cond_list(waiting_for_pool_perm
);
16068 return -CEPHFS_EIO
;
16071 pool_perms
[perm_key
] = have
| POOL_CHECKED
;
16072 signal_cond_list(waiting_for_pool_perm
);
16075 if ((need
& CEPH_CAP_FILE_RD
) && !(have
& POOL_READ
)) {
16076 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
16077 << " need " << ccap_string(need
) << ", but no read perm" << dendl
;
16078 return -CEPHFS_EPERM
;
16080 if ((need
& CEPH_CAP_FILE_WR
) && !(have
& POOL_WRITE
)) {
16081 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
16082 << " need " << ccap_string(need
) << ", but no write perm" << dendl
;
16083 return -CEPHFS_EPERM
;
16089 int Client::_posix_acl_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
16091 if (acl_type
== POSIX_ACL
) {
16092 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
16093 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
16095 return posix_acl_permits(access_acl
, in
->uid
, in
->gid
, perms
, want
);
16098 return -CEPHFS_EAGAIN
;
16101 int Client::_posix_acl_chmod(Inode
*in
, mode_t mode
, const UserPerm
& perms
)
16103 if (acl_type
== NO_ACL
)
16106 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
16110 if (acl_type
== POSIX_ACL
) {
16111 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
16112 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
16113 bufferptr
acl(access_acl
.c_str(), access_acl
.length());
16114 r
= posix_acl_access_chmod(acl
, mode
);
16117 r
= _do_setxattr(in
, ACL_EA_ACCESS
, acl
.c_str(), acl
.length(), 0, perms
);
16123 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< " result=" << r
<< dendl
;
16127 int Client::_posix_acl_create(Inode
*dir
, mode_t
*mode
, bufferlist
& xattrs_bl
,
16128 const UserPerm
& perms
)
16130 if (acl_type
== NO_ACL
)
16133 if (S_ISLNK(*mode
))
16136 int r
= _getattr(dir
, CEPH_STAT_CAP_XATTR
, perms
, dir
->xattr_version
== 0);
16140 if (acl_type
== POSIX_ACL
) {
16141 if (dir
->xattrs
.count(ACL_EA_DEFAULT
)) {
16142 map
<string
, bufferptr
> xattrs
;
16144 const bufferptr
& default_acl
= dir
->xattrs
[ACL_EA_DEFAULT
];
16145 bufferptr
acl(default_acl
.c_str(), default_acl
.length());
16146 r
= posix_acl_inherit_mode(acl
, mode
);
16151 r
= posix_acl_equiv_mode(acl
.c_str(), acl
.length(), mode
);
16155 xattrs
[ACL_EA_ACCESS
] = acl
;
16158 if (S_ISDIR(*mode
))
16159 xattrs
[ACL_EA_DEFAULT
] = dir
->xattrs
[ACL_EA_DEFAULT
];
16163 encode(xattrs
, xattrs_bl
);
16166 *mode
&= ~umask_cb(callback_handle
);
16171 ldout(cct
, 10) << __func__
<< " dir ino " << dir
->ino
<< " result=" << r
<< dendl
;
16175 void Client::set_filer_flags(int flags
)
16177 std::scoped_lock
l(client_lock
);
16178 ceph_assert(flags
== 0 ||
16179 flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
16180 objecter
->add_global_op_flags(flags
);
16183 void Client::clear_filer_flags(int flags
)
16185 std::scoped_lock
l(client_lock
);
16186 ceph_assert(flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
16187 objecter
->clear_global_op_flag(flags
);
16190 // called before mount
16191 void Client::set_uuid(const std::string
& uuid
)
16193 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
16194 ceph_assert(iref_reader
.is_state_satisfied());
16196 std::scoped_lock
l(client_lock
);
16197 ceph_assert(!uuid
.empty());
16199 metadata
["uuid"] = uuid
;
16203 // called before mount. 0 means infinite
16204 void Client::set_session_timeout(unsigned timeout
)
16206 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
16207 ceph_assert(iref_reader
.is_state_satisfied());
16209 std::scoped_lock
l(client_lock
);
16211 metadata
["timeout"] = stringify(timeout
);
16214 // called before mount
16215 int Client::start_reclaim(const std::string
& uuid
, unsigned flags
,
16216 const std::string
& fs_name
)
16218 RWRef_t
iref_reader(initialize_state
, CLIENT_INITIALIZED
);
16219 if (!iref_reader
.is_state_satisfied())
16220 return -CEPHFS_ENOTCONN
;
16223 return -CEPHFS_EINVAL
;
16225 std::unique_lock
l(client_lock
);
16227 auto it
= metadata
.find("uuid");
16228 if (it
!= metadata
.end() && it
->second
== uuid
)
16229 return -CEPHFS_EINVAL
;
16232 int r
= subscribe_mdsmap(fs_name
);
16234 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
16238 if (metadata
.empty())
16239 populate_metadata("");
16241 while (mdsmap
->get_epoch() == 0)
16242 wait_on_list(waiting_for_mdsmap
);
16245 for (unsigned mds
= 0; mds
< mdsmap
->get_num_in_mds(); ) {
16246 if (!mdsmap
->is_up(mds
)) {
16247 ldout(cct
, 10) << "mds." << mds
<< " not active, waiting for new mdsmap" << dendl
;
16248 wait_on_list(waiting_for_mdsmap
);
16252 MetaSessionRef session
;
16253 if (!have_open_session(mds
)) {
16254 session
= _get_or_open_mds_session(mds
);
16255 if (session
->state
== MetaSession::STATE_REJECTED
)
16256 return -CEPHFS_EPERM
;
16257 if (session
->state
!= MetaSession::STATE_OPENING
) {
16259 return -CEPHFS_EINVAL
;
16261 ldout(cct
, 10) << "waiting for session to mds." << mds
<< " to open" << dendl
;
16262 wait_on_context_list(session
->waiting_for_open
);
16266 session
= mds_sessions
.at(mds
);
16267 if (!session
->mds_features
.test(CEPHFS_FEATURE_RECLAIM_CLIENT
))
16268 return -CEPHFS_EOPNOTSUPP
;
16270 if (session
->reclaim_state
== MetaSession::RECLAIM_NULL
||
16271 session
->reclaim_state
== MetaSession::RECLAIMING
) {
16272 session
->reclaim_state
= MetaSession::RECLAIMING
;
16273 auto m
= make_message
<MClientReclaim
>(uuid
, flags
);
16274 session
->con
->send_message2(std::move(m
));
16275 wait_on_list(waiting_for_reclaim
);
16276 } else if (session
->reclaim_state
== MetaSession::RECLAIM_FAIL
) {
16277 return reclaim_errno
? : -CEPHFS_ENOTRECOVERABLE
;
16283 // didn't find target session in any mds
16284 if (reclaim_target_addrs
.empty()) {
16285 if (flags
& CEPH_RECLAIM_RESET
)
16286 return -CEPHFS_ENOENT
;
16287 return -CEPHFS_ENOTRECOVERABLE
;
16290 if (flags
& CEPH_RECLAIM_RESET
)
16293 // use blocklist to check if target session was killed
16294 // (config option mds_session_blocklist_on_evict needs to be true)
16295 ldout(cct
, 10) << __func__
<< ": waiting for OSD epoch " << reclaim_osd_epoch
<< dendl
;
16298 objecter
->wait_for_map(reclaim_osd_epoch
, ca::use_blocked
[ec
]);
16302 return ceph::from_error_code(ec
);
16304 bool blocklisted
= objecter
->with_osdmap(
16305 [this](const OSDMap
&osd_map
) -> bool {
16306 return osd_map
.is_blocklisted(reclaim_target_addrs
);
16309 return -CEPHFS_ENOTRECOVERABLE
;
16311 metadata
["reclaiming_uuid"] = uuid
;
16315 void Client::finish_reclaim()
16317 auto it
= metadata
.find("reclaiming_uuid");
16318 if (it
== metadata
.end()) {
16319 for (auto &p
: mds_sessions
)
16320 p
.second
->reclaim_state
= MetaSession::RECLAIM_NULL
;
16324 for (auto &p
: mds_sessions
) {
16325 p
.second
->reclaim_state
= MetaSession::RECLAIM_NULL
;
16326 auto m
= make_message
<MClientReclaim
>("", MClientReclaim::FLAG_FINISH
);
16327 p
.second
->con
->send_message2(std::move(m
));
16330 metadata
["uuid"] = it
->second
;
16331 metadata
.erase(it
);
16334 void Client::handle_client_reclaim_reply(const MConstRef
<MClientReclaimReply
>& reply
)
16336 mds_rank_t from
= mds_rank_t(reply
->get_source().num());
16337 ldout(cct
, 10) << __func__
<< " " << *reply
<< " from mds." << from
<< dendl
;
16339 std::scoped_lock
cl(client_lock
);
16340 auto session
= _get_mds_session(from
, reply
->get_connection().get());
16342 ldout(cct
, 10) << " discarding reclaim reply from sessionless mds." << from
<< dendl
;
16346 if (reply
->get_result() >= 0) {
16347 session
->reclaim_state
= MetaSession::RECLAIM_OK
;
16348 if (reply
->get_epoch() > reclaim_osd_epoch
)
16349 reclaim_osd_epoch
= reply
->get_epoch();
16350 if (!reply
->get_addrs().empty())
16351 reclaim_target_addrs
= reply
->get_addrs();
16353 session
->reclaim_state
= MetaSession::RECLAIM_FAIL
;
16354 reclaim_errno
= reply
->get_result();
16357 signal_cond_list(waiting_for_reclaim
);
16361 * This is included in cap release messages, to cause
16362 * the MDS to wait until this OSD map epoch. It is necessary
16363 * in corner cases where we cancel RADOS ops, so that
16364 * nobody else tries to do IO to the same objects in
16365 * the same epoch as the cancelled ops.
16367 void Client::set_cap_epoch_barrier(epoch_t e
)
16369 ldout(cct
, 5) << __func__
<< " epoch = " << e
<< dendl
;
16370 cap_epoch_barrier
= e
;
16373 const char** Client::get_tracked_conf_keys() const
16375 static const char* keys
[] = {
16376 "client_cache_size",
16377 "client_cache_mid",
16379 "client_deleg_timeout",
16380 "client_deleg_break_on_open",
16382 "client_oc_max_objects",
16383 "client_oc_max_dirty",
16384 "client_oc_target_dirty",
16385 "client_oc_max_dirty_age",
16386 "client_caps_release_delay",
16387 "client_mount_timeout",
16393 void Client::handle_conf_change(const ConfigProxy
& conf
,
16394 const std::set
<std::string
> &changed
)
16396 std::scoped_lock
lock(client_lock
);
16398 if (changed
.count("client_cache_mid")) {
16399 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
16401 if (changed
.count("client_acl_type")) {
16403 if (cct
->_conf
->client_acl_type
== "posix_acl")
16404 acl_type
= POSIX_ACL
;
16406 if (changed
.count("client_oc_size")) {
16407 objectcacher
->set_max_size(cct
->_conf
->client_oc_size
);
16409 if (changed
.count("client_oc_max_objects")) {
16410 objectcacher
->set_max_objects(cct
->_conf
->client_oc_max_objects
);
16412 if (changed
.count("client_oc_max_dirty")) {
16413 objectcacher
->set_max_dirty(cct
->_conf
->client_oc_max_dirty
);
16415 if (changed
.count("client_oc_target_dirty")) {
16416 objectcacher
->set_target_dirty(cct
->_conf
->client_oc_target_dirty
);
16418 if (changed
.count("client_oc_max_dirty_age")) {
16419 objectcacher
->set_max_dirty_age(cct
->_conf
->client_oc_max_dirty_age
);
16421 if (changed
.count("client_collect_and_send_global_metrics")) {
16422 _collect_and_send_global_metrics
= cct
->_conf
.get_val
<bool>(
16423 "client_collect_and_send_global_metrics");
16425 if (changed
.count("client_caps_release_delay")) {
16426 caps_release_delay
= cct
->_conf
.get_val
<std::chrono::seconds
>(
16427 "client_caps_release_delay");
16429 if (changed
.count("client_mount_timeout")) {
16430 mount_timeout
= cct
->_conf
.get_val
<std::chrono::seconds
>(
16431 "client_mount_timeout");
16435 void intrusive_ptr_add_ref(Inode
*in
)
16440 void intrusive_ptr_release(Inode
*in
)
16442 in
->client
->put_inode(in
);
16445 mds_rank_t
Client::_get_random_up_mds() const
16447 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
16449 std::set
<mds_rank_t
> up
;
16450 mdsmap
->get_up_mds_set(up
);
16453 return MDS_RANK_NONE
;
16454 std::set
<mds_rank_t
>::const_iterator p
= up
.begin();
16455 for (int n
= rand() % up
.size(); n
; n
--)
16461 StandaloneClient::StandaloneClient(Messenger
*m
, MonClient
*mc
,
16462 boost::asio::io_context
& ictx
)
16463 : Client(m
, mc
, new Objecter(m
->cct
, m
, mc
, ictx
))
16465 monclient
->set_messenger(m
);
16466 objecter
->set_client_incarnation(0);
16469 StandaloneClient::~StandaloneClient()
16472 objecter
= nullptr;
16475 int StandaloneClient::init()
16477 RWRef_t
iref_writer(initialize_state
, CLIENT_INITIALIZING
, false);
16478 ceph_assert(iref_writer
.is_first_writer());
16483 client_lock
.lock();
16485 messenger
->add_dispatcher_tail(objecter
);
16486 messenger
->add_dispatcher_tail(this);
16488 monclient
->set_want_keys(CEPH_ENTITY_TYPE_MDS
| CEPH_ENTITY_TYPE_OSD
);
16489 int r
= monclient
->init();
16491 // need to do cleanup because we're in an intermediate init state
16493 std::scoped_lock
l(timer_lock
);
16497 client_lock
.unlock();
16498 objecter
->shutdown();
16499 objectcacher
->stop();
16500 monclient
->shutdown();
16505 client_lock
.unlock();
16507 iref_writer
.update_state(CLIENT_INITIALIZED
);
16512 void StandaloneClient::shutdown()
16514 Client::shutdown();
16515 objecter
->shutdown();
16516 monclient
->shutdown();