// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/utsname.h>

#include <boost/lexical_cast.hpp>
#include <boost/fusion/include/std_pair.hpp>

#if defined(__FreeBSD__)
#define XATTR_CREATE  0x1
#define XATTR_REPLACE 0x2
#else
#include <sys/xattr.h>
#endif

#if defined(__linux__)
#include <linux/falloc.h>
#endif

#include <sys/statvfs.h>
#include "common/config.h"
#include "common/version.h"

#include "mon/MonClient.h"

#include "messages/MClientCaps.h"
#include "messages/MClientLease.h"
#include "messages/MClientQuota.h"
#include "messages/MClientReclaim.h"
#include "messages/MClientReclaimReply.h"
#include "messages/MClientReconnect.h"
#include "messages/MClientReply.h"
#include "messages/MClientRequest.h"
#include "messages/MClientRequestForward.h"
#include "messages/MClientSession.h"
#include "messages/MClientSnap.h"
#include "messages/MCommandReply.h"
#include "messages/MFSMap.h"
#include "messages/MFSMapUser.h"
#include "messages/MMDSMap.h"
#include "messages/MOSDMap.h"

#include "mds/flock.h"
#include "mds/cephfs_features.h"
#include "osd/OSDMap.h"
#include "osdc/Filer.h"

#include "common/Cond.h"
#include "common/perf_counters.h"
#include "common/admin_socket.h"
#include "common/errno.h"
#include "include/str_list.h"

#define dout_subsys ceph_subsys_client

#include "include/lru.h"
#include "include/compat.h"
#include "include/stringify.h"

#include "Client.h"
#include "Inode.h"
#include "Dentry.h"
#include "Dir.h"
#include "Delegation.h"
#include "ClientSnapRealm.h"
#include "Fh.h"
#include "MetaSession.h"
#include "MetaRequest.h"
#include "ObjecterWriteback.h"
#include "posix_acl.h"

#include "include/ceph_assert.h"
#include "include/stat.h"

#include "include/cephfs/ceph_statx.h"
#if HAVE_GETGROUPLIST
#include <grp.h>
#include <pwd.h>
#include <unistd.h>
#endif

#undef dout_prefix
#define dout_prefix *_dout << "client." << whoami << " "

#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
// FreeBSD fails to define this
// Darwin fails to define this

#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
{
  Client *client = static_cast<Client*>(p);
  client->flush_set_callback(oset);
}
Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}

int Client::CommandHook::call(
  std::string_view command,
  const cmdmap_t& cmdmap,
  Formatter *f,
  std::ostream& errss,
  bufferlist& out)
{
  f->open_object_section("result");
  {
    std::lock_guard l{m_client->client_lock};
    if (command == "mds_requests")
      m_client->dump_mds_requests(f);
    else if (command == "mds_sessions")
      m_client->dump_mds_sessions(f);
    else if (command == "dump_cache")
      m_client->dump_cache(f);
    else if (command == "kick_stale_sessions")
      m_client->_kick_stale_sessions();
    else if (command == "status")
      m_client->dump_status(f);
    else
      ceph_abort_msg("bad command registered");
  }
  f->close_section();
  return 0;
}
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }
void Client::_reset_faked_inos()
{
  ino_t start = 1024;
  free_faked_inos.clear();
  free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
  last_used_faked_ino = 0;
  last_used_faked_root = 0;
  _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
}
void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
/*
 * In the faked mode, if you export multiple subdirectories,
 * you will see that the inode numbers of the exported subdirectories
 * are the same. So we distinguish the mount points by reserving
 * the "fake ids" in the range 1024~2048 and combining them with the
 * last 10 bits (0x3ff) of the "root inodes".
 */
void Client::_assign_faked_root(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  if (it == free_faked_inos.end() && last_used_faked_root > 0) {
    last_used_faked_root = 0;
    it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  vinodeno_t inode_info = in->vino();
  uint64_t inode_num = (uint64_t)inode_info.ino;
  ldout(cct, 10) << "inode_num " << inode_num << " inode_num & 0x3ff=" << (inode_num & 0x3ff) << dendl;
  last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
  ceph_assert(it.get_start() + it.get_len() > last_used_faked_root);

  in->faked_ino = last_used_faked_root;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
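// Illustrative example (not from the original source): if an exported
// subdirectory's real root inode number ends in 0x25d, its fake root becomes
// it.get_start() + 0x25d, an id inside the reserved 1024~2048 window, so two
// mounts whose root inos differ in their low 10 bits get distinct fake roots
// even though their children are numbered from the same shared free pool.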
void Client::_release_faked_ino(Inode *in)
{
  free_faked_inos.insert(in->faked_ino);
  faked_ino_map.erase(in->faked_ino);
}
vinodeno_t Client::_map_faked_ino(ino_t ino)
{
  vinodeno_t vino;
  if (ino == 1)
    vino = root->vino();
  else if (faked_ino_map.count(ino))
    vino = faked_ino_map[ino];
  else
    vino = vinodeno_t(0, CEPH_NOSNAP);
  ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
  return vino;
}
vinodeno_t Client::map_faked_ino(ino_t ino)
{
  std::lock_guard lock(client_lock);
  return _map_faked_ino(ino);
}
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    timer(m->cct, client_lock),
    whoami(mc->get_global_id()),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this)
{
  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;
  fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
                                                &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
                                      client_flush_set_callback,    // all commit callback
                                      (void*)this,
                                      cct->_conf->client_oc_size,
                                      cct->_conf->client_oc_max_objects,
                                      cct->_conf->client_oc_max_dirty,
                                      cct->_conf->client_oc_target_dirty,
                                      cct->_conf->client_oc_max_dirty_age,
                                      true));
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blacklist_events();
}
Client::~Client()
{
  ceph_assert(ceph_mutex_is_not_locked(client_lock));

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  std::lock_guard l{client_lock};
  tear_down_cache();
}
void Client::tear_down_cache()
{
  // fd's
  for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
       it != fd_map.end();
       ++it) {
    Fh *fh = it->second;
    ldout(cct, 1) << __func__ << " forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // empty lru
  ceph_assert(lru.lru_get_size() == 0);

  // close root ino
  ceph_assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    // drop the root and its reserved parent inodes
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
  }

  ceph_assert(inode_map.empty());
}
inodeno_t Client::get_root_ino()
{
  std::lock_guard l(client_lock);
  if (use_faked_inos())
    return root->faked_ino;
  else
    return root->ino;
}

Inode *Client::get_root()
{
  std::lock_guard l(client_lock);
  root->ll_get();
  return root;
}
void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
{
  filepath path;
  in->make_long_path(path);
  ldout(cct, 1) << "dump_inode: "
                << (disconnected ? "DISCONNECTED " : "")
                << "inode " << in->ino
                << " " << path
                << " ref " << in->get_num_ref()
                << dendl;

  if (f) {
    f->open_object_section("inode");
    f->dump_stream("path") << path;
    if (disconnected)
      f->dump_int("disconnected", 1);
    in->dump(f);
    f->close_section();
  }

  did.insert(in);
  if (in->dir) {
    ldout(cct, 1) << "  dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
    for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
         it != in->dir->dentries.end();
         ++it) {
      ldout(cct, 1) << "   " << in->ino << " dn " << it->first << " " << it->second
                    << " ref " << it->second->ref << dendl;
      if (f) {
        f->open_object_section("dentry");
        it->second->dump(f);
        f->close_section();
      }
      if (it->second->inode)
        dump_inode(f, it->second->inode.get(), did, false);
    }
  }
}
void Client::dump_cache(Formatter *f)
{
  set<Inode*> did;

  ldout(cct, 1) << __func__ << dendl;

  if (f)
    f->open_array_section("cache");

  if (root)
    dump_inode(f, root, did, true);

  // make a second pass to catch anything disconnected
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       ++it) {
    if (did.count(it->second))
      continue;
    dump_inode(f, it->second, did, true);
  }

  if (f)
    f->close_section();
}
void Client::dump_status(Formatter *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blacklisted", blacklisted);
  }
}
int Client::init()
{
  objectcacher->start();
  {
    std::lock_guard l{client_lock};
    ceph_assert(!initialized);
    messenger->add_dispatcher_tail(this);
  }
  _finish_init();
  return 0;
}
void Client::_finish_init()
{
  {
    std::lock_guard l{client_lock};
    // logger
    PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
    plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
    plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
    plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
    plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
    plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
    logger.reset(plb.create_perf_counters());
    cct->get_perfcounters_collection()->add(logger.get());
  }

  cct->_conf.add_observer(this);

  AdminSocket* admin_socket = cct->get_admin_socket();
  int ret = admin_socket->register_command("mds_requests",
                                           &m_command_hook,
                                           "show in-progress mds requests");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("mds_sessions",
                                       &m_command_hook,
                                       "show mds session state");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("dump_cache",
                                       &m_command_hook,
                                       "show in-memory metadata cache contents");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("kick_stale_sessions",
                                       &m_command_hook,
                                       "kick sessions that were remote reset");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("status",
                                       &m_command_hook,
                                       "show overall client status");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }

  std::lock_guard l{client_lock};
  initialized = true;
}
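// The commands registered above back the client's admin socket; assuming a
// default asok path, something like
//   ceph --admin-daemon /var/run/ceph/ceph-client.admin.asok mds_sessions
// routes through CommandHook::call() to dump_mds_sessions(). The exact socket
// name depends on the admin_socket and client name configuration.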
void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  {
    std::lock_guard l{client_lock};
    _close_sessions();
  }
  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.
  {
    std::lock_guard l{client_lock};
    ceph_assert(initialized);
    initialized = false;
  }

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger)
    cct->get_perfcounters_collection()->remove(logger.get());
}
// ===================
// metadata cache stuff

void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!unmounting && lru.lru_get_size() <= max)  break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 &&
      inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
  }
}
void Client::trim_cache_for_reconnect(MetaSession *s)
{
  mds_rank_t mds = s->mds_num;
  ldout(cct, 20) << __func__ << " mds." << mds << dendl;

  int trimmed = 0;
  list<Dentry*> skipped;
  while (lru.lru_get_size() > 0) {
    Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
    if (!dn)
      break;

    if ((dn->inode && dn->inode->caps.count(mds)) ||
        dn->dir->parent_inode->caps.count(mds)) {
      trim_dentry(dn);
      trimmed++;
    } else
      skipped.push_back(dn);
  }

  for (list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
    lru.lru_insert_mid(*p);

  ldout(cct, 20) << __func__ << " mds." << mds
                 << " trimmed " << trimmed << " dentries" << dendl;

  if (s->caps.size() > 0)
    _invalidate_kernel_dcache();
}
void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
                 << " in dir "
                 << std::hex << dn->dir->parent_inode->ino << std::dec
                 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    diri->dir_release_count++;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false);  // drop dir, drop dentry
}
void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
                                    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
                     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
        _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
        in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
                     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
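// Rough intuition for the checks above (informal, not authoritative): the MDS
// bumps truncate_seq on every truncate, so a stat carrying truncate_seq 5
// always replaces state cached at seq 4, while a stat with an equal seq may
// only grow the cached size (e.g. 100 -> 4096 from a racing writer) and never
// shrinks it locally.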
void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
                                    utime_t ctime, utime_t mtime, utime_t atime)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
                 << " ctime " << ctime << " mtime " << mtime << dendl;

  if (time_warp_seq > in->time_warp_seq)
    ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
                   << " is higher than local time_warp_seq "
                   << in->time_warp_seq << dendl;

  bool warn = false;
  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
                CEPH_CAP_FILE_WR|
                CEPH_CAP_FILE_BUFFER|
                CEPH_CAP_AUTH_EXCL|
                CEPH_CAP_XATTR_EXCL)) {
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      //the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      //take the highest values
      if (mtime > in->mtime)
        in->mtime = mtime;
      if (atime > in->atime)
        in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      //ignore mds values as we have a higher seq
    } else
      warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else
      warn = true;
  }
  if (warn) {
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
                  << time_warp_seq << " is lower than local time_warp_seq "
                  << in->time_warp_seq << dendl;
  }
}
void Client::_fragmap_remove_non_leaves(Inode *in)
{
  for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (!in->dirfragtree.is_leaf(p->first))
      in->fragmap.erase(p++);
    else
      ++p;
}

void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
{
  for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (p->second == mds)
      in->fragmap.erase(p++);
    else
      ++p;
}
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
                                 MetaSession *session,
                                 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      root = in;
      if (use_faked_inos())
        _assign_faked_root(root);
      root_ancestor = in;
    } else if (!mounted) {
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
                           st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;   // as with readdir returning inodes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
                   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
                   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
        (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
        (issued & CEPH_CAP_FILE_EXCL) == 0 &&
        in->dirstat.nfiles == 0 &&
        in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
        ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
                       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
        in->dir->readdir_cache.clear();
        for (const auto& p : in->dir->dentries) {
          unlink(p.second, true, true);  // keep dir, keep dentry
        }
        if (in->dir->dentries.empty())
          close_dir(in->dir);
      }
    }
  } else {
    in->snap_caps |= st->cap.caps;
  }

  return in;
}
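// Note how the updates above are grouped by the capability that protects each
// field: AUTH_SHARED guards mode/ownership/btime, LINK_SHARED guards nlink,
// the FILE/ANY_RD caps guard size and times, and XATTR_EXCL guards the xattr
// map. When this client itself holds the corresponding EXCL cap, the MDS
// values are ignored because the local dirty state is newer.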
/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
                                    Inode *in, utime_t from, MetaSession *session,
                                    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
                 << " in dir " << dir->parent_inode->vino() << " dn " << dn
                 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      ldout(cct, 12) << " had dentry " << dname
                     << " with correct vino " << dn->inode->vino()
                     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
                     << " with WRONG vino " << dn->inode->vino()
                     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
        Inode *old_diri = old_dentry->dir->parent_inode;
        old_diri->dir_ordered_count++;
        clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if it's the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  if (dlease->mask & CEPH_LEASE_VALID) {
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
                     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
}


/*
 * update MDS location cache for a single inode
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!
}
void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
{
  if (complete) {
    if (diri->flags & I_COMPLETE) {
      ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
      diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
    }
  } else {
    if (diri->flags & I_DIR_ORDERED) {
      ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
      diri->flags &= ~I_DIR_ORDERED;
    }
  }

  if (diri->dir)
    diri->dir->readdir_cache.clear();
}
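// The dir_release_count/dir_ordered_count/shared_gen counters bumped by the
// callers of this function are how open dir_result_t handles learn that their
// cached readdir state went stale: insert_readdir_results() only trusts
// dirp->cache_index while the counters saved in the dirp still match the
// inode's current values.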
/*
 * insert results from readdir or lssnap into the metadata cache.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  } else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
        last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
        /* mds understands offset_hash */
        last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
        readdir_offset = 2;
        readdir_start.clear();
        dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
                   << ", hash_order=" << hash_order
                   << ", readdir_start " << readdir_start
                   << ", last_hash " << last_hash
                   << ", next_offset " << readdir_offset << dendl;

    if (diri->snapid != CEPH_SNAPDIR &&
        fg.is_leftmost() && readdir_offset == 2 &&
        !(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i = 0; i < numdn; i++) {
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
                                   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
        Dentry *olddn = diri->dir->dentries[dname];
        if (olddn->inode != in) {
          // replace incorrect dentry
          unlink(olddn, true, true);  // keep dir, dentry
          dn = link(dir, dname, in, olddn);
          ceph_assert(dn == olddn);
        } else {
          // keep existing dentry
          dn = olddn;
        }
      } else {
        // new dentry
        dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
        unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
        if (hash != last_hash)
          readdir_offset = 2;
        last_hash = hash;
        dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
        dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }

      // add to readdir cache
      if (dirp->release_count == diri->dir_release_count &&
          dirp->ordered_count == diri->dir_ordered_count &&
          dirp->start_shared_gen == diri->shared_gen) {
        if (dirp->cache_index == dir->readdir_cache.size()) {
          if (i == 0) {
            ceph_assert(!dirp->inode->is_complete_and_ordered());
            dir->readdir_cache.reserve(dirp->cache_index + numdn);
          }
          dir->readdir_cache.push_back(dn);
        } else if (dirp->cache_index < dir->readdir_cache.size()) {
          if (dirp->inode->is_complete_and_ordered())
            ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
          else
            dir->readdir_cache[dirp->cache_index] = dn;
        } else {
          ceph_abort_msg("unexpected readdir buffer idx");
        }
        dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
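// dir_result_t::make_fpos() (used above) packs the frag -- or, in hash order,
// the dentry-name hash -- into the high bits of the returned offset and the
// per-frag position into the low bits, so a readdir position can be resumed
// across directory fragmentation changes; the exact bit layout lives with
// dir_result_t's definition.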
/*
 * insert a trace from a MDS reply into the cache.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
        // rename
        Dentry *od = request->old_dentry();
        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
        unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
                 op == CEPH_MDS_OP_UNLINK) {
        // unlink, rmdir
        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
        unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  } else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
        wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
        wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
        ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
                          request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
                            request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
        dn = diri->dir->dentries[dname];
        if (dn->inode) {
          diri->dir_ordered_count++;
          clear_dir_complete_and_ordered(diri, false);
          unlink(dn, true, true);  // keep dir, dentry
        }
      }
      if (dlease.duration_ms > 0) {
        if (!dn) {
          Dir *dir = diri->open_dir();
          dn = link(dir, dname, NULL, NULL);
        }
        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
             op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
        Dentry *dn = diri->dir->dentries[dname];
        if (dn->inode)
          unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
        op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << req->path[0]
                     << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << de->name
                     << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
          in = in->snapdir_parent.get();
        else if (!in->dentries.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed. */
          in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
                   << " hash=" << hash << dendl;

    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
        mds = in->fragmap[fg];
        if (phash_diri)
          *phash_diri = in;
      } else if (in->auth_cap) {
        mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
        ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
        goto out;
      }
    }

    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
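// Summary of the selection order above: an explicit resend_mds wins, then a
// dirfrag-hash lookup of the first path component (so the request lands on
// the MDS authoritative for that fragment), then the inode's auth cap, then
// any cap at all, and finally a random active MDS as the fallback.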
void Client::connect_mds_targets(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
  ceph_assert(mds_sessions.count(mds));
  const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
  for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
       q != info.export_targets.end();
       ++q) {
    if (mds_sessions.count(*q) == 0 &&
        mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
      ldout(cct, 10) << "check_mds_sessions opening mds." << mds
                     << " export target mds." << *q << dendl;
      _open_mds_session(*q);
    }
  }
}
void Client::dump_mds_sessions(Formatter *f)
{
  f->dump_int("id", get_nodeid().v);
  entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
  f->dump_object("inst", inst);
  f->dump_stream("inst_str") << inst;
  f->dump_stream("addr_str") << inst.addr;
  f->open_array_section("sessions");
  for (const auto &p : mds_sessions) {
    f->open_object_section("session");
    p.second.dump(f);
    f->close_section();
  }
  f->close_section();
  f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
}
void Client::dump_mds_requests(Formatter *f)
{
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    f->open_object_section("request");
    p->second->dump(f);
    f->close_section();
  }
}
int Client::verify_reply_trace(int r, MetaSession *session,
                               MetaRequest *request, const MConstRef<MClientReply>& reply,
                               InodeRef *ptarget, bool *pcreated,
                               const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
      struct openc_response_t ocres;

      decode(ocres, extra_bl);
      created_ino = ocres.created_ino;
      /*
       * The userland cephfs client doesn't have a way to do an async create
       * (yet), so just discard delegated_inos for now. Eventually we should
       * store them and use them in create calls, even if they are synchronous,
       * if only for testing purposes.
       */
      ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
    } else {
      // u64 containing number of created ino
      decode(created_ino, extra_bl);
    }
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
    got_created_ino = true;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name.  someday, do this by the
      // ino... which we know!  FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
        if (d->dir) {
          ldout(cct, 10) << "make_request got traceless reply, looking up #"
                         << d->dir->parent_inode->ino << "/" << d->name
                         << " got_ino " << got_created_ino
                         << " ino " << created_ino
                         << dendl;
          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
                         &target, perms);
        } else {
          // if the dentry is not linked, just do our best. see #5021.
          ceph_abort_msg("how did this happen?  i want logs!");
        }
      } else {
        Inode *in = request->inode();
        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
                       << in->ino << dendl;
        r = _getattr(in, request->regetattr_mask, perms, true);
        target = in;
      }
      if (r >= 0) {
        // verify ino returned in reply and trace_dist are the same
        if (got_created_ino &&
            created_ino.val != target->ino.val) {
          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
          r = -EINTR;
        }
        if (ptarget)
          ptarget->swap(target);
      }
    }
  }

  return r;
}
/**
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 * a request).
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
int Client::make_request(MetaRequest *request,
                         const UserPerm& perms,
                         InodeRef *ptarget, bool *pcreated,
                         int use_mds,
                         bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  MetaSession *session = NULL;
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    ceph::condition_variable caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
        if (hash_diri) {
          ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
          _fragmap_remove_stopped_mds(hash_diri, mds);
        } else {
          ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
          request->resend_mds = _get_random_up_mds();
        }
      } else {
        ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
        wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
        ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
        wait_on_context_list(session->waiting_for_open);
        // Abort requests on REJECT from MDS
        if (rejected_by_mds.count(mds)) {
          request->abort(-EPERM);
          break;
        }
        continue;
      }

      if (!have_open_session(mds))
        continue;
    } else {
      session = &mds_sessions.at(mds);
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    std::unique_lock l{client_lock, std::adopt_lock};
    caller_cond.wait(l, [request] {
      return (request->reply ||           // reply
              request->resend_mds >= 0 || // forward
              request->kick);
    });
    l.release();
    request->caller_cond = nullptr;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->notify_all();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, session, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);
  return r;
}
*req
)
1804 mds_requests
.erase(req
->tid
);
1805 if (req
->tid
== oldest_tid
) {
1806 map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.upper_bound(oldest_tid
);
1808 if (p
== mds_requests
.end()) {
1812 if (p
->second
->get_op() != CEPH_MDS_OP_SETFILELOCK
) {
1813 oldest_tid
= p
->first
;
1822 void Client::put_request(MetaRequest
*request
)
1824 if (request
->_put()) {
1826 if (request
->success
)
1827 op
= request
->get_op();
1829 request
->take_other_inode(&other_in
);
1833 (op
== CEPH_MDS_OP_RMDIR
||
1834 op
== CEPH_MDS_OP_RENAME
||
1835 op
== CEPH_MDS_OP_RMSNAP
)) {
1836 _try_to_trim_inode(other_in
.get(), false);
int Client::encode_inode_release(Inode *in, MetaRequest *req,
                                 mds_rank_t mds, int drop,
                                 int unless, int force)
{
  ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
                 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
                 << ", force:" << force << ")" << dendl;
  int released = 0;
  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    drop &= ~(in->dirty_caps | get_caps_used(in));
    if ((drop & cap.issued) &&
        !(unless & cap.issued)) {
      ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
      cap.issued &= ~drop;
      cap.implemented &= ~drop;
      released = 1;
    } else {
      released = force;
    }
    if (released) {
      cap.wanted = in->caps_wanted();
      if (&cap == in->auth_cap &&
          !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
        in->requested_max_size = 0;
        ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
      }
      ceph_mds_request_release rel;
      rel.ino = in->ino;
      rel.cap_id = cap.cap_id;
      rel.seq = cap.seq;
      rel.issue_seq = cap.issue_seq;
      rel.mseq = cap.mseq;
      rel.caps = cap.implemented;
      rel.wanted = cap.wanted;
      rel.dname_len = 0;
      rel.dname_seq = 0;
      req->cap_releases.push_back(MClientRequest::Release(rel, ""));
    }
  }
  ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
                 << released << dendl;
  return released;
}
void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
                                   mds_rank_t mds, int drop, int unless)
{
  ldout(cct, 20) << __func__ << " enter(dn:"
                 << dn << ")" << dendl;
  int released = 0;
  if (dn->dir)
    released = encode_inode_release(dn->dir->parent_inode, req,
                                    mds, drop, unless, 1);
  if (released && dn->lease_mds == mds) {
    ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
    auto& rel = req->cap_releases.back();
    rel.item.dname_len = dn->name.length();
    rel.item.dname_seq = dn->lease_seq;
    rel.dname = dn->name;
  }
  ldout(cct, 25) << __func__ << " exit(dn:"
                 << dn << ")" << dendl;
}
/*
 * This requires the MClientRequest *request member to be set.
 * It will error out horribly without one.
 * Additionally, if you set any *drop member, you'd better have
 * set the corresponding dentry!
 */
void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
{
  ldout(cct, 20) << __func__ << " enter (req: "
                 << req << ", mds: " << mds << ")" << dendl;
  if (req->inode_drop && req->inode())
    encode_inode_release(req->inode(), req,
                         mds, req->inode_drop,
                         req->inode_unless);

  if (req->old_inode_drop && req->old_inode())
    encode_inode_release(req->old_inode(), req,
                         mds, req->old_inode_drop,
                         req->old_inode_unless);
  if (req->other_inode_drop && req->other_inode())
    encode_inode_release(req->other_inode(), req,
                         mds, req->other_inode_drop,
                         req->other_inode_unless);

  if (req->dentry_drop && req->dentry())
    encode_dentry_release(req->dentry(), req,
                          mds, req->dentry_drop,
                          req->dentry_unless);

  if (req->old_dentry_drop && req->old_dentry())
    encode_dentry_release(req->old_dentry(), req,
                          mds, req->old_dentry_drop,
                          req->old_dentry_unless);
  ldout(cct, 25) << __func__ << " exit (req: "
                 << req << ", mds " << mds << ")" << dendl;
}
bool Client::have_open_session(mds_rank_t mds)
{
  const auto &it = mds_sessions.find(mds);
  return it != mds_sessions.end() &&
    (it->second.state == MetaSession::STATE_OPEN ||
     it->second.state == MetaSession::STATE_STALE);
}
MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
{
  const auto &it = mds_sessions.find(mds);
  if (it == mds_sessions.end() || it->second.con != con) {
    return NULL;
  } else {
    return &it->second;
  }
}

MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
{
  auto it = mds_sessions.find(mds);
  return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
}
/**
 * Populate a map of strings with client-identifying metadata,
 * such as the hostname.  Call this once at initialization.
 */
void Client::populate_metadata(const std::string &mount_root)
{
  // Hostname
  struct utsname u;
  int r = uname(&u);
  if (r >= 0) {
    metadata["hostname"] = u.nodename;
    ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
  } else {
    ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
  }

  metadata["pid"] = stringify(getpid());

  // Ceph entity id (the '0' in "client.0")
  metadata["entity_id"] = cct->_conf->name.get_id();

  // Our mount position
  if (!mount_root.empty()) {
    metadata["root"] = mount_root;
  }

  // Ceph version
  metadata["ceph_version"] = pretty_version_to_str();
  metadata["ceph_sha1"] = git_version_to_str();

  // Apply any metadata from the user's configured overrides
  std::vector<std::string> tokens;
  get_str_vec(cct->_conf->client_metadata, ",", tokens);
  for (const auto &i : tokens) {
    auto eqpos = i.find("=");
    // Throw out anything that isn't of the form "<str>=<str>"
    if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
      lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
      continue;
    }
    metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
  }
}
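// Example: with client_metadata = "rack=r1,dc=east" in the client's config,
// the session metadata sent to the MDS gains {"rack": "r1", "dc": "east"} in
// addition to the hostname/pid/entity_id/root/ceph_version entries collected
// above.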
/**
 * Optionally add or override client metadata fields.
 */
void Client::update_metadata(std::string const &k, std::string const &v)
{
  std::lock_guard l(client_lock);
  ceph_assert(initialized);

  auto it = metadata.find(k);
  if (it != metadata.end()) {
    ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
                  << "' from '" << it->second << "' to '" << v << "'" << dendl;
  }

  metadata[k] = v;
}
MetaSession *Client::_open_mds_session(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;
  auto addrs = mdsmap->get_addrs(mds);
  auto em = mds_sessions.emplace(std::piecewise_construct,
      std::forward_as_tuple(mds),
      std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
  ceph_assert(em.second); /* not already present */
  MetaSession *session = &em.first->second;

  // Maybe skip sending a request to open if this MDS daemon
  // has previously sent us a REJECT.
  if (rejected_by_mds.count(mds)) {
    if (rejected_by_mds[mds] == session->addrs) {
      ldout(cct, 4) << __func__ << " mds." << mds << " skipping "
                       "because we were rejected" << dendl;
      return session;
    } else {
      ldout(cct, 4) << __func__ << " mds." << mds << " old inst "
                       "rejected us, trying with new inst" << dendl;
      rejected_by_mds.erase(mds);
    }
  }

  auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
  m->metadata = metadata;
  m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
  session->con->send_message2(std::move(m));
  return session;
}
void Client::_close_mds_session(MetaSession *s)
{
  ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
  s->state = MetaSession::STATE_CLOSING;
  s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
}
void Client::_closed_mds_session(MetaSession *s)
{
  ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
  s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  signal_context_list(s->waiting_for_open);
  mount_cond.notify_all();
  remove_session_caps(s);
  kick_requests_closed(s);
  mds_sessions.erase(s->mds_num);
}
void Client::handle_client_session(const MConstRef<MClientSession>& m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    {
      feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
      missing_features -= m->supported_features;
      if (!missing_features.empty()) {
        lderr(cct) << "mds." << from << " lacks required features '"
                   << missing_features << "', closing session " << dendl;
        rejected_by_mds[session->mds_num] = session->addrs;
        _close_mds_session(session);
        _closed_mds_session(session);
        break;
      }
      session->mds_features = std::move(m->supported_features);

      renew_caps(session);
      session->state = MetaSession::STATE_OPEN;
      if (unmounting)
        mount_cond.notify_all();
      else
        connect_mds_targets(from);
      signal_context_list(session->waiting_for_open);
      break;
    }

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    if (session->cap_renew_seq == m->get_seq()) {
      bool was_stale = ceph_clock_now() >= session->cap_ttl;
      session->cap_ttl =
        session->last_cap_renew_request + mdsmap->get_session_timeout();
      if (was_stale)
        wake_up_session_caps(session, false);
    }
    break;

  case CEPH_SESSION_STALE:
    // invalidate session caps/leases
    session->cap_ttl = ceph_clock_now();
    session->cap_ttl -= 1;
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    /* flush cap release */
    if (auto& m = session->release; m) {
      session->con->send_message2(std::move(m));
    }
    session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    {
      std::string_view error_str;
      auto it = m->metadata.find("error_string");
      if (it != m->metadata.end())
        error_str = it->second;
      else
        error_str = "unknown error";
      lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;

      rejected_by_mds[session->mds_num] = session->addrs;
      _closed_mds_session(session);
    }
    break;
  }
}
bool Client::_any_stale_sessions() const
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  for (const auto &p : mds_sessions) {
    if (p.second.state == MetaSession::STATE_STALE) {
      return true;
    }
  }

  return false;
}

void Client::_kick_stale_sessions()
{
  ldout(cct, 1) << __func__ << dendl;

  for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
    MetaSession &s = it->second;
    ++it;
    if (s.state == MetaSession::STATE_STALE)
      _closed_mds_session(&s);
  }
}

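/*
 * (Re)send a MetaRequest to the given MDS session: rebuild the wire
 * message, attach any pending cap releases (dropped while we are still
 * reconnecting), and remember which mds/mseq it was sent on.
 */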
void Client::send_request(MetaRequest *request, MetaSession *session,
                          bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
                 << " for mds." << mds << dendl;
  auto r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    objecter->with_osdmap([r](const OSDMap& o) {
        r->set_osdmap_epoch(o.get_epoch());
      });
  }

  if (request->mds == -1) {
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  Inode *in = request->inode();
  if (in) {
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      request->sent_on_mseq = it->second.mseq;
    }
  }

  session->requests.push_back(&request->item);

  ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
  session->con->send_message2(std::move(r));
}

ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
{
  auto req = make_message<MClientRequest>(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
        de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
        de->dir->parent_inode->make_nosnap_relative_path(request->path);
        request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
                   << " No path, inode, or appropriately-endowed dentry given!"
                   << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
                   << " No path, inode, or dentry given!"
                   << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_data(request->data);
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}

void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
{
  mds_rank_t mds = mds_rank_t(fwd->get_source().num());
  MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
  if (!session) {
    return;
  }
  ceph_tid_t tid = fwd->get_tid();

  if (mds_requests.count(tid) == 0) {
    ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
    return;
  }

  MetaRequest *request = mds_requests[tid];
  ceph_assert(request);

  // reset retry counter
  request->retry_attempt = 0;

  // request not forwarded, or dest mds has no session.
  // resend.
  ldout(cct, 10) << __func__ << " tid " << tid
                 << " fwd " << fwd->get_num_fwd()
                 << " to mds." << fwd->get_dest_mds()
                 << ", resending to " << fwd->get_dest_mds()
                 << dendl;

  request->mds = -1;
  request->item.remove_myself();
  request->num_fwd = fwd->get_num_fwd();
  request->resend_mds = fwd->get_dest_mds();
  request->caller_cond->notify_all();
}

bool Client::is_dir_operation(MetaRequest *req)
{
  int op = req->get_op();
  if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
      op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
      op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
      op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
    return true;
  return false;
}

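/*
 * Handle an MDS reply: retry on ESTALE, remember unsafe (uncommitted)
 * replies so they can be replayed, wake the waiting caller, and clean
 * the request up once the safe reply arrives.
 */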
void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
{
  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
  MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
  if (!session) {
    return;
  }

  ceph_tid_t tid = reply->get_tid();
  bool is_safe = reply->is_safe();

  if (mds_requests.count(tid) == 0) {
    lderr(cct) << __func__ << " no pending request on tid " << tid
               << " safe is:" << is_safe << dendl;
    return;
  }
  MetaRequest *request = mds_requests.at(tid);

  ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
                 << " tid " << tid << dendl;

  if (request->got_unsafe && !is_safe) {
    //duplicate response
    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
                  << mds_num << " safe:" << is_safe << dendl;
    return;
  }

  if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
    ldout(cct, 20) << "got ESTALE on tid " << request->tid
                   << " from mds." << request->mds << dendl;
    request->send_to_auth = true;
    request->resend_mds = choose_target_mds(request);
    Inode *in = request->inode();
    std::map<mds_rank_t, Cap>::const_iterator it;
    if (request->resend_mds >= 0 &&
        request->resend_mds == request->mds &&
        (in == NULL ||
         (it = in->caps.find(request->resend_mds)) != in->caps.end() ||
         request->sent_on_mseq == it->second.mseq)) {
      ldout(cct, 20) << "have to return ESTALE" << dendl;
    } else {
      request->caller_cond->notify_all();
      return;
    }
  }

  ceph_assert(!request->reply);
  request->reply = reply;
  insert_trace(request, session);

  // Handle unsafe reply
  if (!is_safe) {
    request->got_unsafe = true;
    session->unsafe_requests.push_back(&request->unsafe_item);
    if (is_dir_operation(request)) {
      Inode *dir = request->inode();
      ceph_assert(dir);
      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
    }
    if (request->target) {
      InodeRef &in = request->target;
      in->unsafe_ops.push_back(&request->unsafe_target_item);
    }
  }

  // Only signal the caller once (on the first reply):
  // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
  if (!is_safe || !request->got_unsafe) {
    ceph::condition_variable cond;
    request->dispatch_cond = &cond;

    // wake up waiter
    ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
    request->caller_cond->notify_all();

    // wake for kick back
    std::unique_lock l{client_lock, std::adopt_lock};
    cond.wait(l, [tid, request, &cond, this] {
      if (request->dispatch_cond) {
        ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
                       << tid << " " << &cond << dendl;
      }
      return !request->dispatch_cond;
    });
    l.release();
  }

  if (is_safe) {
    // the filesystem change is committed to disk
    // we're done, clean up
    if (request->got_unsafe) {
      request->unsafe_item.remove_myself();
      request->unsafe_dir_item.remove_myself();
      request->unsafe_target_item.remove_myself();
      signal_cond_list(request->waitfor_safe);
    }
    request->item.remove_myself();
    unregister_request(request);
  }
  if (unmounting)
    mount_cond.notify_all();
}

void Client::_handle_full_flag(int64_t pool)
{
  ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
    << "on " << pool << dendl;
  // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
  // to do this rather than blocking, because otherwise when we fill up we
  // potentially lock caps forever on files with dirty pages, and we need
  // to be able to release those caps to the MDS so that it can delete files
  // and free up space.
  epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);

  // For all inodes with layouts in this pool and a pending flush write op
  // (i.e. one of the ones we will cancel), we've got to purge_set their data
  // from ObjectCacher so that it doesn't re-issue the write in response to
  // the ENOSPC error.
  // Fortunately since we're cancelling everything in a given pool, we don't
  // need to know which ops belong to which ObjectSet, we can just blow all
  // the un-flushed cached data away and mark any dirty inodes' async_err
  // field with -ENOSPC as long as we're sure all the ops we cancelled were
  // affecting this pool, and all the objectsets we're purging were also
  // in this pool.
  for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
       i != inode_map.end(); ++i)
  {
    Inode *inode = i->second;
    if (inode->oset.dirty_or_tx
        && (pool == -1 || inode->layout.pool_id == pool)) {
      ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
        << " has dirty objects, purging and setting ENOSPC" << dendl;
      objectcacher->purge_set(&inode->oset);
      inode->set_async_err(-ENOSPC);
    }
  }

  if (cancelled_epoch != (epoch_t)-1) {
    set_cap_epoch_barrier(cancelled_epoch);
  }
}

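/*
 * React to a new OSDMap: notice when this client gets blacklisted (or
 * un-blacklisted) and propagate cluster/pool full flags by cancelling
 * outstanding writes in the affected pools.
 */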
void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
{
  std::set<entity_addr_t> new_blacklists;
  objecter->consume_blacklist_events(&new_blacklists);

  const auto myaddrs = messenger->get_myaddrs();
  bool new_blacklist = false;
  bool prenautilus = objecter->with_osdmap(
    [&](const OSDMap& o) {
      return o.require_osd_release < ceph_release_t::nautilus;
    });
  if (!blacklisted) {
    for (auto a : myaddrs.v) {
      // blacklist entries are always TYPE_ANY for nautilus+
      a.set_type(entity_addr_t::TYPE_ANY);
      if (new_blacklists.count(a)) {
        new_blacklist = true;
        break;
      }
      if (prenautilus) {
        // ...except pre-nautilus, they were TYPE_LEGACY
        a.set_type(entity_addr_t::TYPE_LEGACY);
        if (new_blacklists.count(a)) {
          new_blacklist = true;
          break;
        }
      }
    }
  }
  if (new_blacklist) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
        return o.get_epoch();
      });
    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
    blacklisted = true;

    _abort_mds_sessions(-EBLACKLISTED);

    // Since we know all our OSD ops will fail, cancel them all preemtively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-EBLACKLISTED);

  } else if (blacklisted) {
    // Handle case where we were blacklisted but no longer are
    blacklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
        return o.is_blacklisted(myaddrs);});
  }

  // Always subscribe to next osdmap for blacklisted client
  // until this client is not blacklisted.
  if (blacklisted) {
    objecter->maybe_request_map();
  }

  if (objecter->osdmap_full_flag()) {
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // _handle_full_flag
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
        for (const auto& kv : o.get_pools()) {
          if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
            full_pools.push_back(kv.first);
          }
        }
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away.  For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }
}

// ------------------------
// incoming messages


bool Client::ms_dispatch2(const MessageRef &m)
{
  std::lock_guard l(client_lock);
  if (!initialized) {
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(ref_cast<MMDSMap>(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(ref_cast<MFSMap>(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(ref_cast<MFSMapUser>(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(ref_cast<MClientSession>(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(ref_cast<MOSDMap>(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(ref_cast<MClientRequestForward>(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(ref_cast<MClientReply>(m));
    break;

  // reclaim reply
  case CEPH_MSG_CLIENT_RECLAIM_REPLY:
    handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(ref_cast<MClientSnap>(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(ref_cast<MClientCaps>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(ref_cast<MClientLease>(m));
    break;
  case MSG_COMMAND_REPLY:
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(ref_cast<MCommandReply>(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(ref_cast<MClientQuota>(m));
    break;

  default:
    return false;
  }

  // unmounting?
  if (unmounting) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
                   << "+" << inode_map.size() << dendl;
    long unsigned size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    if (size < lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.notify_all();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
                     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}

void Client::handle_fs_map(const MConstRef<MFSMap>& m)
{
  fsmap.reset(new FSMap(m->get_fsmap()));

  signal_cond_list(waiting_for_fsmap);

  monclient->sub_got("fsmap", fsmap->get_epoch());
}

void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
{
  fsmap_user.reset(new FSMapUser);
  *fsmap_user = m->get_fsmap();

  monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
  signal_cond_list(waiting_for_fsmap);
}

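/*
 * Apply a new MDSMap: cancel commands aimed at MDS daemons that went
 * away, then walk our sessions and react to each rank's state change
 * (reconnect, newly active, gone, ...).
 */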
void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
{
  mds_gid_t old_inc, new_inc;
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
                  << " is identical to or older than our "
                  << mdsmap->get_epoch() << dendl;
    return;
  }

  ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;

  std::unique_ptr<MDSMap> oldmap(new MDSMap);
  oldmap.swap(mdsmap);

  mdsmap->decode(m->get_encoded());

  // Cancel any commands for missing or laggy GIDs
  std::list<ceph_tid_t> cancel_ops;
  auto &commands = command_table.get_commands();
  for (const auto &i : commands) {
    auto &op = i.second;
    const mds_gid_t op_mds_gid = op.mds_gid;
    if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
      cancel_ops.push_back(i.first);
      if (op.outs) {
        std::ostringstream ss;
        ss << "MDS " << op_mds_gid << " went away";
        *(op.outs) = ss.str();
      }
      op.con->mark_down();
      if (op.on_finish)
        op.on_finish->complete(-ETIMEDOUT);
    }
  }

  for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
       i != cancel_ops.end(); ++i) {
    command_table.erase(*i);
  }

  // reset session
  for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSession *session = &p->second;
    ++p;

    int oldstate = oldmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_addrs(mds) != session->addrs) {
      old_inc = oldmap->get_incarnation(mds);
      new_inc = mdsmap->get_incarnation(mds);
      if (old_inc != new_inc) {
        ldout(cct, 1) << "mds incarnation changed from "
                      << old_inc << " to " << new_inc << dendl;
        oldstate = MDSMap::STATE_NULL;
      }
      session->con->mark_down();
      session->addrs = mdsmap->get_addrs(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session);
    } else if (oldstate == newstate)
      continue;  // no change

    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      session->con = messenger->connect_to_mds(session->addrs);
      send_reconnect(session);
    } else if (newstate > MDSMap::STATE_RECONNECT) {
      if (oldstate < MDSMap::STATE_RECONNECT) {
        ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
        _closed_mds_session(session);
        continue;
      }
      if (newstate >= MDSMap::STATE_ACTIVE) {
        if (oldstate < MDSMap::STATE_ACTIVE) {
          // kick new requests
          kick_requests(session);
          kick_flushing_caps(session);
          signal_context_list(session->waiting_for_open);
          wake_up_session_caps(session, true);
        }
        connect_mds_targets(mds);
      }
    } else if (newstate == MDSMap::STATE_NULL &&
               mds >= mdsmap->get_max_mds()) {
      _closed_mds_session(session);
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}

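/*
 * Send an MClientReconnect to an MDS entering reconnect: resend unsafe
 * requests and describe every cap we hold (path, wanted/issued bits,
 * snaprealm) so the MDS can rebuild its session state.
 */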
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  session->release.reset();

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  early_kick_flushing_caps(session);

  auto m = make_message<MClientReconnect>();
  bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      if (allow_multi &&
          m->get_approx_size() >=
          static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
        m->mark_more();
        session->con->send_message2(std::move(m));

        m = make_message<MClientReconnect>();
      }

      Cap &cap = it->second;
      ldout(cct, 10) << " caps on " << p->first
                     << " " << ccap_string(cap.issued)
                     << " wants " << ccap_string(in->caps_wanted())
                     << dendl;
      filepath path;
      in->make_long_path(path);
      ldout(cct, 10) << "    path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      cap.seq = 0;  // reset seq.
      cap.issue_seq = 0;  // reset seq.
      cap.mseq = 0;  // reset seq.
      // cap gen should catch up with session cap_gen
      if (cap.gen < session->cap_gen) {
        cap.gen = session->cap_gen;
        cap.issued = cap.implemented = CEPH_CAP_PIN;
      } else {
        cap.issued = cap.implemented;
      }
      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
        snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
                 cap.cap_id,
                 path.get_ino(), path.get_path(),   // ino
                 in->caps_wanted(), // wanted
                 cap.issued,     // issued
                 in->snaprealm->ino,
                 snap_follows,
                 flockbl);

      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
        ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
        m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
        did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  if (!allow_multi)
    m->set_encoding_version(0); // use connection features to choose encoding
  session->con->send_message2(std::move(m));

  mount_cond.notify_all();

  if (session->reclaim_state == MetaSession::RECLAIMING)
    signal_cond_list(waiting_for_reclaim);
}

void Client::kick_requests(MetaSession *session)
{
  ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    MetaRequest *req = p->second;
    if (req->got_unsafe)
      continue;
    if (req->aborted()) {
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->notify_all();
      }
      continue;
    }
    if (req->retry_attempt > 0)
      continue; // new requests only
    if (req->mds == session->mds_num) {
      send_request(p->second, session);
    }
  }
}

void Client::resend_unsafe_requests(MetaSession *session)
{
  for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
       !iter.end();
       ++iter)
    send_request(*iter, session);

  // also re-send old requests when MDS enters reconnect stage. So that MDS can
  // process completed requests in clientreplay stage.
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    MetaRequest *req = p->second;
    if (req->got_unsafe)
      continue;
    if (req->aborted())
      continue;
    if (req->retry_attempt == 0)
      continue; // old requests only
    if (req->mds == session->mds_num)
      send_request(req, session, true);
  }
}

void Client::wait_unsafe_requests()
{
  list<MetaRequest*> last_unsafe_reqs;
  for (const auto &p : mds_sessions) {
    const MetaSession &s = p.second;
    if (!s.unsafe_requests.empty()) {
      MetaRequest *req = s.unsafe_requests.back();
      req->get();
      last_unsafe_reqs.push_back(req);
    }
  }

  for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
       p != last_unsafe_reqs.end();
       ++p) {
    MetaRequest *req = *p;
    if (req->unsafe_item.is_on_list())
      wait_on_list(req->waitfor_safe);
    put_request(req);
  }
}

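/*
 * The session is being torn down: wake callers whose requests were
 * pinned to this MDS and drop unsafe requests, marking the affected
 * inodes with EIO.
 */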
void Client::kick_requests_closed(MetaSession *session)
{
  ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end(); ) {
    MetaRequest *req = p->second;
    ++p;
    if (req->mds == session->mds_num) {
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->notify_all();
      }
      req->item.remove_myself();
      if (req->got_unsafe) {
        lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
        req->unsafe_item.remove_myself();
        if (is_dir_operation(req)) {
          Inode *dir = req->inode();
          ceph_assert(dir);
          dir->set_async_err(-EIO);
          lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
                     << dir->ino << " " << req->get_tid() << dendl;
          req->unsafe_dir_item.remove_myself();
        }
        if (req->target) {
          InodeRef &in = req->target;
          in->set_async_err(-EIO);
          lderr(cct) << "kick_requests_closed drop req of inode : "
                     << in->ino << " " << req->get_tid() << dendl;
          req->unsafe_target_item.remove_myself();
        }
        signal_cond_list(req->waitfor_safe);
        unregister_request(req);
      }
    }
  }
  ceph_assert(session->requests.empty());
  ceph_assert(session->unsafe_requests.empty());
}

void Client::got_mds_push(MetaSession *s)
{
  s->seq++;
  ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
  if (s->state == MetaSession::STATE_CLOSING) {
    s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
  }
}

void Client::handle_lease(const MConstRef<MClientLease>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;

  ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);

  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session);

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LEASE_VALID) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname << dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1;
  }

 revoke:
  {
    auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
                                            m->get_mask(), m->get_ino(),
                                            m->get_first(), m->get_last(), m->dname);
    m->get_connection()->send_message2(std::move(reply));
  }
}

void Client::put_inode(Inode *in, int n)
{
  ldout(cct, 10) << __func__ << " on " << *in << dendl;
  int left = in->_put(n);
  if (left == 0) {
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
    bool unclean = objectcacher->release_set(&in->oset);
    ceph_assert(!unclean);
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    if (in == root) {
      root = 0;
      root_ancestor = 0;
      while (!root_parents.empty())
        root_parents.erase(root_parents.begin());
    }

    delete in;
  }
}

void Client::close_dir(Dir *dir)
{
  Inode *in = dir->parent_inode;
  ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
  ceph_assert(dir->is_empty());
  ceph_assert(in->dir == dir);
  ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
  if (!in->dentries.empty())
    in->get_first_parent()->put();   // unpin dentry

  delete in->dir;
  in->dir = 0;
  put_inode(in);               // unpin inode
}

/**
 * Don't call this with in==NULL, use get_or_create for that
 * leave dn set to default NULL unless you're trying to add
 * a new inode to a pre-created Dentry
 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry(dir, name);

    lru.lru_insert_mid(dn);    // mid or top?

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << dn << " (new dn)" << dendl;
  } else {
    ceph_assert(!dn->inode);
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    InodeRef tmp_ref;
    // only one parent for directories!
    if (in->is_dir() && !in->dentries.empty()) {
      tmp_ref = in; // prevent unlink below from freeing the inode.
      Dentry *olddn = in->get_first_parent();
      ceph_assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      old_diri->dir_release_count++;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    dn->link(in);
    ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
  }

  return dn;
}

void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
{
  InodeRef in(dn->inode);
  ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
                 << " inode " << dn->inode << dendl;

  // unlink from inode
  if (dn->inode) {
    dn->unlink();
    ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
  }

  if (keepdentry) {
    dn->lease_mds = -1;
  } else {
    ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;

    // unlink from dir
    Dir *dir = dn->dir;
    dn->detach();

    // delete den
    lru.lru_remove(dn);
    dn->put();

    if (dir->is_empty() && !keepdir)
      close_dir(dir);
  }
}

/**
 * For asynchronous flushes, check for errors from the IO and
 * update the inode if necessary
 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  void finish(int r) override {
    ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
    if (r != 0) {
      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
        << " 0x" << std::hex << inode->ino << std::dec
        << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      inode->set_async_err(r);
    }
  }
};

void Client::get_cap_ref(Inode *in, int cap)
{
  if ((cap & CEPH_CAP_FILE_BUFFER) &&
      in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
    ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
    in->get();
  }
  if ((cap & CEPH_CAP_FILE_CACHE) &&
      in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
    ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
    in->get();
  }
  in->get_cap_ref(cap);
}

void Client::put_cap_ref(Inode *in, int cap)
{
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & CEPH_CAP_FILE_WR) &&
          !in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.writing) {
        ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
        in->cap_snaps.rbegin()->second.writing = 0;
        finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
        signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
        for (auto &p : in->cap_snaps)
          p.second.dirty_data = 0;
        signal_cond_list(in->waitfor_commit);
        ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
        ++put_nref;
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);
  }
}

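/*
 * Block until the caps in 'need' are issued (plus whatever of 'want'
 * is available), taking a cap reference on success.  Also extends
 * max_size for writes that would end at 'endoff'.
 */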
int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
{
  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
                     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
                     << dendl;
      return -EBADF;
    }

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      if ((endoff >= (loff_t)in->max_size ||
           endoff > (loff_t)(in->size << 1)) &&
          endoff > (loff_t)in->wanted_max_size) {
        ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
        in->wanted_max_size = endoff;
      }
      if (in->wanted_max_size > in->max_size &&
          in->wanted_max_size > in->requested_max_size)
        check_caps(in, 0);

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
        ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
        waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
        if (in->cap_snaps.rbegin()->second.writing) {
          ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
          waitfor_caps = true;
        }
        for (auto &p : in->cap_snaps) {
          if (p.second.dirty_data) {
            waitfor_commit = true;
            break;
          }
        }
        if (waitfor_commit) {
          _flush(in, new C_Client_FlushComplete(this, in));
          ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
        }
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
        int revoking = implemented & ~have;
        ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
                       << " need " << ccap_string(need) << " want " << ccap_string(want)
                       << " revoking " << ccap_string(revoking)
                       << dendl;
        if ((revoking & want) == 0) {
          *phave = need | (have & want);
          in->get_cap_ref(need);
          return 0;
        }
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
        in->auth_cap->session->readonly)
      return -EROFS;

    if (in->flags & I_CAP_DROPPED) {
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
        int ret = _renew_caps(in);
        if (ret < 0)
          return ret;
        continue;
      }
      if (!(file_wanted & ~mds_wanted))
        in->flags &= ~I_CAP_DROPPED;
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}

*in
)
3318 unsigned used
= in
->caps_used();
3319 if (!(used
& CEPH_CAP_FILE_CACHE
) &&
3320 !objectcacher
->set_is_empty(&in
->oset
))
3321 used
|= CEPH_CAP_FILE_CACHE
;
3325 void Client::cap_delay_requeue(Inode
*in
)
3327 ldout(cct
, 10) << __func__
<< " on " << *in
<< dendl
;
3328 in
->hold_caps_until
= ceph_clock_now();
3329 in
->hold_caps_until
+= cct
->_conf
->client_caps_release_delay
;
3330 delayed_list
.push_back(&in
->delay_cap_item
);
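/*
 * Build and send a CEPH_CAP_OP_UPDATE message for one cap: report what
 * we still hold and want, and carry any metadata being flushed.
 */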
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
                      int flags, int used, int want, int retain,
                      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << __func__ << " " << *in
                 << " mds." << session->mds_num << " seq " << cap->seq
                 << " used " << ccap_string(used)
                 << " want " << ccap_string(want)
                 << " flush " << ccap_string(flush)
                 << " retain " << ccap_string(retain)
                 << " held "<< ccap_string(held)
                 << " revoking " << ccap_string(revoking)
                 << " dropping " << ccap_string(dropping)
                 << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  auto m = make_message<MClientCaps>(op,
                                     in->ino,
                                     0,
                                     cap->cap_id, cap->seq,
                                     cap->implemented,
                                     want,
                                     flush,
                                     cap->mseq,
                                     cap_epoch_barrier);
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  if (flush & CEPH_CAP_XATTR_EXCL) {
    encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;

  if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
      !in->cap_snaps.empty() &&
      in->cap_snaps.rbegin()->second.flush_tid == 0)
    flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
  m->flags = flags;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    if (want & CEPH_CAP_ANY_FILE_WR) {
      m->set_max_size(in->wanted_max_size);
      in->requested_max_size = in->wanted_max_size;
      ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
    } else {
      in->requested_max_size = 0;
      ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
    }
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message2(std::move(m));
}

static bool is_max_size_approaching(Inode *in)
{
  /* mds will adjust max size according to the reported size */
  if (in->flushing_caps & CEPH_CAP_FILE_WR)
    return false;
  if (in->size >= in->max_size)
    return true;
  /* half of previous max_size increment has been used */
  if (in->max_size > in->reported_size &&
      (in->size << 1) >= in->max_size + in->reported_size)
    return true;
  return false;
}

static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
{
  if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
    return used;
  if (!(implemented & CEPH_CAP_FILE_LAZYIO))
    return used;

  if (issued & CEPH_CAP_FILE_LAZYIO) {
    if (!(issued & CEPH_CAP_FILE_CACHE)) {
      used &= ~CEPH_CAP_FILE_CACHE;
      used |= CEPH_CAP_FILE_LAZYIO;
    }
    if (!(issued & CEPH_CAP_FILE_BUFFER)) {
      used &= ~CEPH_CAP_FILE_BUFFER;
      used |= CEPH_CAP_FILE_LAZYIO;
    }
  } else {
    if (!(implemented & CEPH_CAP_FILE_CACHE)) {
      used &= ~CEPH_CAP_FILE_CACHE;
      used |= CEPH_CAP_FILE_LAZYIO;
    }
    if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
      used &= ~CEPH_CAP_FILE_BUFFER;
      used |= CEPH_CAP_FILE_LAZYIO;
    }
  }
  return used;
}

/**
 * Examine currently used and wanted versus held caps. Release, flush or ack
 * revoked caps to the MDS as appropriate.
 *
 * @param in the inode to check
 * @param flags flags to apply to cap check
 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  int orig_used = used;
  used = adjust_caps_used_for_lazyio(used, issued, implemented);

  int retain = wanted | used | CEPH_CAP_PIN;
  if (!unmounting && in->nlink > 0) {
    if (wanted) {
      retain |= CEPH_CAP_ANY;
    } else if (in->is_dir() &&
               (issued & CEPH_CAP_FILE_SHARED) &&
               (in->flags & I_COMPLETE)) {
      // we do this here because we don't want to drop to Fs (and then
      // drop the Fs if we do a create!) if that alone makes us send lookups
      // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
      wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
      retain |= wanted;
    } else {
      retain |= CEPH_CAP_ANY_SHARED;
      // keep RD only if we didn't have the file open RW,
      // because then the mds would revoke it anyway to
      // journal max_size=0.
      if (in->max_size == 0)
        retain |= CEPH_CAP_ANY_RD;
    }
  }

  ldout(cct, 10) << __func__ << " on " << *in
                 << " wanted " << ccap_string(wanted)
                 << " used " << ccap_string(used)
                 << " issued " << ccap_string(issued)
                 << " revoking " << ccap_string(revoking)
                 << " flags=" << flags
                 << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return;   // guard if at end of func

  if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
      (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    if (_release(in))
      used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
  }

  for (auto &p : in->caps) {
    mds_rank_t mds = p.first;
    Cap &cap = p.second;

    MetaSession *session = &mds_sessions.at(mds);

    cap_used = used;
    if (in->auth_cap && &cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap.implemented & ~cap.issued;

    ldout(cct, 10) << " cap mds." << mds
             << " issued " << ccap_string(cap.issued)
             << " implemented " << ccap_string(cap.implemented)
             << " revoking " << ccap_string(revoking) << dendl;

    if (in->wanted_max_size > in->max_size &&
        in->wanted_max_size > in->requested_max_size &&
        &cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap.issued & CEPH_CAP_FILE_WR) &&
        &cap == in->auth_cap &&
        is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
                     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap.wanted | cap.issued))
      goto ack;

    if (!revoking && unmounting && (cap_used == 0))
      goto ack;

    if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
        !in->dirty_caps)               // and we have no dirty caps
      continue;

    if (!(flags & CHECK_CAPS_NODELAY)) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      cap_delay_requeue(in);
      continue;
    }

  ack:
    if (&cap == in->auth_cap) {
      if (in->flags & I_KICK_FLUSH) {
        ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
                       << " to mds." << mds << dendl;
        kick_flushing_caps(in, session);
      }
      if (!in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.flush_tid == 0)
        flush_snaps(in);
    }

    int flushing;
    ceph_tid_t flush_tid;
    if (in->auth_cap == &cap && in->dirty_caps) {
      flushing = mark_caps_flushing(in, &flush_tid);
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    int msg_flags = (flags & CHECK_CAPS_SYNCHRONOUS) ? MClientCaps::FLAG_SYNC : 0;
    send_cap(in, session, &cap, msg_flags, cap_used, wanted, retain,
             flushing, flush_tid);
  }
}

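/*
 * Capture the inode's dirty state against the old snap context so it
 * can later be flushed to the MDS as a cap_snap.
 */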
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||
             (used & CEPH_CAP_FILE_WR) ||
             (dirty & CEPH_CAP_ANY_WR)) {
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    ceph_assert(capsnapem.second); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();

    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;

    if (used & CEPH_CAP_FILE_WR) {
      ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
  }
}

void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;
  capsnap.dirty |= in->caps_dirty();

  /* Only reset it if it wasn't set before */
  if (capsnap.cap_dirtier_uid == -1) {
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
  }

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
                   << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}

void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
{
  ldout(cct, 10) << __func__ << " seq " << seq << " on " << *in << dendl;
  in->cap_snaps.at(seq).dirty_data = 0;
  flush_snaps(in);
}

void Client::send_flush_snap(Inode *in, MetaSession *session,
                             snapid_t follows, CapSnap& capsnap)
{
  auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
                                     in->ino, in->snaprealm->ino, 0,
                                     in->auth_cap->mseq, cap_epoch_barrier);
  m->caller_uid = capsnap.cap_dirtier_uid;
  m->caller_gid = capsnap.cap_dirtier_gid;

  m->set_client_tid(capsnap.flush_tid);
  m->head.snap_follows = follows;

  m->head.caps = capsnap.issued;
  m->head.dirty = capsnap.dirty;

  m->head.uid = capsnap.uid;
  m->head.gid = capsnap.gid;
  m->head.mode = capsnap.mode;
  m->btime = capsnap.btime;

  m->size = capsnap.size;

  m->head.xattr_version = capsnap.xattr_version;
  encode(capsnap.xattrs, m->xattrbl);

  m->ctime = capsnap.ctime;
  m->btime = capsnap.btime;
  m->mtime = capsnap.mtime;
  m->atime = capsnap.atime;
  m->time_warp_seq = capsnap.time_warp_seq;
  m->change_attr = capsnap.change_attr;

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  ceph_assert(!session->flushing_caps_tids.empty());
  m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message2(std::move(m));
}

void Client::flush_snaps(Inode *in)
{
  ldout(cct, 10) << "flush_snaps on " << *in << dendl;
  ceph_assert(in->cap_snaps.size());

  // pick auth mds
  ceph_assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    // only do new flush
    if (capsnap.flush_tid > 0)
      continue;

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
             << " follows " << p.first
             << " size " << capsnap.size
             << " mtime " << capsnap.mtime
             << " dirty_data=" << capsnap.dirty_data
             << " writing=" << capsnap.writing
             << " on " << *in << dendl;
    if (capsnap.dirty_data || capsnap.writing)
      continue;

    capsnap.flush_tid = ++last_flush_tid;
    session->flushing_caps_tids.insert(capsnap.flush_tid);
    in->flushing_cap_tids[capsnap.flush_tid] = 0;
    if (!in->flushing_cap_item.is_on_list())
      session->flushing_caps.push_back(&in->flushing_cap_item);

    send_flush_snap(in, session, p.first, capsnap);
  }
}

void Client::wait_on_list(list<ceph::condition_variable*>& ls)
{
  ceph::condition_variable cond;
  ls.push_back(&cond);
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l);
  l.release();
  ls.remove(&cond);
}

void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
{
  for (auto cond : ls) {
    cond->notify_all();
  }
}

void Client::wait_on_context_list(list<Context*>& ls)
{
  ceph::condition_variable cond;
  bool done = false;
  int r;
  ls.push_back(new C_Cond(cond, &done, &r));
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l, [&done] { return done;});
  l.release();
}

void Client::signal_context_list(list<Context*>& ls)
{
  while (!ls.empty()) {
    ls.front()->complete(0);
    ls.pop_front();
  }
}

void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
{
  for (const auto &cap : s->caps) {
    auto &in = cap->inode;
    if (reconnect) {
      in.requested_max_size = 0;
      in.wanted_max_size = 0;
    } else {
      if (cap->gen < s->cap_gen) {
        // mds did not re-issue stale cap.
        cap->issued = cap->implemented = CEPH_CAP_PIN;
        // make sure mds knows what we want.
        if (in.caps_file_wanted() & ~cap->wanted)
          in.flags |= I_CAP_DROPPED;
      }
    }
    signal_cond_list(in.waitfor_caps);
  }
}

// flush dirty data (from objectcache)

class C_Client_CacheInvalidate : public Context  {
private:
  Client *client;
  vinodeno_t ino;
  int64_t offset, length;
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
    client->_async_invalidate(ino, offset, length);
  }
};

void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
{
  if (unmounting)
    return;
  ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
  ino_invalidate_cb(callback_handle, ino, off, len);
}

void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {

  if (ino_invalidate_cb)
    // we queue the invalidate, which calls the callback and decrements the ref
    async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
}

void Client::_invalidate_inode_cache(Inode *in)
{
  ldout(cct, 10) << __func__ << " " << *in << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    objectcacher->release_set(&in->oset);
    if (!objectcacher->set_is_empty(&in->oset))
      lderr(cct) << "failed to invalidate cache for " << *in << dendl;
  }

  _schedule_invalidate_callback(in, 0, 0);
}

void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    vector<ObjectExtent> ls;
    Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
    objectcacher->discard_writeback(&in->oset, ls, nullptr);
  }

  _schedule_invalidate_callback(in, off, len);
}

bool Client::_release(Inode *in)
{
  ldout(cct, 20) << "_release " << *in << dendl;
  if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
    _invalidate_inode_cache(in);
    return true;
  }
  return false;
}

bool Client::_flush(Inode *in, Context *onfinish)
{
  ldout(cct, 10) << "_flush " << *in << dendl;

  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    onfinish->complete(0);
    return true;
  }

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
    objectcacher->purge_set(&in->oset);
    if (onfinish) {
      onfinish->complete(-ENOSPC);
    }
    return true;
  }

  return objectcacher->flush_set(&in->oset, onfinish);
}

void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  ceph_assert(ceph_mutex_is_locked(client_lock));
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  C_SaferCond onflush("Client::_flush_range flock");
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
                                      offset, size, &onflush);
  if (!ret) {
    // wait for flush
    client_lock.unlock();
    onflush.wait();
    client_lock.lock();
  }
}

void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
{
  //  std::lock_guard l(client_lock);
  ceph_assert(ceph_mutex_is_locked(client_lock));   // will be called via dispatch() -> objecter -> ...
  Inode *in = static_cast<Inode *>(oset->parent);
  ceph_assert(in);
  _flushed(in);
}

void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}

// checks common to add_update_cap, handle_cap_grant
void Client::check_cap_issue(Inode *in, unsigned issued)
{
  unsigned had = in->caps_issued();

  if ((issued & CEPH_CAP_FILE_CACHE) &&
      !(had & CEPH_CAP_FILE_CACHE))
    in->cache_gen++;

  if ((issued & CEPH_CAP_FILE_SHARED) &&
      !(had & CEPH_CAP_FILE_SHARED)) {
    in->shared_gen++;

    if (in->is_dir())
      clear_dir_complete_and_ordered(in, true);
  }
}

*in
, MetaSession
*mds_session
, uint64_t cap_id
,
4009 unsigned issued
, unsigned wanted
, unsigned seq
, unsigned mseq
,
4010 inodeno_t realm
, int flags
, const UserPerm
& cap_perms
)
4012 if (!in
->is_any_caps()) {
4013 ceph_assert(in
->snaprealm
== 0);
4014 in
->snaprealm
= get_snap_realm(realm
);
4015 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4016 ldout(cct
, 15) << __func__
<< " first one, opened snaprealm " << in
->snaprealm
<< dendl
;
4018 ceph_assert(in
->snaprealm
);
4019 if ((flags
& CEPH_CAP_FLAG_AUTH
) &&
4020 realm
!= inodeno_t(-1) && in
->snaprealm
->ino
!= realm
) {
4021 in
->snaprealm_item
.remove_myself();
4022 auto oldrealm
= in
->snaprealm
;
4023 in
->snaprealm
= get_snap_realm(realm
);
4024 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4025 put_snap_realm(oldrealm
);
4029 mds_rank_t mds
= mds_session
->mds_num
;
4030 const auto &capem
= in
->caps
.emplace(std::piecewise_construct
, std::forward_as_tuple(mds
), std::forward_as_tuple(*in
, mds_session
));
4031 Cap
&cap
= capem
.first
->second
;
4032 if (!capem
.second
) {
4033 if (cap
.gen
< mds_session
->cap_gen
)
4034 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
4037 * auth mds of the inode changed. we received the cap export
4038 * message, but still haven't received the cap import message.
4039 * handle_cap_export() updated the new auth MDS' cap.
4041 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4042 * a message that was send before the cap import message. So
4043 * don't remove caps.
4045 if (ceph_seq_cmp(seq
, cap
.seq
) <= 0) {
4046 if (&cap
!= in
->auth_cap
)
4047 ldout(cct
, 0) << "WARNING: " << "inode " << *in
<< " caps on mds." << mds
<< " != auth_cap." << dendl
;
4049 ceph_assert(cap
.cap_id
== cap_id
);
4052 issued
|= cap
.issued
;
4053 flags
|= CEPH_CAP_FLAG_AUTH
;
4057 check_cap_issue(in
, issued
);
4059 if (flags
& CEPH_CAP_FLAG_AUTH
) {
4060 if (in
->auth_cap
!= &cap
&&
4061 (!in
->auth_cap
|| ceph_seq_cmp(in
->auth_cap
->mseq
, mseq
) < 0)) {
4062 if (in
->auth_cap
&& in
->flushing_cap_item
.is_on_list()) {
4063 ldout(cct
, 10) << __func__
<< " changing auth cap: "
4064 << "add myself to new auth MDS' flushing caps list" << dendl
;
4065 adjust_session_flushing_caps(in
, in
->auth_cap
->session
, mds_session
);
4067 in
->auth_cap
= &cap
;
4071 unsigned old_caps
= cap
.issued
;
4072 cap
.cap_id
= cap_id
;
4073 cap
.issued
= issued
;
4074 cap
.implemented
|= issued
;
4075 if (ceph_seq_cmp(mseq
, cap
.mseq
) > 0)
4076 cap
.wanted
= wanted
;
4078 cap
.wanted
|= wanted
;
4080 cap
.issue_seq
= seq
;
4082 cap
.gen
= mds_session
->cap_gen
;
4083 cap
.latest_perms
= cap_perms
;
4084 ldout(cct
, 10) << __func__
<< " issued " << ccap_string(old_caps
) << " -> " << ccap_string(cap
.issued
)
4085 << " from mds." << mds
4089 if ((issued
& ~old_caps
) && in
->auth_cap
== &cap
) {
4090 // non-auth MDS is revoking the newly grant caps ?
4091 for (auto &p
: in
->caps
) {
4092 if (&p
.second
== &cap
)
4094 if (p
.second
.implemented
& ~p
.second
.issued
& issued
) {
4095 check_caps(in
, CHECK_CAPS_NODELAY
);
4101 if (issued
& ~old_caps
)
4102 signal_cond_list(in
->waitfor_caps
);
void Client::remove_cap(Cap *cap, bool queue_release)
{
  auto &in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;

  if (queue_release) {
    session->enqueue_cap_release(
      in.ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in.auth_cap == cap) {
    if (in.flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in.flushing_cap_item.remove_myself();
    }
    in.auth_cap = NULL;
  }
  size_t n = in.caps.erase(mds);
  ceph_assert(n == 1);
  cap = nullptr;

  if (!in.is_any_caps()) {
    ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
    in.snaprealm_item.remove_myself();
    put_snap_realm(in.snaprealm);
    in.snaprealm = 0;
  }
}

void Client::remove_all_caps(Inode *in)
{
  while (!in->caps.empty())
    remove_cap(&in->caps.begin()->second, true);
}

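/*
 * Drop every cap held through a dying session, flagging inodes that
 * still had dirty or flushing caps and waking their waiters.
 */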
void Client::remove_session_caps(MetaSession *s)
{
  ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    InodeRef in(&cap->inode);
    bool dirty_caps = false;
    if (in->auth_cap == cap) {
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
    if (cap->wanted | cap->issued)
      in->flags |= I_CAP_DROPPED;
    remove_cap(cap, false);
    in->cap_snaps.clear();
    if (dirty_caps) {
      lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
        num_flushing_caps--;
        in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      put_inode(in.get());
    }
    signal_cond_list(in->waitfor_caps);
  }
  s->flushing_caps_tids.clear();
  sync_cond.notify_all();
}

4180 int Client::_do_remount(bool retry_on_error
)
4182 uint64_t max_retries
= g_conf().get_val
<uint64_t>("mds_max_retries_on_remount_failure");
4185 int r
= remount_cb(callback_handle
);
4187 retries_on_invalidate
= 0;
4190 client_t whoami
= get_nodeid();
4193 "failed to remount (to trim kernel dentries): "
4194 "errno = " << e
<< " (" << strerror(e
) << ")" << dendl
;
4197 "failed to remount (to trim kernel dentries): "
4198 "return code = " << r
<< dendl
;
4201 (cct
->_conf
.get_val
<bool>("client_die_on_failed_remount") ||
4202 cct
->_conf
.get_val
<bool>("client_die_on_failed_dentry_invalidate")) &&
4203 !(retry_on_error
&& (++retries_on_invalidate
< max_retries
));
4204 if (should_abort
&& !unmounting
) {
4205 lderr(cct
) << "failed to remount for kernel dentry trimming; quitting!" << dendl
;
4212 class C_Client_Remount
: public Context
{
4216 explicit C_Client_Remount(Client
*c
) : client(c
) {}
4217 void finish(int r
) override
{
4218 ceph_assert(r
== 0);
4219 client
->_do_remount(true);
4223 void Client::_invalidate_kernel_dcache()
4227 if (can_invalidate_dentries
) {
4228 if (dentry_invalidate_cb
&& root
->dir
) {
4229 for (ceph::unordered_map
<string
, Dentry
*>::iterator p
= root
->dir
->dentries
.begin();
4230 p
!= root
->dir
->dentries
.end();
4232 if (p
->second
->inode
)
4233 _schedule_invalidate_dentry_callback(p
->second
, false);
4236 } else if (remount_cb
) {
4238 // when remounting a file system, linux kernel trims all unused dentries in the fs
4239 remount_finisher
.queue(new C_Client_Remount(this));
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;
      ceph_assert(!dn->inode);
      if (dn->lru_is_expireable())
        unlink(dn, true, false);  // keep dir, drop dentry
    }

    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4268 void Client::trim_caps(MetaSession
*s
, uint64_t max
)
4270 mds_rank_t mds
= s
->mds_num
;
4271 size_t caps_size
= s
->caps
.size();
4272 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " max " << max
4273 << " caps " << caps_size
<< dendl
;
4275 uint64_t trimmed
= 0;
4276 auto p
= s
->caps
.begin();
4277 std::set
<Dentry
*> to_trim
; /* this avoids caps other than the one we're
4278 * looking at from getting deleted during traversal. */
4279 while ((caps_size
- trimmed
) > max
&& !p
.end()) {
4281 InodeRef
in(&cap
->inode
);
4283 // Increment p early because it will be invalidated if cap
4284 // is deleted inside remove_cap
4287 if (in
->caps
.size() > 1 && cap
!= in
->auth_cap
) {
4288 int mine
= cap
->issued
| cap
->implemented
;
4289 int oissued
= in
->auth_cap
? in
->auth_cap
->issued
: 0;
4290 // disposable non-auth cap
4291 if (!(get_caps_used(in
.get()) & ~oissued
& mine
)) {
4292 ldout(cct
, 20) << " removing unused, unneeded non-auth cap on " << *in
<< dendl
;
4293 cap
= (remove_cap(cap
, true), nullptr);
4297 ldout(cct
, 20) << " trying to trim dentries for " << *in
<< dendl
;
4298 _trim_negative_child_dentries(in
);
4300 auto q
= in
->dentries
.begin();
4301 while (q
!= in
->dentries
.end()) {
4304 if (dn
->lru_is_expireable()) {
4305 if (can_invalidate_dentries
&&
4306 dn
->dir
->parent_inode
->ino
== MDS_INO_ROOT
) {
4307 // Only issue one of these per DN for inodes in root: handle
4308 // others more efficiently by calling for root-child DNs at
4309 // the end of this function.
4310 _schedule_invalidate_dentry_callback(dn
, true);
4312 ldout(cct
, 20) << " queueing dentry for trimming: " << dn
->name
<< dendl
;
4315 ldout(cct
, 20) << " not expirable: " << dn
->name
<< dendl
;
4319 if (all
&& in
->ino
!= MDS_INO_ROOT
) {
4320 ldout(cct
, 20) << __func__
<< " counting as trimmed: " << *in
<< dendl
;
4325 ldout(cct
, 20) << " trimming queued dentries: " << dendl
;
4326 for (const auto &dn
: to_trim
) {
4331 caps_size
= s
->caps
.size();
4332 if (caps_size
> (size_t)max
)
4333 _invalidate_kernel_dcache();
void Client::force_session_readonly(MetaSession *s)
{
  s->readonly = true;
  for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
    auto &in = (*p)->inode;
    if (in.caps_wanted() & CEPH_CAP_FILE_WR)
      signal_cond_list(in.waitfor_caps);
  }
}

int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  ceph_assert(flushing);

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  if (!in->flushing_caps) {
    ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  in->flushing_caps |= flushing;
  in->mark_caps_clean();

  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}

void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
{
  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    if (capsnap.flush_tid > 0) {
      old_s->flushing_caps_tids.erase(capsnap.flush_tid);
      new_s->flushing_caps_tids.insert(capsnap.flush_tid);
    }
  }
  for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
       it != in->flushing_cap_tids.end();
       ++it) {
    old_s->flushing_caps_tids.erase(it->first);
    new_s->flushing_caps_tids.insert(it->first);
  }
  new_s->flushing_caps.push_back(&in->flushing_cap_item);
}
/*
 * Flush all caps back to the MDS. Because the callers generally wait on the
 * result of this function (syncfs and umount cases), we set
 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
 */
void Client::flush_caps_sync()
{
  ldout(cct, 10) << __func__ << dendl;
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    delayed_list.pop_front();
    if (p.end() && dirty_list.empty())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }

  // other caps, too
  p = dirty_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    if (p.end())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }
}

void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  while (in->flushing_caps) {
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    ceph_assert(it != in->flushing_cap_tids.end());
    if (it->first > want)
      break;
    ldout(cct, 10) << __func__ << " on " << *in << " flushing "
                   << ccap_string(it->second) << " want " << want
                   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}

void Client::wait_sync_caps(ceph_tid_t want)
{
 retry:
  ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
                 << num_flushing_caps << " total flushing)" << dendl;
  for (auto &p : mds_sessions) {
    MetaSession *s = &p.second;
    if (s->flushing_caps_tids.empty())
      continue;
    ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
    if (oldest_tid <= want) {
      ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
                     << " (want " << want << ")" << dendl;
      std::unique_lock l{client_lock, std::adopt_lock};
      sync_cond.wait(l);
      l.release();
      goto retry;
    }
  }
}
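/*
 * Illustrative usage sketch (not original code): callers that need durability,
 * e.g. a syncfs-style path, are expected to pair the two helpers above roughly
 * like this (hypothetical caller; the real call sites elsewhere in this file
 * may differ in detail):
 *
 *   flush_caps_sync();                      // push dirty caps; last check_caps is SYNCHRONOUS
 *   ceph_tid_t flush_tid = last_flush_tid;  // newest flush tid issued so far
 *   wait_sync_caps(flush_tid);              // block until the MDS has acked up to that tid
 */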
4460 void Client::kick_flushing_caps(Inode
*in
, MetaSession
*session
)
4462 in
->flags
&= ~I_KICK_FLUSH
;
4464 Cap
*cap
= in
->auth_cap
;
4465 ceph_assert(cap
->session
== session
);
4467 ceph_tid_t last_snap_flush
= 0;
4468 for (auto p
= in
->flushing_cap_tids
.rbegin();
4469 p
!= in
->flushing_cap_tids
.rend();
4472 last_snap_flush
= p
->first
;
4477 int wanted
= in
->caps_wanted();
4478 int used
= get_caps_used(in
) | in
->caps_dirty();
4479 auto it
= in
->cap_snaps
.begin();
4480 for (auto& p
: in
->flushing_cap_tids
) {
4482 int msg_flags
= p
.first
< last_snap_flush
? MClientCaps::FLAG_PENDING_CAPSNAP
: 0;
4483 send_cap(in
, session
, cap
, msg_flags
, used
, wanted
, (cap
->issued
| cap
->implemented
),
4486 ceph_assert(it
!= in
->cap_snaps
.end());
4487 ceph_assert(it
->second
.flush_tid
== p
.first
);
4488 send_flush_snap(in
, session
, it
->first
, it
->second
);
4494 void Client::kick_flushing_caps(MetaSession
*session
)
4496 mds_rank_t mds
= session
->mds_num
;
4497 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
4499 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4501 if (in
->flags
& I_KICK_FLUSH
) {
4502 ldout(cct
, 20) << " reflushing caps on " << *in
<< " to mds." << mds
<< dendl
;
4503 kick_flushing_caps(in
, session
);
4508 void Client::early_kick_flushing_caps(MetaSession
*session
)
4510 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4512 Cap
*cap
= in
->auth_cap
;
4515 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4516 // stage. This guarantees that MDS processes the cap flush message before issuing
4517 // the flushing caps to other client.
4518 if ((in
->flushing_caps
& in
->auth_cap
->issued
) == in
->flushing_caps
) {
4519 in
->flags
|= I_KICK_FLUSH
;
4523 ldout(cct
, 20) << " reflushing caps (early_kick) on " << *in
4524 << " to mds." << session
->mds_num
<< dendl
;
4525 // send_reconnect() also will reset these sequence numbers. make sure
4526 // sequence numbers in cap flush message match later reconnect message.
4530 cap
->issued
= cap
->implemented
;
4532 kick_flushing_caps(in
, session
);
void SnapRealm::build_snap_context()
{
  set<snapid_t> snaps;
  snapid_t max_seq = seq;

  // start with prior_parents?
  for (unsigned i = 0; i < prior_parent_snaps.size(); i++)
    snaps.insert(prior_parent_snaps[i]);

  // current parent's snaps
  if (pparent) {
    const SnapContext& psnapc = pparent->get_snap_context();
    for (unsigned i = 0; i < psnapc.snaps.size(); i++)
      if (psnapc.snaps[i] >= parent_since)
        snaps.insert(psnapc.snaps[i]);
    if (psnapc.seq > max_seq)
      max_seq = psnapc.seq;
  }

  // my snaps
  for (unsigned i = 0; i < my_snaps.size(); i++)
    snaps.insert(my_snaps[i]);

  // ok!
  cached_snap_context.seq = max_seq;
  cached_snap_context.snaps.resize(0);
  cached_snap_context.snaps.reserve(snaps.size());
  for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
    cached_snap_context.snaps.push_back(*p);
}

void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
{
  list<SnapRealm*> q;
  q.push_back(realm);

  while (!q.empty()) {
    realm = q.front();
    q.pop_front();

    ldout(cct, 10) << __func__ << " " << *realm << dendl;
    realm->invalidate_cache();

    for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
         p != realm->pchildren.end();
         ++p)
      q.push_back(*p);
  }
}

SnapRealm *Client::get_snap_realm(inodeno_t r)
{
  SnapRealm *realm = snap_realms[r];
  if (!realm)
    snap_realms[r] = realm = new SnapRealm(r);
  ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref
                 << " -> " << (realm->nref + 1) << dendl;
  realm->nref++;
  return realm;
}

SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
{
  if (snap_realms.count(r) == 0) {
    ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
    return NULL;
  }
  SnapRealm *realm = snap_realms[r];
  ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref
                 << " -> " << (realm->nref + 1) << dendl;
  realm->nref++;
  return realm;
}

void Client::put_snap_realm(SnapRealm *realm)
{
  ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
                 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
  if (--realm->nref == 0) {
    snap_realms.erase(realm->ino);
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    delete realm;
  }
}

bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
{
  if (realm->parent != parent) {
    ldout(cct, 10) << __func__ << " " << *realm
                   << " " << realm->parent << " -> " << parent << dendl;
    realm->parent = parent;
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    realm->pparent = get_snap_realm(parent);
    realm->pparent->pchildren.insert(realm);
    return true;
  }
  return false;
}

static bool has_new_snaps(const SnapContext& old_snapc,
                          const SnapContext& new_snapc)
{
  return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
}
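/*
 * Worked example (illustration only): snap ids in a SnapContext are stored
 * newest-first, so snaps[0] is the most recent snapshot.  If old_snapc.seq == 5
 * and new_snapc.snaps == {7, 6, 2}, then snaps[0] == 7 > 5 and has_new_snaps()
 * returns true; with new_snapc.snaps == {5, 2} it returns false, because no
 * snapshot newer than the already-seen seq has appeared.
 */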
4646 void Client::update_snap_trace(const bufferlist
& bl
, SnapRealm
**realm_ret
, bool flush
)
4648 SnapRealm
*first_realm
= NULL
;
4649 ldout(cct
, 10) << __func__
<< " len " << bl
.length() << dendl
;
4651 map
<SnapRealm
*, SnapContext
> dirty_realms
;
4653 auto p
= bl
.cbegin();
4657 SnapRealm
*realm
= get_snap_realm(info
.ino());
4659 bool invalidate
= false;
4661 if (info
.seq() > realm
->seq
) {
4662 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq() << " > " << realm
->seq
4666 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4667 // flush me + children
4670 while (!q
.empty()) {
4671 SnapRealm
*realm
= q
.front();
4674 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
4675 p
!= realm
->pchildren
.end();
4679 if (dirty_realms
.count(realm
) == 0) {
4681 dirty_realms
[realm
] = realm
->get_snap_context();
4687 realm
->seq
= info
.seq();
4688 realm
->created
= info
.created();
4689 realm
->parent_since
= info
.parent_since();
4690 realm
->prior_parent_snaps
= info
.prior_parent_snaps
;
4691 realm
->my_snaps
= info
.my_snaps
;
4695 // _always_ verify parent
4696 if (adjust_realm_parent(realm
, info
.parent()))
4700 invalidate_snaprealm_and_children(realm
);
4701 ldout(cct
, 15) << __func__
<< " " << *realm
<< " self|parent updated" << dendl
;
4702 ldout(cct
, 15) << " snapc " << realm
->get_snap_context() << dendl
;
4704 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq()
4705 << " <= " << realm
->seq
<< " and same parent, SKIPPING" << dendl
;
4709 first_realm
= realm
;
4711 put_snap_realm(realm
);
4714 for (map
<SnapRealm
*, SnapContext
>::iterator q
= dirty_realms
.begin();
4715 q
!= dirty_realms
.end();
4717 SnapRealm
*realm
= q
->first
;
4718 // if there are new snaps ?
4719 if (has_new_snaps(q
->second
, realm
->get_snap_context())) {
4720 ldout(cct
, 10) << " flushing caps on " << *realm
<< dendl
;
4721 xlist
<Inode
*>::iterator r
= realm
->inodes_with_caps
.begin();
4725 queue_cap_snap(in
, q
->second
);
4728 ldout(cct
, 10) << " no new snap on " << *realm
<< dendl
;
4730 put_snap_realm(realm
);
4734 *realm_ret
= first_realm
;
4736 put_snap_realm(first_realm
);
4739 void Client::handle_snap(const MConstRef
<MClientSnap
>& m
)
4741 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
4742 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4743 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4748 got_mds_push(session
);
4750 map
<Inode
*, SnapContext
> to_move
;
4751 SnapRealm
*realm
= 0;
4753 if (m
->head
.op
== CEPH_SNAP_OP_SPLIT
) {
4754 ceph_assert(m
->head
.split
);
4756 auto p
= m
->bl
.cbegin();
4758 ceph_assert(info
.ino() == m
->head
.split
);
4760 // flush, then move, ino's.
4761 realm
= get_snap_realm(info
.ino());
4762 ldout(cct
, 10) << " splitting off " << *realm
<< dendl
;
4763 for (auto& ino
: m
->split_inos
) {
4764 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
4765 if (inode_map
.count(vino
)) {
4766 Inode
*in
= inode_map
[vino
];
4767 if (!in
->snaprealm
|| in
->snaprealm
== realm
)
4769 if (in
->snaprealm
->created
> info
.created()) {
4770 ldout(cct
, 10) << " NOT moving " << *in
<< " from _newer_ realm "
4771 << *in
->snaprealm
<< dendl
;
4774 ldout(cct
, 10) << " moving " << *in
<< " from " << *in
->snaprealm
<< dendl
;
4777 in
->snaprealm_item
.remove_myself();
4778 to_move
[in
] = in
->snaprealm
->get_snap_context();
4779 put_snap_realm(in
->snaprealm
);
4783 // move child snaprealms, too
4784 for (auto& child_realm
: m
->split_realms
) {
4785 ldout(cct
, 10) << "adjusting snaprealm " << child_realm
<< " parent" << dendl
;
4786 SnapRealm
*child
= get_snap_realm_maybe(child_realm
);
4789 adjust_realm_parent(child
, realm
->ino
);
4790 put_snap_realm(child
);
4794 update_snap_trace(m
->bl
, NULL
, m
->head
.op
!= CEPH_SNAP_OP_DESTROY
);
4797 for (auto p
= to_move
.begin(); p
!= to_move
.end(); ++p
) {
4798 Inode
*in
= p
->first
;
4799 in
->snaprealm
= realm
;
4800 realm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4802 // queue for snap writeback
4803 if (has_new_snaps(p
->second
, realm
->get_snap_context()))
4804 queue_cap_snap(in
, p
->second
);
4806 put_snap_realm(realm
);
void Client::handle_quota(const MConstRef<MClientQuota>& m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session);

  ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;

  vinodeno_t vino(m->ino, CEPH_NOSNAP);
  if (inode_map.count(vino)) {
    Inode *in = NULL;
    in = inode_map[vino];

    if (in) {
      in->quota = m->quota;
      in->rstat = m->rstat;
    }
  }
}
4834 void Client::handle_caps(const MConstRef
<MClientCaps
>& m
)
4836 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4837 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4842 if (m
->osd_epoch_barrier
&& !objecter
->have_map(m
->osd_epoch_barrier
)) {
4843 // Pause RADOS operations until we see the required epoch
4844 objecter
->set_epoch_barrier(m
->osd_epoch_barrier
);
4847 if (m
->osd_epoch_barrier
> cap_epoch_barrier
) {
4848 // Record the barrier so that we will transmit it to MDS when releasing
4849 set_cap_epoch_barrier(m
->osd_epoch_barrier
);
4852 got_mds_push(session
);
4855 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
4856 if (auto it
= inode_map
.find(vino
); it
!= inode_map
.end()) {
4859 if (m
->get_op() == CEPH_CAP_OP_IMPORT
) {
4860 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< " on IMPORT, immediately releasing" << dendl
;
4861 session
->enqueue_cap_release(
4868 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< ", dropping" << dendl
;
4871 // in case the mds is waiting on e.g. a revocation
4872 flush_cap_releases();
4876 switch (m
->get_op()) {
4877 case CEPH_CAP_OP_EXPORT
: return handle_cap_export(session
, in
, m
);
4878 case CEPH_CAP_OP_FLUSHSNAP_ACK
: return handle_cap_flushsnap_ack(session
, in
, m
);
4879 case CEPH_CAP_OP_IMPORT
: /* no return */ handle_cap_import(session
, in
, m
);
4882 if (auto it
= in
->caps
.find(mds
); it
!= in
->caps
.end()) {
4883 Cap
&cap
= in
->caps
.at(mds
);
4885 switch (m
->get_op()) {
4886 case CEPH_CAP_OP_TRUNC
: return handle_cap_trunc(session
, in
, m
);
4887 case CEPH_CAP_OP_IMPORT
:
4888 case CEPH_CAP_OP_REVOKE
:
4889 case CEPH_CAP_OP_GRANT
: return handle_cap_grant(session
, in
, &cap
, m
);
4890 case CEPH_CAP_OP_FLUSH_ACK
: return handle_cap_flush_ack(session
, in
, &cap
, m
);
4893 ldout(cct
, 5) << __func__
<< " don't have " << *in
<< " cap on mds." << mds
<< dendl
;
4898 void Client::handle_cap_import(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
4900 mds_rank_t mds
= session
->mds_num
;
4902 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
4903 << " IMPORT from mds." << mds
<< dendl
;
4905 const mds_rank_t peer_mds
= mds_rank_t(m
->peer
.mds
);
4908 if (auto it
= in
->caps
.find(peer_mds
); m
->peer
.cap_id
&& it
!= in
->caps
.end()) {
4910 cap_perms
= cap
->latest_perms
;
4914 SnapRealm
*realm
= NULL
;
4915 update_snap_trace(m
->snapbl
, &realm
);
4917 int issued
= m
->get_caps();
4918 int wanted
= m
->get_wanted();
4919 add_update_cap(in
, session
, m
->get_cap_id(),
4920 issued
, wanted
, m
->get_seq(), m
->get_mseq(),
4921 m
->get_realm(), CEPH_CAP_FLAG_AUTH
, cap_perms
);
4923 if (cap
&& cap
->cap_id
== m
->peer
.cap_id
) {
4924 remove_cap(cap
, (m
->peer
.flags
& CEPH_CAP_FLAG_RELEASE
));
4928 put_snap_realm(realm
);
4930 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
4931 if (!(wanted
& CEPH_CAP_ANY_FILE_WR
) ||
4932 in
->requested_max_size
> m
->get_max_size()) {
4933 in
->requested_max_size
= 0;
4934 ldout(cct
, 15) << "reset requested_max_size after cap import" << dendl
;
4936 // reflush any/all caps (if we are now the auth_cap)
4937 kick_flushing_caps(in
, session
);
4941 void Client::handle_cap_export(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
4943 mds_rank_t mds
= session
->mds_num
;
4945 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
4946 << " EXPORT from mds." << mds
<< dendl
;
4948 auto it
= in
->caps
.find(mds
);
4949 if (it
!= in
->caps
.end()) {
4950 Cap
&cap
= it
->second
;
4951 if (cap
.cap_id
== m
->get_cap_id()) {
4952 if (m
->peer
.cap_id
) {
4953 const auto peer_mds
= mds_rank_t(m
->peer
.mds
);
4954 MetaSession
*tsession
= _get_or_open_mds_session(peer_mds
);
4955 auto it
= in
->caps
.find(peer_mds
);
4956 if (it
!= in
->caps
.end()) {
4957 Cap
&tcap
= it
->second
;
4958 if (tcap
.cap_id
== m
->peer
.cap_id
&&
4959 ceph_seq_cmp(tcap
.seq
, m
->peer
.seq
) < 0) {
4960 tcap
.cap_id
= m
->peer
.cap_id
;
4961 tcap
.seq
= m
->peer
.seq
- 1;
4962 tcap
.issue_seq
= tcap
.seq
;
4963 tcap
.issued
|= cap
.issued
;
4964 tcap
.implemented
|= cap
.issued
;
4965 if (&cap
== in
->auth_cap
)
4966 in
->auth_cap
= &tcap
;
4967 if (in
->auth_cap
== &tcap
&& in
->flushing_cap_item
.is_on_list())
4968 adjust_session_flushing_caps(in
, session
, tsession
);
4971 add_update_cap(in
, tsession
, m
->peer
.cap_id
, cap
.issued
, 0,
4972 m
->peer
.seq
- 1, m
->peer
.mseq
, (uint64_t)-1,
4973 &cap
== in
->auth_cap
? CEPH_CAP_FLAG_AUTH
: 0,
4977 if (cap
.wanted
| cap
.issued
)
4978 in
->flags
|= I_CAP_DROPPED
;
4981 remove_cap(&cap
, false);
void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));

  ldout(cct, 10) << __func__ << " on ino " << *in
                 << " size " << in->size << " -> " << m->get_size()
                 << dendl;

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  update_inode_file_size(in, issued, m->get_size(),
                         m->get_truncate_seq(), m->get_truncate_size());
}
5002 void Client::handle_cap_flush_ack(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
5004 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
5005 int dirty
= m
->get_dirty();
5009 auto it
= in
->flushing_cap_tids
.begin();
5010 if (it
->first
< flush_ack_tid
) {
5011 ldout(cct
, 0) << __func__
<< " mds." << session
->mds_num
5012 << " got unexpected flush ack tid " << flush_ack_tid
5013 << " expected is " << it
->first
<< dendl
;
5015 for (; it
!= in
->flushing_cap_tids
.end(); ) {
5021 if (it
->first
== flush_ack_tid
)
5022 cleaned
= it
->second
;
5023 if (it
->first
<= flush_ack_tid
) {
5024 session
->flushing_caps_tids
.erase(it
->first
);
5025 in
->flushing_cap_tids
.erase(it
++);
5029 cleaned
&= ~it
->second
;
5035 ldout(cct
, 5) << __func__
<< " mds." << session
->mds_num
5036 << " cleaned " << ccap_string(cleaned
) << " on " << *in
5037 << " with " << ccap_string(dirty
) << dendl
;
5040 signal_cond_list(in
->waitfor_caps
);
5041 if (session
->flushing_caps_tids
.empty() ||
5042 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5043 sync_cond
.notify_all();
5047 in
->cap_dirtier_uid
= -1;
5048 in
->cap_dirtier_gid
= -1;
5052 ldout(cct
, 10) << " tid " << m
->get_client_tid() << " != any cap bit tids" << dendl
;
5054 if (in
->flushing_caps
) {
5055 ldout(cct
, 5) << " flushing_caps " << ccap_string(in
->flushing_caps
)
5056 << " -> " << ccap_string(in
->flushing_caps
& ~cleaned
) << dendl
;
5057 in
->flushing_caps
&= ~cleaned
;
5058 if (in
->flushing_caps
== 0) {
5059 ldout(cct
, 10) << " " << *in
<< " !flushing" << dendl
;
5060 num_flushing_caps
--;
5061 if (in
->flushing_cap_tids
.empty())
5062 in
->flushing_cap_item
.remove_myself();
5064 if (!in
->caps_dirty())
5071 void Client::handle_cap_flushsnap_ack(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5073 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
5074 mds_rank_t mds
= session
->mds_num
;
5075 ceph_assert(in
->caps
.count(mds
));
5076 snapid_t follows
= m
->get_snap_follows();
5078 if (auto it
= in
->cap_snaps
.find(follows
); it
!= in
->cap_snaps
.end()) {
5079 auto& capsnap
= it
->second
;
5080 if (flush_ack_tid
!= capsnap
.flush_tid
) {
5081 ldout(cct
, 10) << " tid " << flush_ack_tid
<< " != " << capsnap
.flush_tid
<< dendl
;
5083 InodeRef
tmp_ref(in
);
5084 ldout(cct
, 5) << __func__
<< " mds." << mds
<< " flushed snap follows " << follows
5085 << " on " << *in
<< dendl
;
5086 session
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
5087 in
->flushing_cap_tids
.erase(capsnap
.flush_tid
);
5088 if (in
->flushing_caps
== 0 && in
->flushing_cap_tids
.empty())
5089 in
->flushing_cap_item
.remove_myself();
5090 in
->cap_snaps
.erase(it
);
5092 signal_cond_list(in
->waitfor_caps
);
5093 if (session
->flushing_caps_tids
.empty() ||
5094 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5095 sync_cond
.notify_all();
5098 ldout(cct
, 5) << __func__
<< " DUP(?) mds." << mds
<< " flushed snap follows " << follows
5099 << " on " << *in
<< dendl
;
5100 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
5104 class C_Client_DentryInvalidate
: public Context
{
5111 C_Client_DentryInvalidate(Client
*c
, Dentry
*dn
, bool del
) :
5112 client(c
), name(dn
->name
) {
5113 if (client
->use_faked_inos()) {
5114 dirino
.ino
= dn
->dir
->parent_inode
->faked_ino
;
5116 ino
.ino
= dn
->inode
->faked_ino
;
5118 dirino
= dn
->dir
->parent_inode
->vino();
5120 ino
= dn
->inode
->vino();
5123 ino
.ino
= inodeno_t();
5125 void finish(int r
) override
{
5126 // _async_dentry_invalidate is responsible for its own locking
5127 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
5128 client
->_async_dentry_invalidate(dirino
, ino
, name
);
void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
{
  if (unmounting)
    return;
  ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
                 << " in dir " << dirino << dendl;
  dentry_invalidate_cb(callback_handle, dirino, ino, name);
}

void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
{
  if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
    async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
}
5147 void Client::_try_to_trim_inode(Inode
*in
, bool sched_inval
)
5149 int ref
= in
->get_num_ref();
5150 ldout(cct
, 5) << __func__
<< " in " << *in
<<dendl
;
5152 if (in
->dir
&& !in
->dir
->dentries
.empty()) {
5153 for (auto p
= in
->dir
->dentries
.begin();
5154 p
!= in
->dir
->dentries
.end(); ) {
5155 Dentry
*dn
= p
->second
;
5157 /* rmsnap removes whole subtree, need trim inodes recursively.
5158 * we don't need to invalidate dentries recursively. because
5159 * invalidating a directory dentry effectively invalidate
5161 if (in
->snapid
!= CEPH_NOSNAP
&& dn
->inode
&& dn
->inode
->is_dir())
5162 _try_to_trim_inode(dn
->inode
.get(), false);
5164 if (dn
->lru_is_expireable())
5165 unlink(dn
, true, false); // keep dir, drop dentry
5167 if (in
->dir
->dentries
.empty()) {
5173 if (ref
> 0 && (in
->flags
& I_SNAPDIR_OPEN
)) {
5174 InodeRef snapdir
= open_snapdir(in
);
5175 _try_to_trim_inode(snapdir
.get(), false);
5180 auto q
= in
->dentries
.begin();
5181 while (q
!= in
->dentries
.end()) {
5184 if( in
->ll_ref
> 0 && sched_inval
) {
5185 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5186 // so in->dentries doesn't always reflect the state of kernel's dcache.
5187 _schedule_invalidate_dentry_callback(dn
, true);
5189 unlink(dn
, true, true);
5194 void Client::handle_cap_grant(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
5196 mds_rank_t mds
= session
->mds_num
;
5197 int used
= get_caps_used(in
);
5198 int wanted
= in
->caps_wanted();
5200 const unsigned new_caps
= m
->get_caps();
5201 const bool was_stale
= session
->cap_gen
> cap
->gen
;
5202 ldout(cct
, 5) << __func__
<< " on in " << m
->get_ino()
5203 << " mds." << mds
<< " seq " << m
->get_seq()
5204 << " caps now " << ccap_string(new_caps
)
5205 << " was " << ccap_string(cap
->issued
)
5206 << (was_stale
? " (stale)" : "") << dendl
;
5209 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
5210 cap
->seq
= m
->get_seq();
5211 cap
->gen
= session
->cap_gen
;
5213 check_cap_issue(in
, new_caps
);
5217 in
->caps_issued(&issued
);
5218 issued
|= in
->caps_dirty();
5220 if ((new_caps
& CEPH_CAP_AUTH_SHARED
) &&
5221 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
5222 in
->mode
= m
->head
.mode
;
5223 in
->uid
= m
->head
.uid
;
5224 in
->gid
= m
->head
.gid
;
5225 in
->btime
= m
->btime
;
5227 bool deleted_inode
= false;
5228 if ((new_caps
& CEPH_CAP_LINK_SHARED
) &&
5229 !(issued
& CEPH_CAP_LINK_EXCL
)) {
5230 in
->nlink
= m
->head
.nlink
;
5231 if (in
->nlink
== 0 &&
5232 (new_caps
& (CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
)))
5233 deleted_inode
= true;
5235 if (!(issued
& CEPH_CAP_XATTR_EXCL
) &&
5236 m
->xattrbl
.length() &&
5237 m
->head
.xattr_version
> in
->xattr_version
) {
5238 auto p
= m
->xattrbl
.cbegin();
5239 decode(in
->xattrs
, p
);
5240 in
->xattr_version
= m
->head
.xattr_version
;
5243 if ((new_caps
& CEPH_CAP_FILE_SHARED
) && m
->dirstat_is_valid()) {
5244 in
->dirstat
.nfiles
= m
->get_nfiles();
5245 in
->dirstat
.nsubdirs
= m
->get_nsubdirs();
5248 if (new_caps
& CEPH_CAP_ANY_RD
) {
5249 update_inode_file_time(in
, issued
, m
->get_time_warp_seq(),
5250 m
->get_ctime(), m
->get_mtime(), m
->get_atime());
5253 if (new_caps
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
)) {
5254 in
->layout
= m
->get_layout();
5255 update_inode_file_size(in
, issued
, m
->get_size(),
5256 m
->get_truncate_seq(), m
->get_truncate_size());
5259 if (m
->inline_version
> in
->inline_version
) {
5260 in
->inline_data
= m
->inline_data
;
5261 in
->inline_version
= m
->inline_version
;
5264 /* always take a newer change attr */
5265 if (m
->get_change_attr() > in
->change_attr
)
5266 in
->change_attr
= m
->get_change_attr();
5269 if (cap
== in
->auth_cap
&&
5270 (new_caps
& CEPH_CAP_ANY_FILE_WR
) &&
5271 (m
->get_max_size() != in
->max_size
)) {
5272 ldout(cct
, 10) << "max_size " << in
->max_size
<< " -> " << m
->get_max_size() << dendl
;
5273 in
->max_size
= m
->get_max_size();
5274 if (in
->max_size
> in
->wanted_max_size
) {
5275 in
->wanted_max_size
= 0;
5276 in
->requested_max_size
= 0;
5281 if ((was_stale
|| m
->get_op() == CEPH_CAP_OP_IMPORT
) &&
5282 (wanted
& ~(cap
->wanted
| new_caps
))) {
5283 // If mds is importing cap, prior cap messages that update 'wanted'
5284 // may get dropped by mds (migrate seq mismatch).
5286 // We don't send cap message to update 'wanted' if what we want are
5287 // already issued. If mds revokes caps, cap message that releases caps
5288 // also tells mds what we want. But if caps got revoked by mds forcedly
5289 // (session stale). We may haven't told mds what we want.
5295 auto revoked
= cap
->issued
& ~new_caps
;
5297 ldout(cct
, 10) << " revocation of " << ccap_string(revoked
) << dendl
;
5298 cap
->issued
= new_caps
;
5299 cap
->implemented
|= new_caps
;
5301 // recall delegations if we're losing caps necessary for them
5302 if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_RD
))
5303 in
->recall_deleg(false);
5304 else if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_WR
))
5305 in
->recall_deleg(true);
5307 used
= adjust_caps_used_for_lazyio(used
, cap
->issued
, cap
->implemented
);
5308 if ((used
& revoked
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
)) &&
5309 !_flush(in
, new C_Client_FlushComplete(this, in
))) {
5310 // waitin' for flush
5311 } else if (used
& revoked
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
)) {
5315 cap
->wanted
= 0; // don't let check_caps skip sending a response to MDS
5318 } else if (cap
->issued
== new_caps
) {
5319 ldout(cct
, 10) << " caps unchanged at " << ccap_string(cap
->issued
) << dendl
;
5321 ldout(cct
, 10) << " grant, new caps are " << ccap_string(new_caps
& ~cap
->issued
) << dendl
;
5322 cap
->issued
= new_caps
;
5323 cap
->implemented
|= new_caps
;
5325 if (cap
== in
->auth_cap
) {
5326 // non-auth MDS is revoking the newly grant caps ?
5327 for (const auto &p
: in
->caps
) {
5328 if (&p
.second
== cap
)
5330 if (p
.second
.implemented
& ~p
.second
.issued
& new_caps
) {
5343 signal_cond_list(in
->waitfor_caps
);
5345 // may drop inode's last ref
5347 _try_to_trim_inode(in
, true);
int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
{
  if (perms.uid() == 0)
    return 0;

  if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
    int ret = _posix_acl_permission(in, perms, want);
    if (ret != -EAGAIN)
      return ret;
  }

  // check permissions before doing anything else
  if (!in->check_mode(perms, want))
    return -EACCES;
  return 0;
}

int Client::xattr_permission(Inode *in, const char *name, unsigned want,
                             const UserPerm& perms)
{
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  r = 0;
  if (strncmp(name, "system.", 7) == 0) {
    if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
      r = -EPERM;
  } else {
    r = inode_permission(in, perms, want);
  }
out:
  ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
  return r;
}

ostream& operator<<(ostream &out, const UserPerm& perm) {
  out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
  return out;
}

int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
                        const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  r = -EPERM;
  if (mask & CEPH_SETATTR_UID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
        (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
              CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
        check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
        check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
        goto out;
      } else {
        r = inode_permission(in, perms, MAY_WRITE);
        if (r < 0)
          goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}

int Client::may_open(Inode *in, int flags, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  unsigned want = 0;

  if ((flags & O_ACCMODE) == O_WRONLY)
    want = MAY_WRITE;
  else if ((flags & O_ACCMODE) == O_RDWR)
    want = MAY_READ | MAY_WRITE;
  else if ((flags & O_ACCMODE) == O_RDONLY)
    want = MAY_READ;
  if (flags & O_TRUNC)
    want |= MAY_WRITE;

  int r = 0;
  switch (in->mode & S_IFMT) {
    case S_IFLNK:
      r = -ELOOP;
      goto out;
    case S_IFDIR:
      if (want & MAY_WRITE) {
        r = -EISDIR;
        goto out;
      }
      break;
  }

  r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  r = inode_permission(in, perms, want);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}

int Client::may_lookup(Inode *dir, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
  int r = _getattr_for_perm(dir, perms);
  if (r < 0)
    goto out;

  r = inode_permission(dir, perms, MAY_EXEC);
out:
  ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
  return r;
}

int Client::may_create(Inode *dir, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
  int r = _getattr_for_perm(dir, perms);
  if (r < 0)
    goto out;

  r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
out:
  ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
  return r;
}

int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
  int r = _getattr_for_perm(dir, perms);
  if (r < 0)
    goto out;

  r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
  if (r < 0)
    goto out;

  /* 'name == NULL' means rmsnap */
  if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
    InodeRef otherin;
    r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
    if (r < 0)
      goto out;
    if (dir->uid != perms.uid() && otherin->uid != perms.uid())
      r = -EPERM;
  }
out:
  ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
  return r;
}

int Client::may_hardlink(Inode *in, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (perms.uid() == 0 || perms.uid() == in->uid) {
    r = 0;
    goto out;
  }

  r = -EPERM;
  if (!S_ISREG(in->mode))
    goto out;

  if (in->mode & S_ISUID)
    goto out;

  if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
    goto out;

  r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}

int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
{
  int mask = CEPH_STAT_CAP_MODE;
  bool force = false;
  if (acl_type != NO_ACL) {
    mask |= CEPH_STAT_CAP_XATTR;
    force = in->xattr_version == 0;
  }
  return _getattr(in, mask, perms, force);
}

vinodeno_t Client::_get_vino(Inode *in)
{
  /* The caller must hold the client lock */
  return vinodeno_t(in->ino, in->snapid);
}
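/*
 * Illustrative mapping (not part of the original code): may_open() above turns
 * POSIX open(2) flags into the MAY_* bits that inode_permission() checks:
 *
 *   O_RDONLY        -> MAY_READ
 *   O_WRONLY        -> MAY_WRITE
 *   O_RDWR          -> MAY_READ | MAY_WRITE
 *   ... | O_TRUNC   -> additionally MAY_WRITE
 *
 * so, for example, an open with O_RDWR | O_TRUNC ends up calling
 * inode_permission(in, perms, MAY_READ | MAY_WRITE).
 */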
/**
 * Resolve an MDS spec to a list of MDS daemon GIDs.
 *
 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
 * It may be '*' in which case it matches all GIDs.
 *
 * If no error is returned, the `targets` vector will be populated with at least
 * one MDS.
 */
int Client::resolve_mds(
    const std::string &mds_spec,
    std::vector<mds_gid_t> *targets)
{
  ceph_assert(targets != nullptr);

  mds_role_t role;
  std::stringstream ss;
  int role_r = fsmap->parse_role(mds_spec, &role, ss);
  if (role_r == 0) {
    // We got a role, resolve it to a GID
    ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
      << role << "'" << dendl;
    targets->push_back(
        fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
    return 0;
  }

  std::string strtol_err;
  long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
  if (strtol_err.empty()) {
    // It is a possible GID
    const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
    if (fsmap->gid_exists(mds_gid)) {
      ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
      targets->push_back(mds_gid);
    } else {
      lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
                 << dendl;
      return -ENOENT;
    }
  } else if (mds_spec == "*") {
    // It is a wildcard: use all MDSs
    const auto mds_info = fsmap->get_mds_info();

    if (mds_info.empty()) {
      lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
      return -ENOENT;
    }

    for (const auto i : mds_info) {
      targets->push_back(i.first);
    }
  } else {
    // It did not parse as an integer, it is not a wildcard, it must be a name
    const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
    if (mds_gid == 0) {
      lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;

      lderr(cct) << "FSMap: " << *fsmap << dendl;

      return -ENOENT;
    } else {
      ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
                     << "' to GID " << mds_gid << dendl;
      targets->push_back(mds_gid);
    }
  }

  return 0;
}
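/*
 * Usage sketch (hypothetical caller, for illustration only): resolve_mds()
 * accepts any of the spec forms described above, e.g.
 *
 *   std::vector<mds_gid_t> targets;
 *   resolve_mds("4815", &targets);      // a GID, if present in the FSMap
 *   resolve_mds("cephfs:0", &targets);  // filesystem:rank
 *   resolve_mds("a", &targets);         // a daemon name/id
 *   resolve_mds("*", &targets);         // every MDS daemon in the map
 *
 * mds_command() further below uses this to fan a command out to one or more
 * daemons; the example spec values are made up.
 */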
/**
 * Authenticate with mon and establish global ID
 */
int Client::authenticate()
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (monclient->is_authenticated()) {
    return 0;
  }

  client_lock.unlock();
  int r = monclient->authenticate(cct->_conf->client_mount_timeout);
  client_lock.lock();
  if (r < 0) {
    return r;
  }

  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
5678 int Client::fetch_fsmap(bool user
)
5681 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5682 // rather than MDSMap because no one MDSMap contains all the daemons, and
5683 // a `tell` can address any daemon.
5684 version_t fsmap_latest
;
5687 monclient
->get_version("fsmap", &fsmap_latest
, NULL
, &cond
);
5688 client_lock
.unlock();
5691 } while (r
== -EAGAIN
);
5694 lderr(cct
) << "Failed to learn FSMap version: " << cpp_strerror(r
) << dendl
;
5698 ldout(cct
, 10) << __func__
<< " learned FSMap version " << fsmap_latest
<< dendl
;
5701 if (!fsmap_user
|| fsmap_user
->get_epoch() < fsmap_latest
) {
5702 monclient
->sub_want("fsmap.user", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
5703 monclient
->renew_subs();
5704 wait_on_list(waiting_for_fsmap
);
5706 ceph_assert(fsmap_user
);
5707 ceph_assert(fsmap_user
->get_epoch() >= fsmap_latest
);
5709 if (!fsmap
|| fsmap
->get_epoch() < fsmap_latest
) {
5710 monclient
->sub_want("fsmap", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
5711 monclient
->renew_subs();
5712 wait_on_list(waiting_for_fsmap
);
5715 ceph_assert(fsmap
->get_epoch() >= fsmap_latest
);
5717 ldout(cct
, 10) << __func__
<< " finished waiting for FSMap version "
5718 << fsmap_latest
<< dendl
;
5724 * @mds_spec one of ID, rank, GID, "*"
5727 int Client::mds_command(
5728 const std::string
&mds_spec
,
5729 const vector
<string
>& cmd
,
5730 const bufferlist
& inbl
,
5735 std::lock_guard
lock(client_lock
);
5746 r
= fetch_fsmap(false);
5751 // Look up MDS target(s) of the command
5752 std::vector
<mds_gid_t
> targets
;
5753 r
= resolve_mds(mds_spec
, &targets
);
5758 // If daemons are laggy, we won't send them commands. If all
5759 // are laggy then we fail.
5760 std::vector
<mds_gid_t
> non_laggy
;
5761 for (const auto gid
: targets
) {
5762 const auto info
= fsmap
->get_info_gid(gid
);
5763 if (!info
.laggy()) {
5764 non_laggy
.push_back(gid
);
5767 if (non_laggy
.size() == 0) {
5768 *outs
= "All targeted MDS daemons are laggy";
5772 if (metadata
.empty()) {
5773 // We are called on an unmounted client, so metadata
5774 // won't be initialized yet.
5775 populate_metadata("");
5778 // Send commands to targets
5779 C_GatherBuilder
gather(cct
, onfinish
);
5780 for (const auto target_gid
: non_laggy
) {
5781 const auto info
= fsmap
->get_info_gid(target_gid
);
5783 // Open a connection to the target MDS
5784 ConnectionRef conn
= messenger
->connect_to_mds(info
.get_addrs());
5786 // Generate MDSCommandOp state
5787 auto &op
= command_table
.start_command();
5789 op
.on_finish
= gather
.new_sub();
5794 op
.mds_gid
= target_gid
;
5797 ldout(cct
, 4) << __func__
<< ": new command op to " << target_gid
5798 << " tid=" << op
.tid
<< cmd
<< dendl
;
5800 // Construct and send MCommand
5801 auto m
= op
.get_message(monclient
->get_fsid());
5802 conn
->send_message2(std::move(m
));
5809 void Client::handle_command_reply(const MConstRef
<MCommandReply
>& m
)
5811 ceph_tid_t
const tid
= m
->get_tid();
5813 ldout(cct
, 10) << __func__
<< ": tid=" << m
->get_tid() << dendl
;
5815 if (!command_table
.exists(tid
)) {
5816 ldout(cct
, 1) << __func__
<< ": unknown tid " << tid
<< ", dropping" << dendl
;
5820 auto &op
= command_table
.get_command(tid
);
5822 *op
.outbl
= m
->get_data();
5829 op
.on_finish
->complete(m
->r
);
5832 command_table
.erase(tid
);
5835 // -------------------
5838 int Client::subscribe_mdsmap(const std::string
&fs_name
)
5840 int r
= authenticate();
5842 lderr(cct
) << "authentication failed: " << cpp_strerror(r
) << dendl
;
5846 std::string resolved_fs_name
;
5847 if (fs_name
.empty()) {
5848 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_fs");
5849 if (resolved_fs_name
.empty())
5850 // Try the backwards compatibility fs name option
5851 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_mds_namespace");
5853 resolved_fs_name
= fs_name
;
5856 std::string want
= "mdsmap";
5857 if (!resolved_fs_name
.empty()) {
5858 r
= fetch_fsmap(true);
5861 fscid
= fsmap_user
->get_fs_cid(resolved_fs_name
);
5862 if (fscid
== FS_CLUSTER_ID_NONE
) {
5866 std::ostringstream oss
;
5867 oss
<< want
<< "." << fscid
;
5870 ldout(cct
, 10) << "Subscribing to map '" << want
<< "'" << dendl
;
5872 monclient
->sub_want(want
, 0, 0);
5873 monclient
->renew_subs();
5878 int Client::mount(const std::string
&mount_root
, const UserPerm
& perms
,
5879 bool require_mds
, const std::string
&fs_name
)
5881 std::lock_guard
lock(client_lock
);
5884 ldout(cct
, 5) << "already mounted" << dendl
;
5890 int r
= subscribe_mdsmap(fs_name
);
5892 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
5896 tick(); // start tick
5900 auto availability
= mdsmap
->is_cluster_available();
5901 if (availability
== MDSMap::STUCK_UNAVAILABLE
) {
5903 ldout(cct
, 10) << "mds cluster unavailable: epoch=" << mdsmap
->get_epoch() << dendl
;
5904 return CEPH_FUSE_NO_MDS_UP
;
5905 } else if (availability
== MDSMap::AVAILABLE
) {
5906 // Continue to mount
5908 } else if (availability
== MDSMap::TRANSIENT_UNAVAILABLE
) {
5909 // Else, wait. MDSMonitor will update the map to bring
5910 // us to a conclusion eventually.
5911 wait_on_list(waiting_for_mdsmap
);
5913 // Unexpected value!
5919 populate_metadata(mount_root
.empty() ? "/" : mount_root
);
5921 filepath
fp(CEPH_INO_ROOT
);
5922 if (!mount_root
.empty()) {
5923 fp
= filepath(mount_root
.c_str());
5926 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
5927 req
->set_filepath(fp
);
5928 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
5929 int res
= make_request(req
, perms
);
5931 if (res
== -EACCES
&& root
) {
5932 ldout(cct
, 1) << __func__
<< " EACCES on parent of mount point; quotas may not work" << dendl
;
5950 if (!cct
->_conf
->client_trace
.empty()) {
5951 traceout
.open(cct
->_conf
->client_trace
.c_str());
5952 if (traceout
.is_open()) {
5953 ldout(cct
, 1) << "opened trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
5955 ldout(cct
, 1) << "FAILED to open trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
5960 ldout(cct, 3) << "op: // client trace data structs" << dendl;
5961 ldout(cct, 3) << "op: struct stat st;" << dendl;
5962 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
5963 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
5964 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
5965 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
5966 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
5967 ldout(cct, 3) << "op: int fd;" << dendl;
5974 void Client::_close_sessions()
5976 while (!mds_sessions
.empty()) {
5977 // send session closes!
5978 for (auto &p
: mds_sessions
) {
5979 if (p
.second
.state
!= MetaSession::STATE_CLOSING
) {
5980 _close_mds_session(&p
.second
);
5984 // wait for sessions to close
5985 ldout(cct
, 2) << "waiting for " << mds_sessions
.size() << " mds sessions to close" << dendl
;
5986 std::unique_lock l
{client_lock
, std::adopt_lock
};
void Client::flush_mdlog_sync()
{
  if (mds_requests.empty())
    return;
  for (auto &p : mds_sessions) {
    flush_mdlog(&p.second);
  }
}

void Client::flush_mdlog(MetaSession *session)
{
  // Only send this to Luminous or newer MDS daemons, older daemons
  // will crash if they see an unknown CEPH_SESSION_* value in this msg.
  const uint64_t features = session->con->get_features();
  if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
    auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
    session->con->send_message2(std::move(m));
  }
}
6013 void Client::_abort_mds_sessions(int err
)
6015 for (auto p
= mds_requests
.begin(); p
!= mds_requests
.end(); ) {
6016 auto req
= p
->second
;
6018 // unsafe requests will be removed during close session below.
6019 if (req
->got_unsafe
)
6023 if (req
->caller_cond
) {
6025 req
->caller_cond
->notify_all();
6029 // Process aborts on any requests that were on this waitlist.
6030 // Any requests that were on a waiting_for_open session waitlist
6031 // will get kicked during close session below.
6032 signal_cond_list(waiting_for_mdsmap
);
6034 // Force-close all sessions
6035 while(!mds_sessions
.empty()) {
6036 auto& session
= mds_sessions
.begin()->second
;
6037 _closed_mds_session(&session
);
6041 void Client::_unmount(bool abort
)
6043 std::unique_lock lock
{client_lock
, std::adopt_lock
};
6047 if (abort
|| blacklisted
) {
6048 ldout(cct
, 2) << "unmounting (" << (abort
? "abort)" : "blacklisted)") << dendl
;
6050 ldout(cct
, 2) << "unmounting" << dendl
;
6057 // Abort all mds sessions
6058 _abort_mds_sessions(-ENOTCONN
);
6060 objecter
->op_cancel_writes(-ENOTCONN
);
6062 // flush the mdlog for pending requests, if any
6066 mount_cond
.wait(lock
, [this] {
6067 if (!mds_requests
.empty()) {
6068 ldout(cct
, 10) << "waiting on " << mds_requests
.size() << " requests"
6071 return mds_requests
.empty();
6074 timer
.cancel_event(tick_event
);
6079 // clean up any unclosed files
6080 while (!fd_map
.empty()) {
6081 Fh
*fh
= fd_map
.begin()->second
;
6082 fd_map
.erase(fd_map
.begin());
6083 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *fh
->inode
<< dendl
;
6087 while (!ll_unclosed_fh_set
.empty()) {
6088 set
<Fh
*>::iterator it
= ll_unclosed_fh_set
.begin();
6090 ll_unclosed_fh_set
.erase(fh
);
6091 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *(fh
->inode
) << dendl
;
6095 while (!opened_dirs
.empty()) {
6096 dir_result_t
*dirp
= *opened_dirs
.begin();
6097 ldout(cct
, 0) << " destroyed lost open dir " << dirp
<< " on " << *dirp
->inode
<< dendl
;
6103 mount_cond
.wait(lock
, [this] {
6104 if (unsafe_sync_write
> 0) {
6105 ldout(cct
, 0) << unsafe_sync_write
<< " unsafe_sync_writes, waiting"
6108 return unsafe_sync_write
<= 0;
6111 if (cct
->_conf
->client_oc
) {
6112 // flush/release all buffered data
6113 std::list
<InodeRef
> anchor
;
6114 for (auto& p
: inode_map
) {
6115 Inode
*in
= p
.second
;
6117 ldout(cct
, 0) << "null inode_map entry ino " << p
.first
<< dendl
;
6121 // prevent inode from getting freed
6122 anchor
.emplace_back(in
);
6124 if (abort
|| blacklisted
) {
6125 objectcacher
->purge_set(&in
->oset
);
6126 } else if (!in
->caps
.empty()) {
6128 _flush(in
, new C_Client_FlushComplete(this, in
));
6133 if (abort
|| blacklisted
) {
6134 for (auto p
= dirty_list
.begin(); !p
.end(); ) {
6137 if (in
->dirty_caps
) {
6138 ldout(cct
, 0) << " drop dirty caps on " << *in
<< dendl
;
6139 in
->mark_caps_clean();
6145 wait_sync_caps(last_flush_tid
);
6151 while (lru
.lru_get_size() > 0 ||
6152 !inode_map
.empty()) {
6153 ldout(cct
, 2) << "cache still has " << lru
.lru_get_size()
6154 << "+" << inode_map
.size() << " items"
6155 << ", waiting (for caps to release?)"
6157 if (auto r
= mount_cond
.wait_for(lock
, ceph::make_timespan(5));
6158 r
== std::cv_status::timeout
) {
6162 ceph_assert(lru
.lru_get_size() == 0);
6163 ceph_assert(inode_map
.empty());
6166 if (!cct
->_conf
->client_trace
.empty()) {
6167 ldout(cct
, 1) << "closing trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6176 ldout(cct
, 2) << "unmounted." << dendl
;
void Client::unmount()
{
  std::lock_guard lock(client_lock);
  _unmount(false);
}

void Client::abort_conn()
{
  std::lock_guard lock(client_lock);
  _unmount(true);
}

void Client::flush_cap_releases()
{
  // send any cap releases
  for (auto &p : mds_sessions) {
    auto &session = p.second;
    if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
          p.first)) {
      if (cct->_conf->client_inject_release_failure) {
        ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
      } else {
        session.con->send_message2(std::move(session.release));
      }
      session.release.reset();
    }
  }
}

void Client::tick()
{
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    sleep(cct->_conf->client_debug_inject_tick_delay);
    ceph_assert(0 == cct->_conf.set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf.apply_changes(nullptr);
  }

  ldout(cct, 21) << "tick" << dendl;
  tick_event = timer.add_event_after(
    cct->_conf->client_tick_interval,
    new LambdaContext([this](int) {
        // Called back via Timer, which takes client_lock for us
        ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
        tick();
      }));

  utime_t now = ceph_clock_now();

  if (!mounted && !mds_requests.empty()) {
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->notify_all();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (auto &p : mds_sessions) {
        signal_context_list(p.second.waiting_for_open);
      }
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}

void Client::renew_caps()
{
  ldout(cct, 10) << "renew_caps()" << dendl;
  last_cap_renew = ceph_clock_now();

  for (auto &p : mds_sessions) {
    ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
    if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
      renew_caps(&p.second);
  }
}

void Client::renew_caps(MetaSession *session)
{
  ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
  session->last_cap_renew_request = ceph_clock_now();
  uint64_t seq = ++session->cap_renew_seq;
  session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
}
// ===============================================================
// high level (POSIXy) interface

int Client::_do_lookup(Inode *dir, const string& name, int mask,
                       InodeRef *target, const UserPerm& perms)
{
  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
    mask |= DEBUG_GETATTR_CAPS;
  req->head.args.getattr.mask = mask;

  ldout(cct, 10) << __func__ << " on " << path << dendl;

  int r = make_request(req, perms, target);
  ldout(cct, 10) << __func__ << " res is " << r << dendl;
  return r;
}
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
                    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (dname == "..") {
    if (dir->dentries.empty()) {
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
      filepath path(dir->ino);
      req->set_filepath(path);

      InodeRef tmptarget;
      int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());

      if (r == 0) {
        Inode *tempino = tmptarget.get();
        *target = tempino;
        ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
      } else {
        *target = dir;
      }
    } else {
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    }
    goto done;
  }

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << __func__ << " have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
                   << " seq " << dn->lease_seq << dendl;

    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
          dn->lease_ttl > now &&
          mds_sessions.count(dn->lease_mds)) {
        MetaSession &s = mds_sessions.at(dn->lease_mds);
        if (s.cap_ttl > now &&
            s.cap_gen == dn->lease_gen) {
          // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
          // make trim_caps() behave.
          dir->try_touch_cap(dn->lease_mds);
          goto hit_dn;
        }
        ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
                       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir shared caps?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
        if (dn->cap_shared_gen == dir->shared_gen &&
            (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
          goto hit_dn;
        if (!dn->inode && (dir->flags & I_COMPLETE)) {
          ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
                         << *dir << " dn '" << dname << "'" << dendl;
          return -ENOENT;
        }
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
        (dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    r = -ENOENT;
  }
 done:
  if (r < 0)
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
int Client::get_or_create(Inode *dir, const char* name,
                          Dentry **pdn, bool expect_null)
{
  // lookup
  ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
  dir->open_dir();
  if (dir->dir->dentries.count(name)) {
    Dentry *dn = dir->dir->dentries[name];

    // is dn lease valid?
    utime_t now = ceph_clock_now();
    if (dn->inode &&
        dn->lease_mds >= 0 &&
        dn->lease_ttl > now &&
        mds_sessions.count(dn->lease_mds)) {
      MetaSession &s = mds_sessions.at(dn->lease_mds);
      if (s.cap_ttl > now &&
          s.cap_gen == dn->lease_gen) {
        if (expect_null)
          return -EEXIST;
      }
    }
    *pdn = dn;
  } else {
    // otherwise link up a new one
    *pdn = link(dir->dir, name, NULL, NULL);
  }

  // success
  return 0;
}
int Client::path_walk(const filepath& origpath, InodeRef *end,
                      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  ceph_assert(cur);

  ldout(cct, 10) << __func__ << " " << path << dendl;

  int symlinks = 0;

  unsigned i = 0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << "  (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
        return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
        return -ELOOP;
      }

      if (i < path.depth() - 1) {
        // dir symlink
        // replace consumed components of path with symlink dir target
        filepath resolved(next->symlink.c_str());
        resolved.append(path.postfixpath(i + 1));
        path = resolved;
        i = 0;
        if (next->symlink[0] == '/') {
          cur = root;
        }
        continue;
      } else if (followsym) {
        if (next->symlink[0] == '/') {
          path = next->symlink.c_str();
          i = 0;
          // reset position
          cur = root;
        } else {
          filepath more(next->symlink.c_str());
          // we need to remove the symlink component from off of the path
          // before adding the target that the symlink points to. remain
          // at the same position in the path.
          path.pop_dentry();
          path.append(more);
        }
        continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "link" << std::endl;
  tout(cct) << relexisting << std::endl;
  tout(cct) << relpath << std::endl;

  filepath existing(relexisting);

  InodeRef in, dir;
  int r = path_walk(existing, &in, perm, true);
  if (r < 0)
    return r;
  if (std::string(relpath) == "/") {
    return -EEXIST;
  }
  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();

  r = path_walk(path, &dir, perm, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    if (S_ISDIR(in->mode)) {
      return -EPERM;
    }
    r = may_hardlink(in.get(), perm);
    if (r < 0)
      return r;
    r = may_create(dir.get(), perm);
    if (r < 0)
      return r;
  }
  r = _link(in.get(), dir.get(), name.c_str(), perm);
  return r;
}
int Client::unlink(const char *relpath, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  if (std::string(relpath) == "/")
    return -EISDIR;

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;
  int r = path_walk(path, &dir, perm);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    r = may_delete(dir.get(), name.c_str(), perm);
    if (r < 0)
      return r;
  }
  return _unlink(dir.get(), name.c_str(), perm);
}
int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relfrom << std::endl;
  tout(cct) << relto << std::endl;

  if (std::string(relfrom) == "/" || std::string(relto) == "/")
    return -EBUSY;

  filepath from(relfrom);
  filepath to(relto);
  string fromname = from.last_dentry();
  from.pop_dentry();
  string toname = to.last_dentry();
  to.pop_dentry();

  InodeRef fromdir, todir;
  int r = path_walk(from, &fromdir, perm);
  if (r < 0)
    return r;
  r = path_walk(to, &todir, perm);
  if (r < 0)
    return r;

  if (cct->_conf->client_permissions) {
    int r = may_delete(fromdir.get(), fromname.c_str(), perm);
    if (r < 0)
      return r;
    r = may_delete(todir.get(), toname.c_str(), perm);
    if (r < 0 && r != -ENOENT)
      return r;
  }
  r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
  return r;
}
int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;
  ldout(cct, 10) << __func__ << ": " << relpath << dendl;

  if (std::string(relpath) == "/")
    return -EEXIST;

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;
  int r = path_walk(path, &dir, perm);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    r = may_create(dir.get(), perm);
    if (r < 0)
      return r;
  }
  return _mkdir(dir.get(), name.c_str(), mode, perm);
}
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  cur = cwd;
  for (i = 0; i < path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
        break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  if (r != -ENOENT) return r;
  ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i < path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
        return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    if (-EEXIST == r && i < path.depth() - 1) {
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << __func__ << ": successfully created directory "
                   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
int Client::rmdir(const char *relpath, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  if (std::string(relpath) == "/")
    return -EBUSY;

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;
  int r = path_walk(path, &dir, perms);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    int r = may_delete(dir.get(), name.c_str(), perms);
    if (r < 0)
      return r;
  }
  return _rmdir(dir.get(), name.c_str(), perms);
}
int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  if (std::string(relpath) == "/")
    return -EEXIST;

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;
  int r = path_walk(path, &dir, perms);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    int r = may_create(dir.get(), perms);
    if (r < 0)
      return r;
  }
  return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
}
int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << target << std::endl;
  tout(cct) << relpath << std::endl;

  if (std::string(relpath) == "/")
    return -EEXIST;

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;
  int r = path_walk(path, &dir, perms);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    int r = may_create(dir.get(), perms);
    if (r < 0)
      return r;
  }
  return _symlink(dir.get(), name.c_str(), target, perms);
}
int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;

  return _readlink(in.get(), buf, size);
}

int Client::_readlink(Inode *in, char *buf, size_t size)
{
  if (!in->is_symlink())
    return -EINVAL;

  // copy into buf (at most size bytes)
  int r = in->symlink.length();
  if (r > (int)size)
    r = size;
  memcpy(buf, in->symlink.c_str(), r);
  return r;
}
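/*
 * Hedged usage note (caller-side sketch, not part of this file): like
 * readlink(2), _readlink() copies at most `size` bytes and does not
 * NUL-terminate, so a caller would typically do something along the lines of
 *
 *   char buf[PATH_MAX];
 *   int n = client->readlink("/some/link", buf, sizeof(buf), perms);
 *   if (n >= 0)
 *     std::string target(buf, n);   // length comes from the return value
 */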
int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
{
  bool yes = in->caps_issued_mask(mask, true);

  ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
  if (yes && !force)
    return 0;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);
  req->head.args.getattr.mask = mask;

  int res = make_request(req, perms);
  ldout(cct, 10) << __func__ << " result=" << res << dendl;
  return res;
}
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
                        const UserPerm& perms, InodeRef *inp)
{
  int issued = in->caps_issued();

  ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if ((mask & CEPH_SETATTR_SIZE) &&
      (unsigned long)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
                              perms)) {
    return -EDQUOT;
  }

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
                   << " != cap dirtier " << in->cap_dirtier_uid << ":"
                   << in->cap_dirtier_gid << ", forcing sync setattr"
                   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure.
     */
    if (!mask)
      mask |= CEPH_SETATTR_CTIME;
    goto force_request;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    if (issued & CEPH_CAP_AUTH_EXCL)
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;

    if (mask & CEPH_SETATTR_UID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
      ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
    }
    if (mask & CEPH_SETATTR_GID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
      ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
    }

    if (mask & CEPH_SETATTR_MODE) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
      ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
    } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      /* Must squash the any setuid/setgid bits with an ownership change */
      in->mode &= ~(S_ISUID|S_ISGID);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    }

    if (mask & CEPH_SETATTR_BTIME) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
      ldout(cct,10) << "changing btime to " << in->btime << dendl;
    }
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
  }

  if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
    if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
      if (mask & CEPH_SETATTR_MTIME)
        in->mtime = utime_t(stx->stx_mtime);
      if (mask & CEPH_SETATTR_ATIME)
        in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
    }
  }
  if (!mask)
    return 0;

force_request:
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  if (mask & CEPH_SETATTR_KILL_SGUID) {
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MODE) {
    req->head.args.setattr.mode = stx->stx_mode;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
  }
  if (mask & CEPH_SETATTR_UID) {
    req->head.args.setattr.uid = stx->stx_uid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
  }
  if (mask & CEPH_SETATTR_GID) {
    req->head.args.setattr.gid = stx->stx_gid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
  }
  if (mask & CEPH_SETATTR_BTIME) {
    req->head.args.setattr.btime = utime_t(stx->stx_btime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MTIME) {
    req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_ATIME) {
    req->head.args.setattr.atime = utime_t(stx->stx_atime);
    req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_SIZE) {
    if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
      req->head.args.setattr.size = stx->stx_size;
      ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    } else { //too big!
      put_request(req);
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -EFBIG;
    }
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  req->head.args.setattr.mask = mask;

  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
/* Note that we only care about attrs that setattr cares about */
void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
{
  stx->stx_size = st->st_size;
  stx->stx_mode = st->st_mode;
  stx->stx_uid = st->st_uid;
  stx->stx_gid = st->st_gid;
#ifdef __APPLE__
  stx->stx_mtime = st->st_mtimespec;
  stx->stx_atime = st->st_atimespec;
#else
  stx->stx_mtime = st->st_mtim;
  stx->stx_atime = st->st_atim;
#endif
}
int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
                       const UserPerm& perms, InodeRef *inp)
{
  int ret = _do_setattr(in, stx, mask, perms, inp);
  if (ret < 0)
    return ret;
  if (mask & CEPH_SETATTR_MODE)
    ret = _posix_acl_chmod(in, stx->stx_mode, perms);
  return ret;
}

int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
                      const UserPerm& perms)
{
  mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
           CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
           CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
           CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
  if (cct->_conf->client_permissions) {
    int r = may_setattr(in.get(), stx, mask, perms);
    if (r < 0)
      return r;
  }
  return __setattrx(in.get(), stx, mask, perms);
}

int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
                     const UserPerm& perms)
{
  struct ceph_statx stx;

  stat_to_statx(attr, &stx);
  mask &= ~CEPH_SETATTR_BTIME;

  if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
    mask &= ~CEPH_SETATTR_UID;
  }
  if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
    mask &= ~CEPH_SETATTR_GID;
  }

  return _setattrx(in, &stx, mask, perms);
}
int Client::setattr(const char *relpath, struct stat *attr, int mask,
                    const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mask << std::endl;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  return _setattr(in, attr, mask, perms);
}

int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
                     const UserPerm& perms, int flags)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mask << std::endl;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
  if (r < 0)
    return r;
  return _setattrx(in, stx, mask, perms);
}

int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mask << std::endl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  return _setattr(f->inode, attr, mask, perms);
}

int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mask << std::endl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  return _setattrx(f->inode, stx, mask, perms);
}
int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
                 frag_info_t *dirstat, int mask)
{
  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "stat" << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, true, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
{
  unsigned mask = 0;

  /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
  if (flags & AT_NO_ATTR_SYNC)
    goto out;

  /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
  mask |= CEPH_CAP_PIN;
  if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
    mask |= CEPH_CAP_AUTH_SHARED;
  if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
    mask |= CEPH_CAP_LINK_SHARED;
  if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
    mask |= CEPH_CAP_FILE_SHARED;
  if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
    mask |= CEPH_CAP_XATTR_SHARED;

out:
  return mask;
}
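/*
 * A quick illustration of the mapping above (hedged sketch; the call is
 * hypothetical, only the constants come from this file): a caller asking for
 * just mtime and size, e.g.
 *
 *   unsigned m = statx_to_mask(0, CEPH_STATX_MTIME | CEPH_STATX_SIZE);
 *   // m == CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED
 *
 * only forces a file-shared cap check, while asking for CEPH_STATX_CTIME or
 * CEPH_STATX_VERSION pulls in the AUTH, LINK, FILE and XATTR shared caps,
 * since ctime and change_attr can be bumped through any of those.
 */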
int Client::statx(const char *relpath, struct ceph_statx *stx,
                  const UserPerm& perms,
                  unsigned int want, unsigned int flags)
{
  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " want " << want << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "statx" << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);

  int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
  if (r < 0)
    return r;

  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }

  fill_statx(in, mask, stx);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
  return r;
}
int Client::lstat(const char *relpath, struct stat *stbuf,
                  const UserPerm& perms, frag_info_t *dirstat, int mask)
{
  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
                 << " mode 0" << oct << in->mode << dec
                 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    switch (in->nlink) {
      case 0:
        st->st_nlink = 0; /* dir is unlinked */
        break;
      case 1:
        st->st_nlink = 1 /* parent dentry */
                       + 1
                       + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
        break;
      default:
        ceph_abort();
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    st->st_blocks = (in->size + 511) >> 9;
  }
  st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
                 << " mode 0" << oct << in->mode << dec
                 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  if (mask & CEPH_CAP_LINK_SHARED) {
    if (in->is_dir()) {
      switch (in->nlink) {
        case 0:
          stx->stx_nlink = 0; /* dir is unlinked */
          break;
        case 1:
          stx->stx_nlink = 1 /* parent dentry */
                           + 1
                           + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
          break;
        default:
          ceph_abort();
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      if (cct->_conf->client_dirsize_rbytes)
        stx->stx_size = in->rstat.rbytes;
      else
        stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
                      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }
}
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}

int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
}

int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mode << std::endl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
}

int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
}
int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
                  const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
}

int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  int mask = 0;
  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
  return _setattr(f->inode, &attr, mask, perms);
}

int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
                   const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  int mask = 0;
  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
  return _setattr(in, &attr, mask, perms);
}
static void attr_set_atime_and_mtime(struct stat *attr,
                                     const utime_t &atime,
                                     const utime_t &mtime)
{
  stat_set_atime_sec(attr, atime.tv.tv_sec);
  stat_set_atime_nsec(attr, atime.tv.tv_nsec);
  stat_set_mtime_sec(attr, mtime.tv.tv_sec);
  stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
}

// for [l]utime() invoke the timeval variant as the timespec
// variant are not yet implemented. for futime[s](), invoke
// the timespec variant.
int Client::utime(const char *relpath, struct utimbuf *buf,
                  const UserPerm& perms)
{
  struct timeval tv[2];
  tv[0].tv_sec  = buf->actime;
  tv[0].tv_usec = 0;
  tv[1].tv_sec  = buf->modtime;
  tv[1].tv_usec = 0;

  return utimes(relpath, tv, perms);
}

int Client::lutime(const char *relpath, struct utimbuf *buf,
                   const UserPerm& perms)
{
  struct timeval tv[2];
  tv[0].tv_sec  = buf->actime;
  tv[0].tv_usec = 0;
  tv[1].tv_sec  = buf->modtime;
  tv[1].tv_usec = 0;

  return lutimes(relpath, tv, perms);
}

int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
{
  struct timespec ts[2];
  ts[0].tv_sec  = buf->actime;
  ts[0].tv_nsec = 0;
  ts[1].tv_sec  = buf->modtime;
  ts[1].tv_nsec = 0;

  return futimens(fd, ts, perms);
}
int Client::utimes(const char *relpath, struct timeval times[2],
                   const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
            << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
            << std::endl;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}

int Client::lutimes(const char *relpath, struct timeval times[2],
                    const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
            << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
            << std::endl;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
{
  struct timespec ts[2];
  ts[0].tv_sec  = times[0].tv_sec;
  ts[0].tv_nsec = times[0].tv_usec * 1000;
  ts[1].tv_sec  = times[1].tv_sec;
  ts[1].tv_nsec = times[1].tv_usec * 1000;

  return futimens(fd, ts, perms);
}

int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
            << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
            << std::endl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
int Client::flock(int fd, int operation, uint64_t owner)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << operation << std::endl;
  tout(cct) << owner << std::endl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  return _flock(f, operation, owner);
}
int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    int r = may_open(in.get(), O_RDONLY, perms);
    if (r < 0)
      return r;
  }
  r = _opendir(in.get(), dirpp, perms);
  /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
  if (r != -ENOTDIR)
    tout(cct) << (unsigned long)*dirpp << std::endl;
  return r;
}

int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
{
  if (!in->is_dir())
    return -ENOTDIR;
  *dirpp = new dir_result_t(in, perms);
  opened_dirs.insert(*dirpp);
  ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
  return 0;
}
int Client::closedir(dir_result_t *dir)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
  _closedir(dir);
  return 0;
}

void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
void Client::rewinddir(dir_result_t *dirp)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;

  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  _readdir_drop_dirp_buffer(d);
  d->reset();
}

loff_t Client::telldir(dir_result_t *dirp)
{
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}

void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;

  if (offset == dirp->offset)
    return;

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    if (offset == 0 ||
        dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
        dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
// struct dirent {
//   ino_t          d_ino;       /* inode number */
//   off_t          d_off;       /* offset to the next dirent */
//   unsigned short d_reclen;    /* length of this record */
//   unsigned char  d_type;      /* type of file */
//   char           d_name[256]; /* filename */
// };
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
  de->d_ino = ino;
#if !defined(__APPLE__) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  de->d_reclen = 1;
  de->d_type = IFTODT(type);
  ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
                 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
}
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);
  }
}

void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  ceph_assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}

void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << " " << dirp << dendl;
  dirp->buffer.clear();
}
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  ceph_assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
                 << " offset " << hex << dirp->offset << dec << dendl;

  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  if (dirp->last_name.length()) {
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -EAGAIN) {
    ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
                   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};

int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
                              int caps, bool getref)
{
  ceph_assert(ceph_mutex_is_locked(client_lock));
  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
                 << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
                 << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
                                                  dir->readdir_cache.end(),
                                                  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int idx = pd - dir->readdir_cache.begin();
    int r = _getattr(dn->inode, caps, dirp->perms);
    if (r < 0)
      return r;

    // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
    pd = dir->readdir_cache.begin() + idx;
    if (pd >= dir->readdir_cache.end() || *pd != dn)
      return -EAGAIN;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    if (getref) {
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
                   << " = " << r << dendl;
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer match cache index
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
                         unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  std::lock_guard lock(client_lock);

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
                 << dec << " at_end=" << dirp->at_end()
                 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dentries.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
                 << " snapid " << dirp->inode->snapid << " (complete && ordered) "
                 << dirp->inode->is_complete_and_ordered()
                 << " issued " << ccap_string(dirp->inode->caps_issued())
                 << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -EAGAIN)
      return err;
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
        return r;
      // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
                   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
                                    dirp->offset, dir_result_t::dentry_off_lt());
         it != dirp->buffer.end();
         ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
        r = _getattr(entry.inode, caps, dirp->perms);
        if (r < 0)
          return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
        inode = entry.inode.get();
        _ll_get(inode);
      }

      client_lock.unlock();
      r = cb(p, &de, &stx, next_off, inode);  // _next_ offset
      client_lock.lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
                     << " = " << r << dendl;
      if (r < 0)
        return r;

      dirp->offset = next_off;
      if (r > 0)
        return r;
    }

    if (dirp->next_offset > 2) {
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    if (diri->shared_gen == dirp->start_shared_gen &&
        diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
        ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
        if (diri->dir) {
          ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
          diri->dir->readdir_cache.resize(dirp->cache_index);
        }
        diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
        ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
        diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();
  return 0;
}
int Client::readdir_r(dir_result_t *d, struct dirent *de)
{
  return readdirplus_r(d, de, 0, 0, 0, NULL);
}

/*
 * readdirplus_r
 *
 * returns
 *  1 if we got a dirent
 *  0 for end of directory
 * <0 on error
 */

struct single_readdir {
  struct dirent *de;
  struct ceph_statx *stx;
  Inode *inode;
  bool full;
};

static int _readdir_single_dirent_cb(void *p, struct dirent *de,
                                     struct ceph_statx *stx, off_t off,
                                     Inode *in)
{
  single_readdir *c = static_cast<single_readdir *>(p);

  if (c->full)
    return -1;  // already filled this dirent

  *c->de = *de;
  if (c->stx)
    *c->stx = *stx;
  c->inode = in;
  c->full = true;
  return 1;
}

struct dirent *Client::readdir(dir_result_t *d)
{
  int ret;
  static struct dirent de;
  single_readdir sr;
  sr.de = &de;
  sr.stx = NULL;
  sr.inode = NULL;
  sr.full = false;

  // our callback fills the dirent and sets sr.full=true on first
  // call, and returns -1 the second time around.
  ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
  if (ret < -1) {
    errno = -ret;  // this sucks.
    return (dirent *) NULL;
  }
  if (sr.full)
    return &de;
  return (dirent *) NULL;
}

int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
                          struct ceph_statx *stx, unsigned want,
                          unsigned flags, Inode **out)
{
  single_readdir sr;
  sr.de = de;
  sr.stx = stx;
  sr.inode = NULL;
  sr.full = false;

  // our callback fills the dirent and sets sr.full=true on first
  // call, and returns -1 the second time around.
  int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
  if (r < -1)
    return r;
  if (out)
    *out = sr.inode;
  if (sr.full)
    return 1;
  return 0;
}
struct getdents_result {
  char *buf;
  int buflen;
  int pos;
  bool fullent;
};

static int _readdir_getdent_cb(void *p, struct dirent *de,
                               struct ceph_statx *stx, off_t off, Inode *in)
{
  struct getdents_result *c = static_cast<getdents_result *>(p);

  int dlen;
  if (c->fullent)
    dlen = sizeof(*de);
  else
    dlen = strlen(de->d_name) + 1;

  if (c->pos + dlen > c->buflen)
    return -1;  // doesn't fit

  if (c->fullent) {
    memcpy(c->buf + c->pos, de, sizeof(*de));
  } else {
    memcpy(c->buf + c->pos, de->d_name, dlen);
  }
  c->pos += dlen;
  return 0;
}

int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
{
  getdents_result gr;
  gr.buf = buf;
  gr.buflen = buflen;
  gr.fullent = fullent;
  gr.pos = 0;

  int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);

  if (r < 0) { // some error
    if (r == -1) { // buffer ran out of space
      if (gr.pos) { // but we got some entries already!
        return gr.pos;
      } // or we need a larger buffer
      return -ERANGE;
    } else { // actual error, return it
      return r;
    }
  }
  return gr.pos;
}


/* getdir */
struct getdir_result {
  list<string> *contents;
  int num;
};

static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
{
  getdir_result *r = static_cast<getdir_result *>(p);

  r->contents->push_back(de->d_name);
  r->num++;
  return 0;
}

int Client::getdir(const char *relpath, list<string>& contents,
                   const UserPerm& perms)
{
  ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
  {
    std::lock_guard lock(client_lock);
    tout(cct) << "getdir" << std::endl;
    tout(cct) << relpath << std::endl;
  }

  dir_result_t *d;
  int r = opendir(relpath, &d, perms);
  if (r < 0)
    return r;

  getdir_result gr;
  gr.contents = &contents;
  gr.num = 0;
  r = readdir_r_cb(d, _getdir_cb, (void *)&gr);

  closedir(d);

  if (r < 0)
    return r;
  return gr.num;
}
/****** file i/o **********/
int Client::open(const char *relpath, int flags, const UserPerm& perms,
                 mode_t mode, int stripe_unit, int stripe_count,
                 int object_size, const char *data_pool)
{
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

#if defined(__linux__) && defined(O_PATH)
  /* When the O_PATH is being specified, others flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
   * in kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  Fh *fh = NULL;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  if (r == -ENOENT && (flags & O_CREAT)) {
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
                  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
        goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
                stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
        goto out;
    }
  }

  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate a integer file descriptor
    ceph_assert(fh);
    r = get_fd();
    ceph_assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}

int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
{
  /* Use default file striping parameters */
  return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
}
int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
                        const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
  filepath path(ino);
  req->set_filepath(path);

  uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
  char f[30];
  sprintf(f, "%u", h);
  filepath path2(dirino);
  path2.push_dentry(string(f));
  req->set_filepath2(path2);

  int r = make_request(req, perms, NULL, NULL,
                       rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
  return r;
}
/**
 * Load inode into local cache.
 *
 * If inode pointer is non-NULL, and take a reference on
 * the resulting Inode object in one operation, so that caller
 * can safely assume inode will still be there after return.
 */
int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  ldout(cct, 8) << __func__ << " enter(" << ino << ")" << dendl;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(ino);
  req->set_filepath(path);

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    vinodeno_t vino(ino, CEPH_NOSNAP);
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    ceph_assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);
  }
  ldout(cct, 8) << __func__ << " exit(" << ino << ") = " << r << dendl;
  return r;
}

int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  std::lock_guard lock(client_lock);
  return _lookup_ino(ino, perms, inode);
}
/**
 * Find the parent inode of `ino` and insert it into
 * our cache. Conditionally also set `parent` to a referenced
 * Inode* if caller provides non-NULL value.
 */
int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
  filepath path(ino->ino);
  req->set_filepath(path);

  InodeRef target;
  int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
  // Give caller a reference to the parent ino if they provided a pointer.
  if (parent != NULL) {
    if (r == 0) {
      *parent = target.get();
      _ll_get(*parent);
      ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
    } else {
      *parent = NULL;
    }
  }
  ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}

/**
 * Populate the parent dentry for `ino`, provided it is
 * a child of `parent`.
 */
int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  ceph_assert(parent->is_dir());
  ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}

int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  return _lookup_name(ino, parent, perms);
}
Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
{
  ceph_assert(in);
  Fh *f = new Fh(in, flags, cmode, perms);

  ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    in->snap_cap_refs++;
    ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
                  << ccap_string(in->caps_issued()) << dendl;
  }

  const auto& conf = cct->_conf;
  f->readahead.set_trigger_requests(1);
  f->readahead.set_min_readahead_size(conf->client_readahead_min);
  uint64_t max_readahead = Readahead::NO_LIMIT;
  if (conf->client_readahead_max_bytes) {
    max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
  }
  if (conf->client_readahead_max_periods) {
    max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
  }
  f->readahead.set_max_readahead_size(max_readahead);
  vector<uint64_t> alignments;
  alignments.push_back(in->layout.get_period());
  alignments.push_back(in->layout.stripe_unit);
  f->readahead.set_alignments(alignments);

  return f;
}
*f
)
8725 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
8726 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
8727 Inode
*in
= f
->inode
.get();
8728 ldout(cct
, 8) << __func__
<< " " << f
<< " mode " << f
->mode
<< " on " << *in
<< dendl
;
8732 if (in
->snapid
== CEPH_NOSNAP
) {
8733 if (in
->put_open_ref(f
->mode
)) {
8734 _flush(in
, new C_Client_FlushComplete(this, in
));
8738 ceph_assert(in
->snap_cap_refs
> 0);
8739 in
->snap_cap_refs
--;
8742 _release_filelocks(f
);
8744 // Finally, read any async err (i.e. from flushes)
8745 int err
= f
->take_async_err();
8747 ldout(cct
, 1) << __func__
<< " " << f
<< " on inode " << *in
<< " caught async_err = "
8748 << cpp_strerror(err
) << dendl
;
8750 ldout(cct
, 10) << __func__
<< " " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
8758 void Client::_put_fh(Fh
*f
)
8760 int left
= f
->put();
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
                  const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it affects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;
      if (cmode & CEPH_FILE_MODE_WR)
        need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
        need |= CEPH_CAP_FILE_RD;

      result = get_caps(in, need, want, &have, -1);
      if (result < 0) {
        ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
          " . Denying open: " <<
          cpp_strerror(result) << dendl;
        in->put_open_ref(cmode);
      } else {
        put_cap_ref(in, need);
      }
    }
  }

  if (result >= 0) {
    *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    in->put_open_ref(cmode);
  }

  return result;
}
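// The open fast path above skips the MDS round trip entirely when no
// truncation was requested and the caps implied by the open mode are already
// issued; otherwise a CEPH_MDS_OP_OPEN request is sent, and (when delegations
// are enabled) the client additionally waits for the minimal FILE_RD/FILE_WR
// caps so that other clients get a chance to return any delegations they
// hold before this open completes.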
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}

int Client::close(int fd)
{
  ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "close" << std::endl;
  tout(cct) << fd << std::endl;

  Fh *fh = get_filehandle(fd);
  int err = _release_fh(fh);
  ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
  return err;
}
Client::lseek(int fd
, loff_t offset
, int whence
)
8911 std::lock_guard
lock(client_lock
);
8912 tout(cct
) << "lseek" << std::endl
;
8913 tout(cct
) << fd
<< std::endl
;
8914 tout(cct
) << offset
<< std::endl
;
8915 tout(cct
) << whence
<< std::endl
;
8920 Fh
*f
= get_filehandle(fd
);
8923 #if defined(__linux__) && defined(O_PATH)
8924 if (f
->flags
& O_PATH
)
8927 return _lseek(f
, offset
, whence
);
8930 loff_t
Client::_lseek(Fh
*f
, loff_t offset
, int whence
)
8932 Inode
*in
= f
->inode
.get();
8933 bool whence_check
= false;
8938 whence_check
= true;
8943 whence_check
= true;
8949 whence_check
= true;
8955 int r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
8966 pos
= f
->pos
+ offset
;
8970 pos
= in
->size
+ offset
;
8975 if (offset
< 0 || static_cast<uint64_t>(offset
) >= in
->size
)
8983 if (offset
< 0 || static_cast<uint64_t>(offset
) >= in
->size
)
8990 ldout(cct
, 1) << __func__
<< ": invalid whence value " << whence
<< dendl
;
9000 ldout(cct
, 8) << "_lseek(" << f
<< ", " << offset
<< ", " << whence
<< ") = " << f
->pos
<< dendl
;
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << __func__ << " " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    ceph::condition_variable cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
    std::unique_lock l{client_lock, std::adopt_lock};
    cond.wait(l, [f, me=&cond] {
      return !f->pos_locked && f->pos_waiters.front() == me;
    });
    l.release();
    ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
    ceph_assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}

void Client::unlock_fh_pos(Fh *f)
{
  ldout(cct, 10) << __func__ << " " << f << dendl;
  f->pos_locked = false;
}
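// Waiters in lock_fh_pos() queue a condition variable on f->pos_waiters and
// only proceed once they reach the front of that list and pos_locked is
// clear, so access to the shared file position is granted in FIFO order.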
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
                   OSDMap::file_to_object_locator(in->layout),
                   create_ops,
                   in->snaprealm->get_snap_context(),
                   ceph::real_clock::now(),
                   0, NULL);

  bufferlist inline_version_bl;
  encode(in->inline_version, inline_version_bl);

  ObjectOperation uninline_ops;
  uninline_ops.cmpxattr("inline_version",
                        CEPH_OSD_CMPXATTR_OP_GT,
                        CEPH_OSD_CMPXATTR_MODE_U64,
                        inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
                   OSDMap::file_to_object_locator(in->layout),
                   uninline_ops,
                   in->snaprealm->get_snap_context(),
                   ceph::real_clock::now(),
                   0, onfinish);

  return 0;
}
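// uninline_data() issues two object operations: the first mutate makes sure
// the backing object exists, the second writes the inlined bytes guarded by a
// cmpxattr(inline_version, GT) so a stale uninline attempt cannot clobber
// data written by a newer one; the caller's onfinish fires when the guarded
// write completes.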
// blocking osd interface

int Client::read(int fd, char *buf, loff_t size, loff_t offset)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "read" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  Fh *f = get_filehandle(fd);
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  bufferlist bl;
  int r = _read(f, offset, size, &bl);
  ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
  if (r >= 0) {
    bl.begin().copy(bl.length(), buf);
  }
  return r;
}

int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
{
  return _preadv_pwritev(fd, iov, iovcnt, offset, false);
}
int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  int want, have = 0;
  bool movepos = false;
  std::unique_ptr<C_SaferCond> onuninline;
  int64_t r = 0;
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();
  utime_t lat;
  utime_t start = ceph_clock_now();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  if (offset < 0) {
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  if (in->inline_version == 0) {
    r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      goto done;
    ceph_assert(in->inline_version > 0);
  }

retry:
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_CACHE;
  r = get_caps(in, CEPH_CAP_FILE_RD, want, &have, -1);
  if (r < 0)
    goto done;
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      uint32_t len = in->inline_data.length();
      uint64_t endoff = offset + size;
      if (endoff > in->size)
        endoff = in->size;

      if (offset < len) {
        if (endoff <= len) {
          bl->substr_of(in->inline_data, offset, endoff - offset);
        } else {
          bl->substr_of(in->inline_data, offset, len - offset);
          bl->append_zero(endoff - len);
        }
        r = endoff - offset;
      } else if ((uint64_t)offset < endoff) {
        bl->append_zero(endoff - offset);
        r = endoff - offset;
      } else {
        r = 0;
      }
      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      conf->client_oc &&
      (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // short read: verify whether we raced with a truncate/extend.
      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
        goto done;
      if ((uint64_t)offset < in->size)
        goto retry;
    }
  }

success:
  ceph_assert(r >= 0);
  if (movepos) {
    f->pos = start_pos + r;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_read, lat);

done:
  if (onuninline) {
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();
    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else
      r = ret;
  }

  if (have) {
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  }
  if (movepos) {
    unlock_fh_pos(f);
  }
  return r;
}
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->readahead.inc_pending();
}

Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
}

void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
                              off, len, bl, 0, &onfinish);
  if (r == 0) {
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
  }

  if (f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
                     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
                                       readahead_extent.first, readahead_extent.second,
                                       NULL, 0, onfinish2);
      if (r2 == 0) {
        ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
        get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
        ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
        delete onfinish2;
      }
    }
  }

  return r;
}
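// The readahead issued above is fire-and-forget: it populates the object
// cacher with a NULL bufferlist, and the C_Readahead completion releases the
// FILE_RD|FILE_CACHE cap references that were taken when the prefetch was
// started.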
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
                       bool *checkeof)
{
  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  while (left > 0) {
    C_SaferCond onfinish("Client::_read_sync flock");
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
                      pos, left, &tbl, 0,
                      in->truncate_size, in->truncate_seq,
                      &onfinish);
    client_lock.unlock();
    int r = onfinish.wait();
    client_lock.lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();
      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
        // zero up to known EOF
        int64_t some = in->size - pos;
        if (some > left)
          some = left;
        auto z = buffer::ptr_node::create(some);
        z->zero();
        bl->push_back(std::move(z));
        read += some;
        pos += some;
        left -= some;
      } else
        *checkeof = true;
      return read;
    }
  }
  return read;
}
/*
 * we keep count of uncommitted sync writes on the inode, so that
 * unmount can wait for them to drain.
 */
void Client::_sync_write_commit(Inode *in)
{
  ceph_assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << __func__ << " unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    ldout(cct, 10) << __func__ << " -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.notify_all();
  }
}
int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "write" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  Fh *fh = get_filehandle(fd);
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _write(fh, offset, size, buf, NULL, false);
  ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
  return r;
}

int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
{
  return _preadv_pwritev(fd, iov, iovcnt, offset, true);
}
int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
                                       unsigned iovcnt, int64_t offset, bool write,
                                       bool clamp_to_int)
{
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }

  /*
   * Some of the API functions take 64-bit size values, but only return
   * 32-bit signed integers. Clamp the I/O sizes in those functions so that
   * we don't do I/Os larger than the values we can return.
   */
  if (clamp_to_int) {
    totallen = std::min(totallen, (loff_t)INT_MAX);
  }

  if (write) {
    int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int64_t r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    auto iter = bl.cbegin();
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
      iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
      resid -= round_size;
      /* iter is self-updating */
    }
    return r;
  }
}

int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
{
  std::lock_guard lock(client_lock);
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;

  Fh *fh = get_filehandle(fd);
  return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
}
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
                       const struct iovec *iov, int iovcnt)
{
  uint64_t fpos = 0;

  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  ceph_assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      auto r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
        unlock_fh_pos(f);
        return r;
      }
    }
    offset = f->pos;
    fpos = offset + size;
    unlock_fh_pos(f);
  }

  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
                                                   f->actor_perms)) {
    return -EDQUOT;
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    ceph_assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov) {
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
        bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int want, have;
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_BUFFER;
  int64_t r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc &&
      (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
                                 in->snaprealm->get_snap_context(),
                                 offset, size, bl, ceph::real_clock::now(),
                                 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    C_SaferCond onfinish("Client::_write flock");
    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
                       offset, size, bl, ceph::real_clock::now(), 0,
                       in->truncate_size, in->truncate_seq,
                       &onfinish);
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    _sync_write_commit(in);
    if (r < 0)
      goto done;
  }

  // if we get here, write was successful, update client metadata
success:
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  if (fpos) {
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int64_t)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (nullptr != onuninline) {
    client_lock.unlock();
    int uninline_ret = onuninline->wait();
    client_lock.lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
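// Write path summary: small writes to inline-capable files are applied
// directly to in->inline_data while FILE_BUFFER caps are held, and larger or
// uncached ones first push the inline data out via uninline_data(); regular
// file data goes through the ObjectCacher when buffering caps are available,
// and otherwise through a blocking Filer::write_trunc() tracked by
// unsafe_sync_write so that fsync/unmount can wait for it.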
int Client::_flush(Fh *f)
{
  Inode *in = f->inode.get();
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
                  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  return err;
}

int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
{
  struct ceph_statx stx;
  stx.stx_size = length;
  return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
}

int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << length << std::endl;

  Fh *f = get_filehandle(fd);
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  attr.st_size = length;
  return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
}
int Client::fsync(int fd, bool syncdataonly)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "fsync" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << syncdataonly << std::endl;

  Fh *f = get_filehandle(fd);
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  int r = _fsync(f, syncdataonly);
  if (r == 0) {
    // The IOs in this fsync were okay, but maybe something happened
    // in the background that we should be reporting?
    r = f->take_async_err();
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
                  << ") = 0, async_err = " << r << dendl;
  } else {
    // Assume that an error we encountered during fsync, even reported
    // synchronously, would also have applied the error to the Fh, and we
    // should clear it here to avoid returning the same error again on next
    // call.
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
                  << r << dendl;
    f->take_async_err();
  }
  return r;
}

int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;

    wait_on_list(req->waitfor_safe);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
                     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    wait_sync_caps(in, flush_tid);
    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
                  << cpp_strerror(-r) << dendl;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_fsync, lat);

  return r;
}

int Client::_fsync(Fh *f, bool syncdataonly)
{
  ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
  return _fsync(f->inode.get(), syncdataonly);
}
int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
  tout(cct) << fd << std::endl;

  Fh *f = get_filehandle(fd);
  int r = _getattr(f->inode, mask, perms);
  if (r < 0)
    return r;
  fill_stat(f->inode, stbuf, NULL);
  ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
  return r;
}

int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
                   unsigned int want, unsigned int flags)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
  tout(cct) << fd << std::endl;

  Fh *f = get_filehandle(fd);

  unsigned mask = statx_to_mask(flags, want);

  int r = 0;
  if (mask && !f->inode->caps_issued_mask(mask, true)) {
    r = _getattr(f->inode, mask, perms);
    if (r < 0) {
      ldout(cct, 3) << "fstatx exit on error!" << dendl;
      return r;
    }
  }

  fill_statx(f->inode, mask, stx);
  ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
  return r;
}
// not written yet, but i want to link!

int Client::chdir(const char *relpath, std::string &new_cwd,
                  const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "chdir" << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;

  if (!(in.get()->is_dir()))
    return -ENOTDIR;

  if (cwd != in)
    cwd.swap(in);
  ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;

  _getcwd(new_cwd, perms);
  return 0;
}

void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << __func__ << " " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked

    // A cwd or ancestor is unlinked
    if (in->dentries.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();
    if (!dn) {
      ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
        break;
      in = cwd.get();
      continue;
    }
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}

void Client::getcwd(string& dir, const UserPerm& perms)
{
  std::lock_guard l(client_lock);
  _getcwd(dir, perms);
}
int Client::statfs(const char *path, struct statvfs *stbuf,
                   const UserPerm& perms)
{
  std::lock_guard l(client_lock);
  tout(cct) << __func__ << std::endl;
  unsigned long int total_files_on_fs;

  ceph_statfs stats;
  C_SaferCond cond;

  const vector<int64_t> &data_pools = mdsmap->get_data_pools();
  if (data_pools.size() == 1) {
    objecter->get_fs_stats(stats, data_pools[0], &cond);
  } else {
    objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
  }

  client_lock.unlock();
  int rval = cond.wait();
  total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
  client_lock.lock();

  if (rval < 0) {
    ldout(cct, 1) << "underlying call to statfs returned error: "
                  << cpp_strerror(rval)
                  << dendl;
    return rval;
  }

  memset(stbuf, 0, sizeof(*stbuf));

  /*
   * we're going to set a block size of 4MB so we can represent larger
   * FSes without overflowing. Additionally convert the space
   * measurements from KB to bytes while making them in terms of
   * blocks. We use 4MB only because it is big enough, and because it
   * actually *is* the (ceph) default block size.
   */
  const int CEPH_BLOCK_SHIFT = 22;
  stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_files = total_files_on_fs;
  stbuf->f_ffree = 0;
  stbuf->f_favail = -1;
  stbuf->f_fsid = -1;       // ??
  stbuf->f_flag = 0;        // ??
  stbuf->f_namemax = NAME_MAX;

  // Usually quota_root will == root_ancestor, but if the mount root has no
  // quota but we can see a parent of it that does have a quota, we'll
  // respect that one instead.
  ceph_assert(root != nullptr);
  Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);

  // get_quota_root should always give us something
  // because client quotas are always enabled
  ceph_assert(quota_root != nullptr);

  if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {

    // Skip the getattr if any sessions are stale, as we don't want to
    // block `df` if this client has e.g. been evicted, or if the MDS cluster
    // is unhealthy.
    if (!_any_stale_sessions()) {
      int r = _getattr(quota_root, 0, perms, true);
      if (r != 0) {
        // Ignore return value: error getting latest inode metadata is not a good
        // reason to break "df".
        lderr(cct) << "Error in getattr on quota root 0x"
                   << std::hex << quota_root->ino << std::dec
                   << " statfs result may be outdated" << dendl;
      }
    }

    // Special case: if there is a size quota set on the Inode acting
    // as the root for this client mount, then report the quota status
    // as the filesystem statistics.
    const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
    const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
    // It is possible for a quota to be exceeded: arithmetic here must
    // handle case where used > total.
    const fsblkcnt_t free = total > used ? total - used : 0;

    stbuf->f_blocks = total;
    stbuf->f_bfree = free;
    stbuf->f_bavail = free;
  } else {
    // General case: report the cluster statistics returned from RADOS. Because
    // multiple pools may be used without one filesystem namespace via
    // layouts, this is the most correct thing we can do.
    stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  }

  return 0;
}
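// Example of the unit conversion above (illustrative numbers only): with
// CEPH_BLOCK_SHIFT = 22 a "block" is 4 MiB, so a pool reporting
// stats.kb = 8388608 (8 GiB expressed in KB) is published as
// 8388608 >> (22 - 10) = 2048 blocks of 4 MiB each.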
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
                         struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << __func__ << " ino " << in->ino
                 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
                 << " type " << fl->l_type << " owner " << owner
                 << " " << fl->l_start << "~" << fl->l_len << dendl;

  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      ceph_filelock filelock;
      auto p = bl.cbegin();
      decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
        fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
        fl->l_type = F_WRLCK;
      else
        fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
        if (!in->fcntl_locks)
          in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
        lock_state = in->fcntl_locks.get();
      } else if (lock_type == CEPH_LOCK_FLOCK) {
        if (!in->flock_locks)
          in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
        lock_state = in->flock_locks.get();
      }
      _update_lock_state(fl, owner, lock_state);

      if (!removing) {
        if (lock_type == CEPH_LOCK_FCNTL) {
          if (!fh->fcntl_locks)
            fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
          lock_state = fh->fcntl_locks.get();
        } else {
          if (!fh->flock_locks)
            fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
          lock_state = fh->flock_locks.get();
        }
        _update_lock_state(fl, owner, lock_state);
      }
    }
  }
  return ret;
}

int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else
    return -EINVAL;

  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
*in
, bufferlist
& bl
)
10259 if (!in
->fcntl_locks
&& !in
->flock_locks
)
10262 unsigned nr_fcntl_locks
= in
->fcntl_locks
? in
->fcntl_locks
->held_locks
.size() : 0;
10263 encode(nr_fcntl_locks
, bl
);
10264 if (nr_fcntl_locks
) {
10265 auto &lock_state
= in
->fcntl_locks
;
10266 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
10267 p
!= lock_state
->held_locks
.end();
10269 encode(p
->second
, bl
);
10272 unsigned nr_flock_locks
= in
->flock_locks
? in
->flock_locks
->held_locks
.size() : 0;
10273 encode(nr_flock_locks
, bl
);
10274 if (nr_flock_locks
) {
10275 auto &lock_state
= in
->flock_locks
;
10276 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
10277 p
!= lock_state
->held_locks
.end();
10279 encode(p
->second
, bl
);
10282 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< ", " << nr_fcntl_locks
10283 << " fcntl locks, " << nr_flock_locks
<< " flock locks" << dendl
;
void Client::_release_filelocks(Fh *fh)
{
  if (!fh->fcntl_locks && !fh->flock_locks)
    return;

  Inode *in = fh->inode.get();
  ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;

  list<pair<int, ceph_filelock> > to_release;

  if (fh->fcntl_locks) {
    auto &lock_state = fh->fcntl_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
        p != lock_state->held_locks.end();
        ++p)
      to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
    lock_state.reset();
  }
  if (fh->flock_locks) {
    auto &lock_state = fh->flock_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
        p != lock_state->held_locks.end();
        ++p)
      to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
    lock_state.reset();
  }

  if (to_release.empty())
    return;

  // mds has already released filelocks if session was closed.
  if (in->caps.empty())
    return;

  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_whence = SEEK_SET;
  fl.l_type = F_UNLCK;

  for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
       p != to_release.end();
       ++p) {
    fl.l_start = p->second.start;
    fl.l_len = p->second.length;
    fl.l_pid = p->second.pid;
    _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
                 p->second.owner, true);
  }
}
void Client::_update_lock_state(struct flock *fl, uint64_t owner,
                                ceph_lock_state_t *lock_state)
{
  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else
    lock_cmd = CEPH_LOCK_UNLOCK;

  ceph_filelock filelock;
  filelock.start = fl->l_start;
  filelock.length = fl->l_len;
  filelock.client = 0;
  // see comment in _do_filelock()
  filelock.owner = owner | (1ULL << 63);
  filelock.pid = fl->l_pid;
  filelock.type = lock_cmd;

  if (filelock.type == CEPH_LOCK_UNLOCK) {
    list<ceph_filelock> activated_locks;
    lock_state->remove_lock(filelock, activated_locks);
  } else {
    bool r = lock_state->add_lock(filelock, false, false, NULL);
    ceph_assert(r);
  }
}
int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
{
  Inode *in = fh->inode.get();
  ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
  int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
  return ret;
}

int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
{
  Inode *in = fh->inode.get();
  ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
  int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
  ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
  return ret;
}

int Client::_flock(Fh *fh, int cmd, uint64_t owner)
{
  Inode *in = fh->inode.get();
  ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;

  int sleep = !(cmd & LOCK_NB);
  cmd &= ~LOCK_NB;

  int type;
  switch (cmd) {
  case LOCK_SH:
    type = F_RDLCK;
    break;
  case LOCK_EX:
    type = F_WRLCK;
    break;
  case LOCK_UN:
    type = F_UNLCK;
    break;
  default:
    return -EINVAL;
  }

  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_type = type;
  fl.l_whence = SEEK_SET;

  int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
  ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
  return ret;
}
int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
{
  /* Since the only thing this does is wrap a call to statfs, and
     statfs takes a lock, it doesn't seem we have a need to split it
     out. */
  return statfs(0, stbuf, perms);
}

void Client::ll_register_callbacks(struct client_callback_args *args)
{
  std::lock_guard l(client_lock);
  ldout(cct, 10) << __func__ << " cb " << args->handle
                 << " invalidate_ino_cb " << args->ino_cb
                 << " invalidate_dentry_cb " << args->dentry_cb
                 << " switch_interrupt_cb " << args->switch_intr_cb
                 << " remount_cb " << args->remount_cb
                 << dendl;
  callback_handle = args->handle;
  if (args->ino_cb) {
    ino_invalidate_cb = args->ino_cb;
    async_ino_invalidator.start();
  }
  if (args->dentry_cb) {
    dentry_invalidate_cb = args->dentry_cb;
    async_dentry_invalidator.start();
  }
  if (args->switch_intr_cb) {
    switch_interrupt_cb = args->switch_intr_cb;
    interrupt_finisher.start();
  }
  if (args->remount_cb) {
    remount_cb = args->remount_cb;
    remount_finisher.start();
  }
  umask_cb = args->umask_cb;
}

int Client::test_dentry_handling(bool can_invalidate)
{
  int r = 0;

  can_invalidate_dentries = can_invalidate;

  if (can_invalidate_dentries) {
    ceph_assert(dentry_invalidate_cb);
    ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
  } else {
    ceph_assert(remount_cb);
    ldout(cct, 1) << "using remount_cb" << dendl;
    r = _do_remount(false);
  }

  return r;
}
int Client::_sync_fs()
{
  ldout(cct, 10) << __func__ << dendl;

  std::unique_ptr<C_SaferCond> cond = nullptr;
  if (cct->_conf->client_oc) {
    cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
    objectcacher->flush_all(cond.get());
  }

  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  if (nullptr != cond) {
    client_lock.unlock();
    ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
    cond->wait();
    ldout(cct, 15) << __func__ << " flush finished" << dendl;
    client_lock.lock();
  }

  return 0;
}

int Client::sync_fs()
{
  std::lock_guard l(client_lock);
  return _sync_fs();
}

int64_t Client::drop_caches()
{
  std::lock_guard l(client_lock);
  return objectcacher->release_all();
}
int Client::_lazyio(Fh *fh, int enable)
{
  Inode *in = fh->inode.get();
  ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;

  if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
    return 0;

  int orig_mode = fh->mode;
  if (enable) {
    fh->mode |= CEPH_FILE_MODE_LAZY;
    in->get_open_ref(fh->mode);
    in->put_open_ref(orig_mode);
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {
    fh->mode &= ~CEPH_FILE_MODE_LAZY;
    in->get_open_ref(fh->mode);
    in->put_open_ref(orig_mode);
  }

  return 0;
}

int Client::lazyio(int fd, int enable)
{
  std::lock_guard l(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  return _lazyio(f, enable);
}

int Client::ll_lazyio(Fh *fh, int enable)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
  tout(cct) << __func__ << std::endl;

  return _lazyio(fh, enable);
}
int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
{
  std::lock_guard l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
                << ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  return 0;
}

int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
{
  std::lock_guard l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
                << ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  if (_release(in)) {
    int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
  }
  return 0;
}
// =============================

int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
{
  std::lock_guard l(client_lock);

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perm);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    r = may_create(in.get(), perm);
    if (r < 0)
      return r;
  }
  Inode *snapdir = open_snapdir(in.get());
  return _mkdir(snapdir, name, 0, perm);
}

int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
{
  std::lock_guard l(client_lock);

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    r = may_delete(in.get(), NULL, perms);
    if (r < 0)
      return r;
  }
  Inode *snapdir = open_snapdir(in.get());
  return _rmdir(snapdir, name, perms);
}

// =============================

int Client::get_caps_issued(int fd)
{
  std::lock_guard lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  return f->inode->caps_issued();
}

int Client::get_caps_issued(const char *path, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  filepath p(path);
  InodeRef in;
  int r = path_walk(p, &in, perms, true);
  if (r < 0)
    return r;
  return in->caps_issued();
}
10680 Inode
*Client::open_snapdir(Inode
*diri
)
10683 vinodeno_t
vino(diri
->ino
, CEPH_SNAPDIR
);
10684 if (!inode_map
.count(vino
)) {
10685 in
= new Inode(this, vino
, &diri
->layout
);
10687 in
->ino
= diri
->ino
;
10688 in
->snapid
= CEPH_SNAPDIR
;
10689 in
->mode
= diri
->mode
;
10690 in
->uid
= diri
->uid
;
10691 in
->gid
= diri
->gid
;
10693 in
->mtime
= diri
->mtime
;
10694 in
->ctime
= diri
->ctime
;
10695 in
->btime
= diri
->btime
;
10696 in
->size
= diri
->size
;
10697 in
->change_attr
= diri
->change_attr
;
10699 in
->dirfragtree
.clear();
10700 in
->snapdir_parent
= diri
;
10701 diri
->flags
|= I_SNAPDIR_OPEN
;
10702 inode_map
[vino
] = in
;
10703 if (use_faked_inos())
10704 _assign_faked_ino(in
);
10705 ldout(cct
, 10) << "open_snapdir created snapshot inode " << *in
<< dendl
;
10707 in
= inode_map
[vino
];
10708 ldout(cct
, 10) << "open_snapdir had snapshot inode " << *in
<< dendl
;
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
                      Inode **out, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << name << std::endl;

  int r = 0;
  if (!fuse_default_permissions) {
    if (strcmp(name, ".") && strcmp(name, "..")) {
      r = may_lookup(parent, perms);
      if (r < 0)
        return r;
    }
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r == 0)
    fill_stat(in, attr);

  ldout(cct, 3) << __func__ << " " << vparent << " " << name
                << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  *out = in.get();
  return r;
}

int Client::ll_lookup_inode(
    struct inodeno_t ino,
    const UserPerm& perms,
    Inode **inode)
{
  ceph_assert(inode != NULL);
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;

  // Num1: get inode and *inode
  int r = _lookup_ino(ino, perms, inode);
  if (r)
    return r;

  ceph_assert(*inode != NULL);

  if (!(*inode)->dentries.empty()) {
    ldout(cct, 8) << __func__ << " dentry already present" << dendl;
    return 0;
  }

  if ((*inode)->is_root()) {
    ldout(cct, 8) << "ino is root, no parent" << dendl;
    return 0;
  }

  // Num2: Request the parent inode, so that we can look up the name
  Inode *parent;
  r = _lookup_parent(*inode, perms, &parent);
  if (r) {
    _ll_forget(*inode, 1);
    return r;
  }

  ceph_assert(parent != NULL);

  // Num3: Finally, get the name (dentry) of the requested inode
  r = _lookup_name(*inode, parent, perms);
  if (r) {
    // Unexpected error
    _ll_forget(parent, 1);
    _ll_forget(*inode, 1);
    return r;
  }

  _ll_forget(parent, 1);
  return 0;
}
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
                       struct ceph_statx *stx, unsigned want, unsigned flags,
                       const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  int r = 0;
  if (!fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r == 0)
    fill_statx(in, mask, stx);

  ldout(cct, 3) << __func__ << " " << vparent << " " << name
                << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  *out = in.get();
  return r;
}

int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
                    unsigned int want, unsigned int flags, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  filepath fp(name, 0);
  InodeRef in;
  int rc;
  unsigned mask = statx_to_mask(flags, want);

  ldout(cct, 3) << __func__ << " " << name << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << name << std::endl;

  rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
  if (rc < 0) {
    /* zero out mask, just in case... */
    stx->stx_mask = 0;
    return rc;
  }

  fill_statx(in, mask, stx);
  *out = in.get();
  return 0;
}
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
    if (in->snapid != CEPH_NOSNAP)
      ll_snap_ref[in->snapid]++;
  }
  in->ll_get();
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}

int Client::_ll_put(Inode *in, uint64_t num)
{
  in->ll_put(num);
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    if (in->snapid != CEPH_NOSNAP) {
      auto p = ll_snap_ref.find(in->snapid);
      ceph_assert(p != ll_snap_ref.end());
      ceph_assert(p->second > 0);
      if (--p->second == 0)
        ll_snap_ref.erase(p);
    }
  }
  return in->ll_ref;
}

void Client::_ll_drop_pins()
{
  ldout(cct, 10) << __func__ << dendl;
  std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    next = it;
    ++next;
    if (in->ll_ref) {
      to_be_put.insert(in);
      _ll_put(in, in->ll_ref);
    }
  }
}

bool Client::_ll_forget(Inode *in, uint64_t count)
{
  inodeno_t ino = in->ino;

  ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  if (unmounting)
    return true;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
                  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}

bool Client::ll_forget(Inode *in, uint64_t count)
{
  std::lock_guard lock(client_lock);
  return _ll_forget(in, count);
}

bool Client::ll_put(Inode *in)
{
  /* ll_forget already takes the lock */
  return ll_forget(in, 1);
}
int Client::ll_get_snap_ref(snapid_t snap)
{
  std::lock_guard lock(client_lock);
  auto p = ll_snap_ref.find(snap);
  if (p != ll_snap_ref.end())
    return p->second;
  return 0;
}

snapid_t Client::ll_get_snapid(Inode *in)
{
  std::lock_guard lock(client_lock);
  return in->snapid;
}

Inode *Client::ll_get_inode(ino_t ino)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vino = _map_faked_ino(ino);
  unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
  if (p == inode_map.end())
    return NULL;
  Inode *in = p->second;
  _ll_get(in);
  return in;
}

Inode *Client::ll_get_inode(vinodeno_t vino)
{
  std::lock_guard lock(client_lock);

  unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
  if (p == inode_map.end())
    return NULL;
  Inode *in = p->second;
  _ll_get(in);
  return in;
}
int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << __func__ << " " << vino << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  if (vino.snapid < CEPH_NOSNAP)
    return 0;
  else
    return _getattr(in, caps, perms);
}

int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);

  if (res == 0)
    fill_stat(in, attr);
  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
  return res;
}

int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
                        unsigned int flags, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  int res = 0;
  unsigned mask = statx_to_mask(flags, want);

  if (mask && !in->caps_issued_mask(mask, true))
    res = _ll_getattr(in, mask, perms);

  if (res == 0)
    fill_statx(in, mask, stx);
  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
  return res;
}

int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
                         const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
                << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  if (!fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}

int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
                        const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  InodeRef target(in);
  int res = _ll_setattrx(in, stx, mask, perms, &target);
  if (res == 0) {
    ceph_assert(in == target.get());
    fill_statx(in, in->caps_issued(), stx);
  }

  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
  return res;
}

int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
                       const UserPerm& perms)
{
  struct ceph_statx stx;
  stat_to_statx(attr, &stx);

  std::lock_guard lock(client_lock);

  InodeRef target(in);
  int res = _ll_setattrx(in, &stx, mask, perms, &target);
  if (res == 0) {
    ceph_assert(in == target.get());
    fill_stat(in, attr);
  }

  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
  return res;
}
int Client::getxattr(const char *path, const char *name, void *value, size_t size,
		     const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
  if (r < 0)
    return r;
  return _getxattr(in, name, value, size, perms);
}

int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
		      const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
  if (r < 0)
    return r;
  return _getxattr(in, name, value, size, perms);
}

int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
		      const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  return _getxattr(f->inode, name, value, size, perms);
}
int Client::listxattr(const char *path, char *list, size_t size,
		      const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
  if (r < 0)
    return r;
  return Client::_listxattr(in.get(), list, size, perms);
}

int Client::llistxattr(const char *path, char *list, size_t size,
		       const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
  if (r < 0)
    return r;
  return Client::_listxattr(in.get(), list, size, perms);
}

int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  return Client::_listxattr(f->inode.get(), list, size, perms);
}
int Client::removexattr(const char *path, const char *name,
			const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  return _removexattr(in, name, perms);
}

int Client::lremovexattr(const char *path, const char *name,
			 const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  return _removexattr(in, name, perms);
}

int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  return _removexattr(f->inode, name, perms);
}
int Client::setxattr(const char *path, const char *name, const void *value,
		     size_t size, int flags, const UserPerm& perms)
{
  _setxattr_maybe_wait_for_osdmap(name, value, size);

  std::lock_guard lock(client_lock);

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  return _setxattr(in, name, value, size, flags, perms);
}

int Client::lsetxattr(const char *path, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  _setxattr_maybe_wait_for_osdmap(name, value, size);

  std::lock_guard lock(client_lock);

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  return _setxattr(in, name, value, size, flags, perms);
}

int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
		      int flags, const UserPerm& perms)
{
  _setxattr_maybe_wait_for_osdmap(name, value, size);

  std::lock_guard lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  return _setxattr(f->inode, name, value, size, flags, perms);
}
int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
		      const UserPerm& perms)
{
  int r;

  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr) {
    r = -ENODATA;

    // Do a force getattr to get the latest quota before returning
    // a value to userspace.
    int flags = 0;
    if (vxattr->flags & VXATTR_RSTAT) {
      flags |= CEPH_STAT_RSTAT;
    }
    r = _getattr(in, flags, perms, true);
    if (r != 0) {
      // Error from getattr!
      goto out;
    }

    // call pointer-to-member function
    char buf[256];
    if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
      r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
    } else {
      r = -ENODATA;
    }

    if (size != 0) {
      if (r > (int)size) {
	r = -ERANGE;
      } else if (r > 0) {
	memcpy(value, buf, r);
      }
    }
    goto out;
  }

  if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
    r = -EOPNOTSUPP;
    goto out;
  }

  r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    string n(name);
    r = -ENODATA;
    if (in->xattrs.count(n)) {
      r = in->xattrs[n].length();
      if (r > 0 && size != 0) {
	if (size >= (unsigned)r)
	  memcpy(value, in->xattrs[n].c_str(), r);
	else
	  r = -ERANGE;
      }
    }
  }
 out:
  ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
  return r;
}
int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
		      const UserPerm& perms)
{
  if (cct->_conf->client_permissions) {
    int r = xattr_permission(in.get(), name, MAY_READ, perms);
    if (r < 0)
      return r;
  }
  return _getxattr(in.get(), name, value, size, perms);
}
int Client::ll_getxattr(Inode *in, const char *name, void *value,
			size_t size, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  if (!fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_READ, perms);
    if (r < 0)
      return r;
  }

  return _getxattr(in, name, value, size, perms);
}
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  bool len_only = (size == 0);
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r != 0) {
    goto out;
  }

  r = 0;
  for (const auto& p : in->xattrs) {
    size_t this_len = p.first.length() + 1;
    r += this_len;
    if (len_only)
      continue;

    if (this_len > size) {
      r = -ERANGE;
      goto out;
    }

    memcpy(name, p.first.c_str(), this_len);
    name += this_len;
    size -= this_len;
  }
 out:
  ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
int Client::ll_listxattr(Inode *in, char *names, size_t size,
			 const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << size << std::endl;

  return _listxattr(in, names, size, perms);
}
int Client::_do_setxattr(Inode *in, const char *name, const void *value,
			 size_t size, int flags, const UserPerm& perms)
{
  int xattr_flags = 0;
  if (!value)
    xattr_flags |= CEPH_XATTR_REMOVE;
  if (flags & XATTR_CREATE)
    xattr_flags |= CEPH_XATTR_CREATE;
  if (flags & XATTR_REPLACE)
    xattr_flags |= CEPH_XATTR_REPLACE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_string2(name);
  req->set_inode(in);
  req->head.args.setxattr.flags = xattr_flags;

  bufferlist bl;
  assert (value || size == 0);
  bl.append((const char*)value, size);
  req->set_data(bl);

  int res = make_request(req, perms);

  ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
    res << dendl;
  return res;
}
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  bool check_realm = false;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (new_mode != in->mode) {
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	if (!S_ISDIR(in->mode))
	  return -EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -EINVAL;
      }
    } else {
      return -EOPNOTSUPP;
    }
  } else {
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr) {
      if (vxattr->readonly)
	return -EOPNOTSUPP;
      if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
	check_realm = true;
    }
  }

  int ret = _do_setxattr(in, name, value, size, flags, perms);
  if (ret >= 0 && check_realm) {
    // check if snaprealm was created for quota inode
    if (in->quota.is_enable() &&
	!(in->snaprealm && in->snaprealm->ino == in->ino))
      ret = -EOPNOTSUPP;
  }

  return ret;
}
int Client::_setxattr(InodeRef &in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  if (cct->_conf->client_permissions) {
    int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }
  return _setxattr(in.get(), name, value, size, flags, perms);
}
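// _setxattr_check_data_pool() below parses the value written to the
// "ceph.*.layout" / "ceph.*.layout.pool" vxattrs and verifies that any pool
// named in it exists in the current OSDMap.  As an illustrative (not
// authoritative) example, a full layout value looks roughly like
//   "stripe_unit=4194304 stripe_count=1 object_size=4194304 pool=cephfs_data"
// where the pool may be given either by name or by numeric id; numeric ids
// are checked with have_pg_pool(), names with lookup_pg_pool_name().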
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p;    // create instance of parser
    std::map<string, string> m;             // map to receive results
    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
      return -EINVAL;
    }
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    int64_t pool;
    try {
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -ENOENT;
      }
    }
  }

  return 0;
}
void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
{
  // Setting the pool in a layout requires an osdmap epoch in the MetaRequest.
  // There is a race where a newly created data pool is not yet known to either
  // the client or the MDS.  Fetch the latest osdmap so the MDS can quickly
  // judge whether it needs a newer osdmap as well.
  if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
      strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
    string rest(strstr(name, "layout"));
    string v((const char*)value, size);
    int r = objecter->with_osdmap([&](const OSDMap& o) {
      return _setxattr_check_data_pool(rest, v, &o);
    });

    if (r == -ENOENT) {
      C_SaferCond ctx;
      objecter->wait_for_latest_osdmap(&ctx);
      ctx.wait();
    }
  }
}
int Client::ll_setxattr(Inode *in, const char *name, const void *value,
			size_t size, int flags, const UserPerm& perms)
{
  _setxattr_maybe_wait_for_osdmap(name, value, size);

  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  if (!fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }
  return _setxattr(in, name, value, size, flags, perms);
}
int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // same xattrs supported by kernel client
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "system.", 7) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5))
    return -EOPNOTSUPP;

  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr && vxattr->readonly)
    return -EOPNOTSUPP;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_filepath2(name);
  req->set_inode(in);

  int res = make_request(req, perms);

  ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
  return res;
}
int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
{
  if (cct->_conf->client_permissions) {
    int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }
  return _removexattr(in.get(), name, perms);
}
int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
  tout(cct) << "ll_removexattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  if (!fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }

  return _removexattr(in, name, perms);
}
bool Client::_vxattrcb_quota_exists(Inode *in)
{
  return in->quota.is_enable() &&
	 in->snaprealm && in->snaprealm->ino == in->ino;
}
size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
{
  return snprintf(val, size,
		  "max_bytes=%lld max_files=%lld",
		  (long long int)in->quota.max_bytes,
		  (long long int)in->quota.max_files);
}
size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
}
size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
}

bool Client::_vxattrcb_layout_exists(Inode *in)
{
  return in->layout != file_layout_t();
}
size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
{
  int r = snprintf(val, size,
		   "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
		   (unsigned long long)in->layout.stripe_unit,
		   (unsigned long long)in->layout.stripe_count,
		   (unsigned long long)in->layout.object_size);
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r += snprintf(val + r, size - r, "%s",
		      o.get_pool_name(in->layout.pool_id).c_str());
      else
	r += snprintf(val + r, size - r, "%" PRIu64,
		      (uint64_t)in->layout.pool_id);
    });
  if (in->layout.pool_ns.length())
    r += snprintf(val + r, size - r, " pool_namespace=%s",
		  in->layout.pool_ns.c_str());
  return r;
}
size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
}
size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
}
size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
}
size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
{
  size_t r;
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r = snprintf(val, size, "%s", o.get_pool_name(
		       in->layout.pool_id).c_str());
      else
	r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
    });
  return r;
}
size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
}
size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
}
size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
}
size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
}
size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
}
size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
}
size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
}
size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
}
size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
		  (long)in->rstat.rctime.nsec());
}
bool Client::_vxattrcb_dir_pin_exists(Inode *in)
{
  return in->dir_pin != -ENODATA;
}
size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%ld", (long)in->dir_pin);
}

bool Client::_vxattrcb_snap_btime_exists(Inode *in)
{
  return !in->snap_btime.is_zero();
}

size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu.%09lu",
		  (long long unsigned)in->snap_btime.sec(),
		  (long unsigned)in->snap_btime.nsec());
}
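// The macros and tables below describe the virtual xattrs ("ceph.*") exposed
// by the client.  Each VXattr entry names the xattr, points at one of the
// _vxattrcb_* getters above, and optionally supplies an exists_cb so the
// attribute is only reported when it is meaningful (for example when a quota
// or a non-default layout has actually been set on the inode).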
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

#define XATTR_NAME_CEPH(_type, _name)					\
{									\
  name: CEPH_XATTR_NAME(_type, _name),					\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,		\
}
#define XATTR_NAME_CEPH2(_type, _name, _flags)				\
{									\
  name: CEPH_XATTR_NAME(_type, _name),					\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,		\
  flags: _flags,							\
}
#define XATTR_LAYOUT_FIELD(_type, _name, _field)			\
{									\
  name: CEPH_XATTR_NAME2(_type, _name, _field),				\
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,		\
  exists_cb: &Client::_vxattrcb_layout_exists,				\
}
#define XATTR_QUOTA_FIELD(_type, _name)					\
{									\
  name: CEPH_XATTR_NAME(_type, _name),					\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,		\
  exists_cb: &Client::_vxattrcb_quota_exists,				\
}
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    exists_cb: &Client::_vxattrcb_layout_exists,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries),
  XATTR_NAME_CEPH(dir, files),
  XATTR_NAME_CEPH(dir, subdirs),
  XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    exists_cb: &Client::_vxattrcb_quota_exists,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  {
    name: "ceph.dir.pin",
    getxattr_cb: &Client::_vxattrcb_dir_pin,
    exists_cb: &Client::_vxattrcb_dir_pin_exists,
  },
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
  },
  { name: "" }     /* Required table terminator */
};
Client::_file_vxattrs
[] = {
11955 name
: "ceph.file.layout",
11956 getxattr_cb
: &Client::_vxattrcb_layout
,
11958 exists_cb
: &Client::_vxattrcb_layout_exists
,
11961 XATTR_LAYOUT_FIELD(file
, layout
, stripe_unit
),
11962 XATTR_LAYOUT_FIELD(file
, layout
, stripe_count
),
11963 XATTR_LAYOUT_FIELD(file
, layout
, object_size
),
11964 XATTR_LAYOUT_FIELD(file
, layout
, pool
),
11965 XATTR_LAYOUT_FIELD(file
, layout
, pool_namespace
),
11967 name
: "ceph.snap.btime",
11968 getxattr_cb
: &Client::_vxattrcb_snap_btime
,
11970 exists_cb
: &Client::_vxattrcb_snap_btime_exists
,
11973 { name
: "" } /* Required table terminator */
const Client::VXattr *Client::_get_vxattrs(Inode *in)
{
  if (in->is_dir())
    return _dir_vxattrs;
  else if (in->is_file())
    return _file_vxattrs;
  return NULL;
}
const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
{
  if (strncmp(name, "ceph.", 5) == 0) {
    const VXattr *vxattr = _get_vxattrs(in);
    if (vxattr) {
      while (!vxattr->name.empty()) {
	if (vxattr->name == name)
	  return vxattr;
	vxattr++;
      }
    }
  }
  return NULL;
}
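// Illustrative example: a getxattr of "ceph.dir.rbytes" on a directory inode
// reaches _match_vxattr(), which scans _dir_vxattrs and returns the
// XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT) entry.  Because that entry
// carries VXATTR_RSTAT, _getxattr() first issues a forced getattr with
// CEPH_STAT_RSTAT so the recursive byte count is fresh before
// _vxattrcb_dir_rbytes() formats the value for userspace.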
int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_readlink " << vino << dendl;
  tout(cct) << "ll_readlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  for (auto dn : in->dentries) {
    touch_dn(dn);
  }

  int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
  ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
  return r;
}
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
		     dev_t rdev, struct stat *attr, Inode **out,
		     const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknod" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mknod " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
		      dev_t rdev, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  unsigned caps = statx_to_mask(flags, want);
  std::lock_guard lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknodx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_statx(in, caps, stx);
    _ll_get(in.get());
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms)
{
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);

  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    if (pool_id > 0xffffffffll)
      return -ERANGE;  // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = cflags | CEPH_O_CREAT;

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if (fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp)
{
  ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
		     struct stat *attr, Inode **out, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdir" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perm);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perm, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdirx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());
  } else {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->set_string2(target);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::ll_symlink(Inode *parent, const char *name, const char *value,
		       struct stat *attr, Inode **out, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
		<< dendl;
  tout(cct) << "ll_symlink" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_symlink " << vparent << " " << name
		<< " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
			Inode **out, struct ceph_statx *stx, unsigned want,
			unsigned flags, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
		<< dendl;
  tout(cct) << "ll_symlinkx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
		<< " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
		<< " uid " << perm.uid() << " gid " << perm.gid()
		<< ")" << dendl;

  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  InodeRef otherin;
  Inode *in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;

  in = otherin.get();
  req->set_other_inode(in);
  in->break_all_delegs();
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
  tout(cct) << "ll_unlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  if (!fuse_default_permissions) {
    int r = may_delete(in, name, perm);
    if (r < 0)
      return r;
  }
  return _unlink(in, name, perm);
}
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
  ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
		<< perms.uid() << " gid " << perms.gid() << ")" << dendl;

  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }

  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);

  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  InodeRef in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  if (op == CEPH_MDS_OP_RMDIR)
    req->set_dentry(de);

  res = _lookup(dir, name, 0, &in, perms);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RMSNAP) {
    unlink(de, true, true);
  }
  req->set_other_inode(in.get());

  res = make_request(req, perms);

  ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
  tout(cct) << "ll_rmdir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  if (!fuse_default_permissions) {
    int r = may_delete(in, name, perms);
    if (r < 0)
      return r;
  }
  return _rmdir(in, name, perms);
}
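// _rename() below has to deal with renames that cross quota realms: when the
// source and destination directories resolve to different quota roots, it
// forces a CEPH_STAT_RSTAT getattr/lookup so the destination root's rstat is
// current, then refuses the rename (-EXDEV for directories, -EDQUOT for
// files) if moving the subtree would push the destination over its
// max_bytes or max_files quota.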
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
{
  ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
		<< todir->ino << " " << toname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")"
		<< dendl;

  if (fromdir->snapid != todir->snapid)
    return -EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -EROFS;
  }

  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);

  InodeRef target;
  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    Inode *fromdir_root = nullptr;
    Inode *todir_root = nullptr;
    int mask = 0;
    bool quota_check = false;
    if (fromdir != todir) {
      fromdir_root =
	fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
      todir_root =
	todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
      if (todir_root->quota.is_enable() && fromdir_root != todir_root) {
	// use CEPH_STAT_RSTAT mask to force send getattr or lookup request
	// to auth MDS to get latest rstat for todir_root and source dir
	// even if their dentry caches and inode caps are satisfied.
	res = _getattr(todir_root, CEPH_STAT_RSTAT, perm, true);
	if (res < 0)
	  goto fail;

	quota_check = true;
	if (oldde->inode && oldde->inode->is_dir()) {
	  mask |= CEPH_STAT_RSTAT;
	}
      }
    }

    res = _lookup(fromdir, fromname, mask, &oldin, perm);
    if (res < 0)
      goto fail;

    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    if (quota_check) {
      int64_t old_bytes, old_files;
      if (oldinode->is_dir()) {
	old_bytes = oldinode->rstat.rbytes;
	old_files = oldinode->rstat.rsize();
      } else {
	old_bytes = oldinode->size;
	old_files = 1;
      }

      bool quota_exceed = false;
      if (todir_root && todir_root->quota.max_bytes &&
	  (old_bytes + todir_root->rstat.rbytes) >= todir_root->quota.max_bytes) {
	ldout(cct, 10) << "_rename (" << oldinode->ino << " bytes="
		       << old_bytes << ") to (" << todir->ino
		       << ") will exceed quota on " << *todir_root << dendl;
	quota_exceed = true;
      }

      if (todir_root && todir_root->quota.max_files &&
	  (old_files + todir_root->rstat.rsize()) >= todir_root->quota.max_files) {
	ldout(cct, 10) << "_rename (" << oldinode->ino << " files="
		       << old_files << ") to (" << todir->ino
		       << ") will exceed quota on " << *todir_root << dendl;
	quota_exceed = true;
      }

      if (quota_exceed) {
	res = (oldinode->is_dir()) ? -EXDEV : -EDQUOT;
	goto fail;
      }
    }

    res = _lookup(todir, toname, 0, &otherin, perm);
    switch (res) {
    case 0:
      {
	Inode *in = otherin.get();
	req->set_other_inode(in);
	in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);

    req->set_inode(todir);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
		      const char *newname, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);
  vinodeno_t vnewparent = _get_vino(newparent);

  ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
		<< vnewparent << " " << newname << dendl;
  tout(cct) << "ll_rename" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << vnewparent.ino.val << std::endl;
  tout(cct) << newname << std::endl;

  if (!fuse_default_permissions) {
    int r = may_delete(parent, name, perm);
    if (r < 0)
      return r;
    r = may_delete(newparent, newname, perm);
    if (r < 0 && r != -ENOENT)
      return r;
  }

  return _rename(parent, name, newparent, newname, perm);
}
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);
  req->set_filepath(path);
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
		    const UserPerm& perm)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);
  vinodeno_t vnewparent = _get_vino(newparent);

  ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
    newname << dendl;
  tout(cct) << "ll_link" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << vnewparent << std::endl;
  tout(cct) << newname << std::endl;

  InodeRef target;

  if (!fuse_default_permissions) {
    if (S_ISDIR(in->mode))
      return -EPERM;

    int r = may_hardlink(in, perm);
    if (r < 0)
      return r;

    r = may_create(newparent, perm);
    if (r < 0)
      return r;
  }

  return _link(in, newparent, newname, perm, &target);
}
int Client::ll_num_osds(void)
{
  std::lock_guard lock(client_lock);
  return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
}
int Client::ll_osdaddr(int osd, uint32_t *addr)
{
  std::lock_guard lock(client_lock);

  entity_addr_t g;
  bool exists = objecter->with_osdmap([&](const OSDMap& o) {
      if (!o.exists(osd))
	return false;
      g = o.get_addrs(osd).front();
      return true;
    });
  if (!exists)
    return -1;
  uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
  *addr = ntohl(nb_addr);
  return 0;
}
uint32_t Client::ll_stripe_unit(Inode *in)
{
  std::lock_guard lock(client_lock);
  return in->layout.stripe_unit;
}

uint64_t Client::ll_snap_seq(Inode *in)
{
  std::lock_guard lock(client_lock);
  return in->snaprealm->seq;
}
int Client::ll_file_layout(Inode *in, file_layout_t *layout)
{
  std::lock_guard lock(client_lock);
  *layout = in->layout;
  return 0;
}

int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
{
  return ll_file_layout(fh->inode.get(), layout);
}
/* Currently we cannot take advantage of redundancy in reads, since we
   would have to go through all possible placement groups (a
   potentially quite large number determined by a hash), and use CRUSH
   to calculate the appropriate set of OSDs for each placement group,
   then index into that.  An array with one entry per OSD is much more
   tractable and works for demonstration purposes. */

int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
			      file_layout_t* layout)
{
  std::lock_guard lock(client_lock);

  inodeno_t ino = in->ino;
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint32_t stripe_count = layout->stripe_count;
  uint64_t stripes_per_object = object_size / su;
  uint64_t stripeno = 0, stripepos = 0;

  if (stripe_count) {
    stripeno = blockno / stripe_count;    // which horizontal stripe        (Y)
    stripepos = blockno % stripe_count;   // which object in the object set (X)
  }
  uint64_t objectsetno = stripeno / stripes_per_object;       // which object set
  uint64_t objectno = objectsetno * stripe_count + stripepos; // object id

  object_t oid = file_object_t(ino, objectno);
  return objecter->with_osdmap([&](const OSDMap& o) {
      ceph_object_layout olayout =
	o.file_to_object_layout(oid, *layout);
      pg_t pg = (pg_t)olayout.ol_pgid;

      vector<int> osds;
      int primary;
      o.pg_to_acting_osds(pg, &osds, &primary);
      return osds[0];
    });
}
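// Illustrative example (hypothetical layout, not taken from a real cluster):
// with stripe_unit = 1 MiB, stripe_count = 4 and object_size = 4 MiB, each
// object holds stripes_per_object = 4 stripe units.  For blockno = 13:
//   stripeno    = 13 / 4 = 3   (horizontal stripe)
//   stripepos   = 13 % 4 = 1   (position within the object set)
//   objectsetno = 3 / 4  = 0
//   objectno    = 0 * 4 + 1 = 1
// so block 13 lives in object 1 of the file, and ll_get_internal_offset()
// below places it at (13 % 4) * 1 MiB = 1 MiB within that object.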
/* Return the offset of the block, internal to the object */

uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
{
  std::lock_guard lock(client_lock);
  file_layout_t *layout = &(in->layout);
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint64_t stripes_per_object = object_size / su;

  return (blockno % stripes_per_object) * su;
}
int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
		       const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_opendir " << vino << dendl;
  tout(cct) << "ll_opendir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  if (!fuse_default_permissions) {
    int r = may_open(in, flags, perms);
    if (r < 0)
      return r;
  }

  int r = _opendir(in, dirpp, perms);
  tout(cct) << (unsigned long)*dirpp << std::endl;

  ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
		<< dendl;
  return r;
}
int Client::ll_releasedir(dir_result_t *dirp)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
  tout(cct) << "ll_releasedir" << std::endl;
  tout(cct) << (unsigned long)dirp << std::endl;

  _closedir(dirp);
  return 0;
}
int Client::ll_fsyncdir(dir_result_t *dirp)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
  tout(cct) << "ll_fsyncdir" << std::endl;
  tout(cct) << (unsigned long)dirp << std::endl;

  return _fsync(dirp->inode.get(), false);
}
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  ceph_assert(!(flags & O_CREAT));

  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  int r;
  if (!fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

 out:
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (unsigned long)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
      " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    if (!fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    if (!fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  ceph_assert(release_r == 0);  // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

 out:
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
int Client::ll_create(Inode *parent, const char *name, mode_t mode,
		      int flags, struct stat *attr, Inode **outp, Fh **fhp,
		      const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  InodeRef in;
  int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
		     fhp, perms);
  if (r >= 0) {
    ceph_assert(in);

    // passing an Inode in outp requires an additional ref
    if (outp) {
      _ll_get(in.get());
      *outp = in.get();
    }
    fill_stat(in, attr);
  } else {
    attr->st_ino = 0;
  }

  return r;
}
int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
		       int oflags, Inode **outp, Fh **fhp,
		       struct ceph_statx *stx, unsigned want, unsigned lflags,
		       const UserPerm& perms)
{
  unsigned caps = statx_to_mask(lflags, want);
  std::lock_guard lock(client_lock);

  InodeRef in;
  int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
  if (r >= 0) {
    ceph_assert(in);

    // passing an Inode in outp requires an additional ref
    if (outp) {
      _ll_get(in.get());
      *outp = in.get();
    }
    fill_statx(in, caps, stx);
  } else {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }

  return r;
}
loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "ll_lseek" << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  return _lseek(fh, offset, whence);
}
int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
  tout(cct) << "ll_read" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;
  tout(cct) << off << std::endl;
  tout(cct) << len << std::endl;

  /* We can't return bytes written larger than INT_MAX, clamp len to that */
  len = std::min(len, (loff_t)INT_MAX);
  return _read(fh, off, len, bl);
}
int Client::ll_read_block(Inode *in, uint64_t blockid,
			  char *buf,
			  uint64_t offset,
			  uint64_t length,
			  file_layout_t* layout)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
		 object_locator_t(layout->pool_id),
		 offset,
		 length,
		 vino.snapid,
		 &bl,
		 CEPH_OSD_FLAG_READ,
		 &onfinish);

  client_lock.unlock();
  int r = onfinish.wait();
  client_lock.lock();

  if (r >= 0) {
    bl.begin().copy(bl.length(), buf);
    r = bl.length();
  }

  return r;
}
/* It appears that the OSD doesn't return success unless the entire
   buffer was written, return the write length on success. */

int Client::ll_write_block(Inode *in, uint64_t blockid,
			   char* buf, uint64_t offset,
			   uint64_t length, file_layout_t* layout,
			   uint64_t snapseq, uint32_t sync)
{
  vinodeno_t vino = ll_get_vino(in);
  int r = 0;
  std::unique_ptr<C_SaferCond> onsafe = nullptr;

  if (true || sync) {
    /* if write is stable, the epilogue is waiting on
     * flock */
    onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
  }
  object_t oid = file_object_t(vino.ino, blockid);
  SnapContext fakesnap;
  ceph::bufferlist bl;
  if (length > 0) {
    bl.push_back(buffer::copy(buf, length));
  }

  ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
		<< dendl;

  fakesnap.seq = snapseq;

  /* lock just in time */
  client_lock.lock();

  objecter->write(oid,
		  object_locator_t(layout->pool_id),
		  offset,
		  length,
		  fakesnap,
		  bl,
		  ceph::real_clock::now(),
		  0,
		  onsafe.get());

  client_lock.unlock();
  if (nullptr != onsafe) {
    r = onsafe->wait();
  }

  if (r < 0) {
    return r;
  } else {
    return length;
  }
}
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  std::lock_guard lock(client_lock);
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
    "~" << len << dendl;
  tout(cct) << "ll_write" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;
  tout(cct) << off << std::endl;
  tout(cct) << len << std::endl;

  /* We can't return bytes written larger than INT_MAX, clamp len to that */
  len = std::min(len, (loff_t)INT_MAX);
  int r = _write(fh, off, len, data, NULL, 0);
  ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
		<< dendl;
  return r;
}
int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
{
  std::lock_guard lock(client_lock);
  return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
}

int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
{
  std::lock_guard lock(client_lock);
  return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
}
int Client::ll_flush(Fh *fh)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_flush" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  return _flush(fh);
}
int Client::ll_fsync(Fh *fh, bool syncdataonly)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_fsync" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  int r = _fsync(fh, syncdataonly);
  if (r) {
    // If we're returning an error, clear it from the FH
    fh->take_async_err();
  }
  return r;
}
int Client::ll_sync_inode(Inode *in, bool syncdataonly)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
  tout(cct) << "ll_sync_inode" << std::endl;
  tout(cct) << (unsigned long)in << std::endl;

  return _fsync(in, syncdataonly);
}
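// _fallocate() below implements both preallocation and FALLOC_FL_PUNCH_HOLE.
// For punch-hole it either edits the inline data directly (when the inode is
// still inline and we hold the buffer cap) or, for regular file data, drops
// the cached range and issues a Filer zero() over [offset, offset+length),
// waiting for the OSD acks outside client_lock.  Plain allocation beyond EOF
// just extends in->size, subject to the quota and max-size checks.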
#ifdef FALLOC_FL_PUNCH_HOLE

int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  if (offset < 0 || length <= 0)
    return -EINVAL;

  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;
  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
	(have & CEPH_CAP_FILE_BUFFER)) {
      bufferlist bl;
      auto inline_iter = in->inline_data.cbegin();
      int len = in->inline_data.length();
      if (offset < len) {
	if (offset > 0)
	  inline_iter.copy(offset, bl);
	int size = length;
	if (offset + size > len)
	  size = len - offset;
	if (size > 0)
	  bl.append_zero(size);
	if (offset + size < len) {
	  inline_iter += size;
	  inline_iter.copy(len - offset - size, bl);
	}
	in->inline_data = bl;
	in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      if (in->inline_version < CEPH_INLINE_NONE) {
	onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
	uninline_data(in, onuninline.get());
      }

      C_SaferCond onfinish("Client::_punch_hole flock");

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
		  in->snaprealm->get_snap_context(),
		  offset, length,
		  ceph::real_clock::now(),
		  0, true, &onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      client_lock.unlock();
      onfinish.wait();
      client_lock.lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
	check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
	check_caps(in, 0);
      }
    }
  }

  if (nullptr != onuninline) {
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();

    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
13602 int Client::_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
13604 return -EOPNOTSUPP
;
int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  return _fallocate(fh, mode, offset, length);
}
int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << " " << fd << " " << mode << " " << offset << " " << length << std::endl;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  return _fallocate(fh, mode, offset, length);
}
int Client::ll_release(Fh *fh)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << __func__ << " (fh)" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (ll_unclosed_fh_set.count(fh))
    ll_unclosed_fh_set.erase(fh);
  return _release_fh(fh);
}
int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
  tout(cct) << "ll_getlk (fh)" << (unsigned long)fh << std::endl;

  return _getlk(fh, fl, owner);
}
int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;

  return _setlk(fh, fl, owner, sleep);
}

int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;

  return _flock(fh, cmd, owner);
}
int Client::set_deleg_timeout(uint32_t timeout)
{
  std::lock_guard lock(client_lock);

  /*
   * The whole point is to prevent blacklisting so we must time out the
   * delegation before the session autoclose timeout kicks in.
   */
  if (timeout >= mdsmap->get_session_autoclose())
    return -EINVAL;

  deleg_timeout = timeout;
  return 0;
}
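// Illustrative note (not from the original source): with the usual MDS
// default of session_autoclose = 300 seconds, set_deleg_timeout(300) or
// anything larger is rejected, while e.g. set_deleg_timeout(60) leaves a
// comfortable margin before the MDS would autoclose (and blacklist) the
// session while a delegation is still outstanding.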
int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
{
  int ret = -EINVAL;

  std::lock_guard lock(client_lock);

  Inode *inode = fh->inode.get();

  switch (cmd) {
  case CEPH_DELEGATION_NONE:
    inode->unset_deleg(fh);
    ret = 0;
    break;
  default:
    try {
      ret = inode->set_deleg(fh, cmd, cb, priv);
    } catch (std::bad_alloc&) {
      ret = -ENOMEM;
    }
    break;
  }
  return ret;
}
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();
  }
  void finish(int r) override {
    std::lock_guard l(client->client_lock);
    ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);
  }
};

void Client::ll_interrupt(void *d)
{
  MetaRequest *req = static_cast<MetaRequest*>(d);
  ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
  tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
  interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
}
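// Descriptive note: ll_interrupt() itself does not take client_lock; it only
// queues a C_Client_RequestInterrupt on interrupt_finisher.  The finisher
// thread later grabs client_lock in finish() before interrupting the pending
// SETFILELOCK request and dropping the request reference it took in the
// constructor.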
// =========================================
// expose file layouts

int Client::describe_layout(const char *relpath, file_layout_t *lp,
			    const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;

  *lp = in->layout;

  ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
  return 0;
}

int Client::fdescribe_layout(int fd, file_layout_t *lp)
{
  std::lock_guard lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  *lp = in->layout;

  ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
  return 0;
}
int64_t Client::get_default_pool_id()
{
  std::lock_guard lock(client_lock);

  /* first data pool is the default */
  return mdsmap->get_first_data_pool();
}

int64_t Client::get_pool_id(const char *pool_name)
{
  std::lock_guard lock(client_lock);

  return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
			       pool_name);
}

string Client::get_pool_name(int64_t pool)
{
  std::lock_guard lock(client_lock);

  return objecter->with_osdmap([pool](const OSDMap& o) {
      return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
    });
}

int Client::get_pool_replication(int64_t pool)
{
  std::lock_guard lock(client_lock);

  return objecter->with_osdmap([pool](const OSDMap& o) {
      return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
    });
}
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  std::lock_guard lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  uint64_t su = in->layout.stripe_unit;
  *len = su - (off % su);

  return 0;
}
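// Worked example (illustrative, assuming the default 4 MiB stripe unit):
// for off = 0x500000 with su = 0x400000, off % su = 0x100000, so
// *len = 0x400000 - 0x100000 = 0x300000; i.e. 3 MiB remain before the next
// stripe unit begins (and hence before a possibly different set of OSDs
// applies).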
int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
{
  std::lock_guard lock(client_lock);

  return objecter->with_osdmap([&](const OSDMap& o) {
      return o.crush->get_full_location_ordered(id, path);
    });
}
int Client::get_file_stripe_address(int fd, loff_t offset,
				    vector<entity_addr_t>& address)
{
  std::lock_guard lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
			   in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  // now we have the object and its 'layout'
  return objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      vector<int> osds;
      o.pg_to_acting_osds(pg, osds);
      if (osds.empty())
	return -EINVAL;
      for (unsigned i = 0; i < osds.size(); i++) {
	entity_addr_t addr = o.get_addrs(osds[i]).front();
	address.push_back(addr);
      }
      return 0;
    });
}
int Client::get_osd_addr(int osd, entity_addr_t& addr)
{
  std::lock_guard lock(client_lock);

  return objecter->with_osdmap([&](const OSDMap& o) {
      if (!o.exists(osd))
	return -ENOENT;

      addr = o.get_addrs(osd).front();
      return 0;
    });
}
int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
			     loff_t length, loff_t offset)
{
  std::lock_guard lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // map to a list of extents
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);

  ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
  return 0;
}
/* find an osd with the same ip.  -ENXIO if none. */
int Client::get_local_osd()
{
  std::lock_guard lock(client_lock);

  objecter->with_osdmap([this](const OSDMap& o) {
      if (o.get_epoch() != local_osd_epoch) {
	local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
	local_osd_epoch = o.get_epoch();
      }
    });
  return local_osd;
}
// ===============================

void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
}

bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}

void Client::ms_handle_remote_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  std::lock_guard l(client_lock);
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSession *s = NULL;
      for (auto &p : mds_sessions) {
	if (mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
	  mds = p.first;
	  s = &p.second;
	}
      }
      assert (s != NULL);
      switch (s->state) {
      case MetaSession::STATE_CLOSING:
	ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	_closed_mds_session(s);
	break;

      case MetaSession::STATE_OPENING:
	{
	  ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	  list<Context*> waiters;
	  waiters.swap(s->waiting_for_open);
	  _closed_mds_session(s);
	  MetaSession *news = _get_or_open_mds_session(mds);
	  news->waiting_for_open.swap(waiters);
	}
	break;

      case MetaSession::STATE_OPEN:
	{
	  objecter->maybe_request_map(); /* to check if we are blacklisted */
	  const auto& conf = cct->_conf;
	  if (conf->client_reconnect_stale) {
	    ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	    _closed_mds_session(s);
	  } else {
	    ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	    s->state = MetaSession::STATE_STALE;
	  }
	}
	break;

      case MetaSession::STATE_NEW:
      case MetaSession::STATE_CLOSED:
      default:
	break;
      }
    }
    break;
  }
}
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
  Inode *quota_in = root_ancestor;
  SnapRealm *realm = in->snaprealm;
  while (realm) {
    ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
    if (realm->ino != in->ino) {
      auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
      if (p == inode_map.end())
	break;

      if (p->second->quota.is_enable()) {
	quota_in = p->second;
	break;
      }
    }
    realm = realm->pparent;
  }
  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
  return quota_in;
}
/**
 * Traverse quota ancestors of the Inode, return true
 * if any of them passes the passed function
 */
bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
				   std::function<bool (const Inode &in)> test)
{
  while (true) {
    ceph_assert(in != NULL);
    if (test(*in)) {
      return true;
    }

    if (in == root_ancestor) {
      // We're done traversing, drop out
      return false;
    } else {
      // Continue up the tree
      in = get_quota_root(in, perms);
    }
  }
}
bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
{
  return check_quota_condition(in, perms,
      [](const Inode &in) {
	return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
      });
}

bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
				     const UserPerm& perms)
{
  return check_quota_condition(in, perms,
      [&new_bytes](const Inode &in) {
	return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
	       > in.quota.max_bytes;
      });
}
bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
{
  ceph_assert(in->size >= in->reported_size);
  const uint64_t size = in->size - in->reported_size;
  return check_quota_condition(in, perms,
      [&size](const Inode &in) {
	if (in.quota.max_bytes) {
	  if (in.rstat.rbytes >= in.quota.max_bytes) {
	    return true;
	  }

	  const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
	  return (space >> 4) < size;
	} else {
	  return false;
	}
      });
}
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have already been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    client_lock.unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.lock();

    bool errored = false;

    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
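// Design note on the probe above (descriptive of the code shown): read
// permission is inferred from a stat on the file's first object (success or
// -ENOENT both count as readable) and write permission from an exclusive
// create (success or -EEXIST both count as writeable); only -EPERM marks the
// capability as missing.  The result is cached per (pool id, namespace) in
// pool_perms, so the extra OSD round trips are paid once per pool/namespace,
// and the POOL_CHECKING placeholder keeps concurrent callers from racing the
// same probe.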
int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
{
  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];

      return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
    }
  }
  return -EAGAIN;
}
int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  if (S_ISLNK(*mode))
    return 0;

  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r > 0) {
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	encode(xattrs, xattrs_bl);
    } else {
      *mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
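// Summary of the create-time ACL flow above (descriptive, based on the code
// shown): if the parent directory carries a default ACL it is inherited --
// the new inode's access ACL is derived from it and folded back into *mode,
// directories additionally copy the default ACL forward, and the resulting
// xattrs are encoded into xattrs_bl for the MDS create request; without a
// default ACL the requested mode is simply masked by the application-supplied
// umask callback.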
void Client::set_filer_flags(int flags)
{
  std::lock_guard l(client_lock);
  ceph_assert(flags == 0 ||
	      flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->add_global_op_flags(flags);
}

void Client::clear_filer_flags(int flags)
{
  std::lock_guard l(client_lock);
  ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->clear_global_op_flag(flags);
}
// called before mount
void Client::set_uuid(const std::string& uuid)
{
  std::lock_guard l(client_lock);
  assert(initialized);
  assert(!uuid.empty());

  metadata["uuid"] = uuid;
}
// called before mount. 0 means infinite
void Client::set_session_timeout(unsigned timeout)
{
  std::lock_guard l(client_lock);
  assert(initialized);

  metadata["timeout"] = stringify(timeout);
}
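// Usage note (illustrative): a libcephfs consumer would typically call this
// after ceph_create()/initialization but before ceph_mount(); the value is
// only recorded in the session metadata sent to the MDS when the session is
// opened, so, as written, calling it later would not change an already-open
// session.  Passing 0 requests an unlimited timeout, per the comment above.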
// called before mount
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  std::lock_guard l(client_lock);

  auto it = metadata.find("uuid");
  if (it != metadata.end() && it->second == uuid)
    return -EINVAL;

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  // open a session to every up MDS and ask each one to reclaim the state
  // held by the old client instance identified by 'uuid'
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSession *session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state != MetaSession::STATE_OPENING)
	return -EINVAL;
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      if (rejected_by_mds.count(mds))
	return -EPERM;
      continue;
    }

    session = &mds_sessions.at(mds);
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = make_message<MClientReclaim>(uuid, flags);
      session->con->send_message2(std::move(m));
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      return reclaim_errno ? : -ENOTRECOVERABLE;
    } else {
      mds++;
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -ENOENT;
    return -ENOTRECOVERABLE;
  }

  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blacklist to check if target session was killed
  // (config option mds_session_blacklist_on_evict needs to be true)
  C_SaferCond cond;
  if (!objecter->wait_for_map(reclaim_osd_epoch, &cond)) {
    ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
    client_lock.unlock();
    cond.wait();
    client_lock.lock();
  }

  bool blacklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blacklisted(reclaim_target_addrs);
      });
  if (blacklisted)
    return -ENOTRECOVERABLE;

  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
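// Reclaim flow recap (descriptive, based on the code above): subscribe to the
// MDSMap for fs_name, make sure there is an open session to every up MDS, and
// send each one an MClientReclaim carrying the old instance's uuid.  If no
// MDS reports a target session the call fails; with CEPH_RECLAIM_RESET it
// stops there, otherwise it waits for the reclaim OSD epoch and bails out if
// the target addrs are still blacklisted.  finish_reclaim() below then
// promotes the "reclaiming_uuid" entry to this client's own "uuid" metadata.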
void Client::finish_reclaim()
{
  auto it = metadata.find("reclaiming_uuid");
  if (it == metadata.end()) {
    for (auto &p : mds_sessions)
      p.second.reclaim_state = MetaSession::RECLAIM_NULL;
    return;
  }

  for (auto &p : mds_sessions) {
    p.second.reclaim_state = MetaSession::RECLAIM_NULL;
    auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
    p.second.con->send_message2(std::move(m));
  }

  metadata["uuid"] = it->second;
  metadata.erase(it);
}
void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
{
  mds_rank_t from = mds_rank_t(reply->get_source().num());
  ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, reply->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
    return;
  }

  if (reply->get_result() >= 0) {
    session->reclaim_state = MetaSession::RECLAIM_OK;
    if (reply->get_epoch() > reclaim_osd_epoch)
      reclaim_osd_epoch = reply->get_epoch();
    if (!reply->get_addrs().empty())
      reclaim_target_addrs = reply->get_addrs();
  } else {
    session->reclaim_state = MetaSession::RECLAIM_FAIL;
    reclaim_errno = reply->get_result();
  }

  signal_cond_list(waiting_for_reclaim);
}
/**
 * This is included in cap release messages, to cause
 * the MDS to wait until this OSD map epoch.  It is necessary
 * in corner cases where we cancel RADOS ops, so that
 * nobody else tries to do IO to the same objects in
 * the same epoch as the cancelled ops.
 */
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  cap_epoch_barrier = e;
}
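// Illustrative scenario (not from the original source): if in-flight RADOS
// writes are cancelled while the OSDMap is at some epoch N, the client records
// the epoch at which the cancellation becomes visible as the barrier; the next
// cap release carries that barrier, so the MDS holds the caps back until it
// has seen that OSD map epoch, which keeps another client from issuing IO to
// the same objects within the stale epoch.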
const char** Client::get_tracked_conf_keys() const
{
  static const char* keys[] = {
    "client_cache_size",
    "client_cache_mid",
    "client_acl_type",
    "client_deleg_timeout",
    "client_deleg_break_on_open",
    NULL
  };
  return keys;
}
void Client::handle_conf_change(const ConfigProxy& conf,
				const std::set<std::string> &changed)
{
  std::lock_guard lock(client_lock);

  if (changed.count("client_cache_mid")) {
    lru.lru_set_midpoint(cct->_conf->client_cache_mid);
  }
  if (changed.count("client_acl_type")) {
    acl_type = NO_ACL;
    if (cct->_conf->client_acl_type == "posix_acl")
      acl_type = POSIX_ACL;
  }
}
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}

void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
mds_rank_t Client::_get_random_up_mds() const
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  std::set<mds_rank_t> up;
  mdsmap->get_up_mds_set(up);

  if (up.empty())
    return MDS_RANK_NONE;
  std::set<mds_rank_t>::const_iterator p = up.begin();
  for (int n = rand() % up.size(); n; n--)
    ++p;
  return *p;
}
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
    : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}

StandaloneClient::~StandaloneClient()
{
  delete objecter;
  objecter = nullptr;
}
int StandaloneClient::init()
{
  objectcacher->start();

  client_lock.lock();
  ceph_assert(!is_initialized());

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    client_lock.unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.unlock();
  _finish_init();

  return 0;
}
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}