1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
18 #include <sys/types.h>
23 #include <sys/param.h>
26 #include <sys/utsname.h>
29 #include <boost/lexical_cast.hpp>
30 #include <boost/fusion/include/std_pair.hpp>
32 #if defined(__FreeBSD__)
33 #define XATTR_CREATE 0x1
34 #define XATTR_REPLACE 0x2
36 #include <sys/xattr.h>
39 #if defined(__linux__)
40 #include <linux/falloc.h>
43 #include <sys/statvfs.h>
45 #include "common/config.h"
46 #include "common/version.h"
48 #include "mon/MonClient.h"
50 #include "messages/MClientCaps.h"
51 #include "messages/MClientLease.h"
52 #include "messages/MClientQuota.h"
53 #include "messages/MClientReclaim.h"
54 #include "messages/MClientReclaimReply.h"
55 #include "messages/MClientReconnect.h"
56 #include "messages/MClientReply.h"
57 #include "messages/MClientRequest.h"
58 #include "messages/MClientRequestForward.h"
59 #include "messages/MClientSession.h"
60 #include "messages/MClientSnap.h"
61 #include "messages/MCommandReply.h"
62 #include "messages/MFSMap.h"
63 #include "messages/MFSMapUser.h"
64 #include "messages/MMDSMap.h"
65 #include "messages/MOSDMap.h"
67 #include "mds/flock.h"
68 #include "mds/cephfs_features.h"
69 #include "osd/OSDMap.h"
70 #include "osdc/Filer.h"
72 #include "common/Cond.h"
73 #include "common/perf_counters.h"
74 #include "common/admin_socket.h"
75 #include "common/errno.h"
76 #include "include/str_list.h"
78 #define dout_subsys ceph_subsys_client
80 #include "include/lru.h"
81 #include "include/compat.h"
82 #include "include/stringify.h"
87 #include "Delegation.h"
89 #include "ClientSnapRealm.h"
91 #include "MetaSession.h"
92 #include "MetaRequest.h"
93 #include "ObjecterWriteback.h"
94 #include "posix_acl.h"
96 #include "include/ceph_assert.h"
97 #include "include/stat.h"
99 #include "include/cephfs/ceph_statx.h"
101 #if HAVE_GETGROUPLIST
108 #define dout_prefix *_dout << "client." << whoami << " "
110 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
112 // FreeBSD fails to define this
116 // Darwin fails to define this
125 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
127 void client_flush_set_callback(void *p
, ObjectCacher::ObjectSet
*oset
)
129 Client
*client
= static_cast<Client
*>(p
);
130 client
->flush_set_callback(oset
);
// Admin-socket command hook constructor; binds the owning Client.
// NOTE(review): the member-initializer list continues on a line lost in
// this extraction (presumably "m_client(client)") — confirm against the
// pristine file.
136 Client::CommandHook::CommandHook(Client
*client
) :
// Dispatch an admin-socket command ("mds_requests", "mds_sessions",
// "dump_cache", "kick_stale_sessions", "status") to the corresponding
// Client dump/kick method, serialized under client_lock.
// NOTE(review): lines ~144-147 (remaining parameters, e.g. the
// Formatter* and output buffer) and the function tail (close_section /
// return, ~lines 163-167) were lost in this extraction.
141 int Client::CommandHook::call(
142 std::string_view command
,
143 const cmdmap_t
& cmdmap
,
// All command output is wrapped in a single "result" object section.
148 f
->open_object_section("result");
// Every handler below reads Client state, so take the client lock once.
150 std::lock_guard l
{m_client
->client_lock
};
151 if (command
== "mds_requests")
152 m_client
->dump_mds_requests(f
);
153 else if (command
== "mds_sessions")
154 m_client
->dump_mds_sessions(f
);
155 else if (command
== "dump_cache")
156 m_client
->dump_cache(f
);
157 else if (command
== "kick_stale_sessions")
158 m_client
->_kick_stale_sessions();
159 else if (command
== "status")
160 m_client
->dump_status(f
);
// Reaching here means a command was registered without a handler above.
162 ceph_abort_msg("bad command registered");
// Open-directory handle: starts at offset 0 with next_offset 2 (offsets
// 0 and 1 are reserved for the implicit "." / ".." entries).
// NOTE(review): the initializer list ends with a trailing comma — its
// continuation (original line ~174, presumably the perms member) was
// lost in this extraction.
171 dir_result_t::dir_result_t(Inode
*in
, const UserPerm
& perms
)
172 : inode(in
), offset(0), next_offset(2),
173 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
// Reset the faked-inode allocator: mark the whole faked-ino range free
// again and rewind both allocation cursors.
// NOTE(review): the declaration of `start` (original line ~179,
// presumably `ino_t start = 1024;`) was lost in this extraction —
// confirm against the pristine file.
177 void Client::_reset_faked_inos()
180 free_faked_inos
.clear();
// Free interval covers [start, (uint32_t)-1].
181 free_faked_inos
.insert(start
, (uint32_t)-1 - start
+ 1);
182 last_used_faked_ino
= 0;
183 last_used_faked_root
= 0;
// Faked inos are forced on when ino_t is narrower than 64 bits, or
// opted into via the client_use_faked_inos config option.
184 _use_faked_inos
= sizeof(ino_t
) < 8 || cct
->_conf
->client_use_faked_inos
;
187 void Client::_assign_faked_ino(Inode
*in
)
189 if (0 == last_used_faked_ino
)
190 last_used_faked_ino
= last_used_faked_ino
+ 2048; // start(1024)~2048 reserved for _assign_faked_root
191 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
192 if (it
== free_faked_inos
.end() && last_used_faked_ino
> 0) {
193 last_used_faked_ino
= 2048;
194 it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
196 ceph_assert(it
!= free_faked_inos
.end());
197 if (last_used_faked_ino
< it
.get_start()) {
198 ceph_assert(it
.get_len() > 0);
199 last_used_faked_ino
= it
.get_start();
201 ++last_used_faked_ino
;
202 ceph_assert(it
.get_start() + it
.get_len() > last_used_faked_ino
);
204 in
->faked_ino
= last_used_faked_ino
;
205 free_faked_inos
.erase(in
->faked_ino
);
206 faked_ino_map
[in
->faked_ino
] = in
->vino();
210 * In the faked mode, if you export multiple subdirectories,
211 * you will see that the inode numbers of the exported subdirectories
212 * are the same, so we distinguish the mount point by reserving
213 * the "fake ids" between "1024~2048" and combining the last
214 * 10 bits (0x3ff) of the "root inodes".
216 void Client::_assign_faked_root(Inode
*in
)
218 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
219 if (it
== free_faked_inos
.end() && last_used_faked_root
> 0) {
220 last_used_faked_root
= 0;
221 it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
223 assert(it
!= free_faked_inos
.end());
224 vinodeno_t inode_info
= in
->vino();
225 uint64_t inode_num
= (uint64_t)inode_info
.ino
;
226 ldout(cct
, 10) << "inode_num " << inode_num
<< "inode_num & 0x3ff=" << (inode_num
& 0x3ff)<< dendl
;
227 last_used_faked_root
= it
.get_start() + (inode_num
& 0x3ff); // 0x3ff mask and get_start will not exceed 2048
228 assert(it
.get_start() + it
.get_len() > last_used_faked_root
);
230 in
->faked_ino
= last_used_faked_root
;
231 free_faked_inos
.erase(in
->faked_ino
);
232 faked_ino_map
[in
->faked_ino
] = in
->vino();
235 void Client::_release_faked_ino(Inode
*in
)
237 free_faked_inos
.insert(in
->faked_ino
);
238 faked_ino_map
.erase(in
->faked_ino
);
// Translate a (possibly faked) inode number to its vinodeno_t via
// faked_ino_map; unknown numbers map to {0, CEPH_NOSNAP}.
// Caller must hold client_lock.
// NOTE(review): the leading branch (original lines ~242-245, presumably
// the vino declaration plus the non-faked/root fast path), the bare
// `else` (~248) and the trailing `return vino;` (~251) were lost in
// this extraction — confirm against the pristine file.
241 vinodeno_t
Client::_map_faked_ino(ino_t ino
)
246 else if (faked_ino_map
.count(ino
))
247 vino
= faked_ino_map
[ino
];
// Fallback for inos we have no mapping for.
249 vino
= vinodeno_t(0, CEPH_NOSNAP
);
250 ldout(cct
, 10) << __func__
<< " " << ino
<< " -> " << vino
<< dendl
;
254 vinodeno_t
Client::map_faked_ino(ino_t ino
)
256 std::lock_guard
lock(client_lock
);
257 return _map_faked_ino(ino
);
262 Client::Client(Messenger
*m
, MonClient
*mc
, Objecter
*objecter_
)
263 : Dispatcher(m
->cct
),
264 timer(m
->cct
, client_lock
),
268 whoami(mc
->get_global_id()),
269 async_ino_invalidator(m
->cct
),
270 async_dentry_invalidator(m
->cct
),
271 interrupt_finisher(m
->cct
),
272 remount_finisher(m
->cct
),
273 objecter_finisher(m
->cct
),
274 m_command_hook(this),
279 user_id
= cct
->_conf
->client_mount_uid
;
280 group_id
= cct
->_conf
->client_mount_gid
;
281 fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
282 "fuse_default_permissions");
284 if (cct
->_conf
->client_acl_type
== "posix_acl")
285 acl_type
= POSIX_ACL
;
287 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
290 free_fd_set
.insert(10, 1<<30);
292 mdsmap
.reset(new MDSMap
);
295 writeback_handler
.reset(new ObjecterWriteback(objecter
, &objecter_finisher
,
297 objectcacher
.reset(new ObjectCacher(cct
, "libcephfs", *writeback_handler
, client_lock
,
298 client_flush_set_callback
, // all commit callback
300 cct
->_conf
->client_oc_size
,
301 cct
->_conf
->client_oc_max_objects
,
302 cct
->_conf
->client_oc_max_dirty
,
303 cct
->_conf
->client_oc_target_dirty
,
304 cct
->_conf
->client_oc_max_dirty_age
,
306 objecter_finisher
.start();
307 filer
.reset(new Filer(objecter
, &objecter_finisher
));
308 objecter
->enable_blacklist_events();
314 ceph_assert(ceph_mutex_is_not_locked(client_lock
));
316 // It is necessary to hold client_lock, because any inode destruction
317 // may call into ObjectCacher, which asserts that it's lock (which is
318 // client_lock) is held.
319 std::lock_guard l
{client_lock
};
323 void Client::tear_down_cache()
326 for (ceph::unordered_map
<int, Fh
*>::iterator it
= fd_map
.begin();
330 ldout(cct
, 1) << __func__
<< " forcing close of fh " << it
->first
<< " ino " << fh
->inode
->ino
<< dendl
;
335 while (!opened_dirs
.empty()) {
336 dir_result_t
*dirp
= *opened_dirs
.begin();
337 ldout(cct
, 1) << __func__
<< " forcing close of dir " << dirp
<< " ino " << dirp
->inode
->ino
<< dendl
;
346 ceph_assert(lru
.lru_get_size() == 0);
349 ceph_assert(inode_map
.size() <= 1 + root_parents
.size());
350 if (root
&& inode_map
.size() == 1 + root_parents
.size()) {
354 while (!root_parents
.empty())
355 root_parents
.erase(root_parents
.begin());
360 ceph_assert(inode_map
.empty());
// Return the root inode number, using the faked ino when faked-ino mode
// is active.  Takes client_lock itself.
// NOTE(review): the non-faked branch (original lines ~368-369,
// presumably `else return root->ino;`) was lost in this extraction.
363 inodeno_t
Client::get_root_ino()
365 std::lock_guard
l(client_lock
);
366 if (use_faked_inos())
367 return root
->faked_ino
;
// Return the root Inode pointer under client_lock.
// NOTE(review): the body tail (original lines ~375-376, presumably a
// ref-count bump and `return root;`) was lost in this extraction.
372 Inode
*Client::get_root()
374 std::lock_guard
l(client_lock
);
382 void Client::dump_inode(Formatter
*f
, Inode
*in
, set
<Inode
*>& did
, bool disconnected
)
385 in
->make_long_path(path
);
386 ldout(cct
, 1) << "dump_inode: "
387 << (disconnected
? "DISCONNECTED ":"")
388 << "inode " << in
->ino
390 << " ref " << in
->get_num_ref()
394 f
->open_object_section("inode");
395 f
->dump_stream("path") << path
;
397 f
->dump_int("disconnected", 1);
404 ldout(cct
, 1) << " dir " << in
->dir
<< " size " << in
->dir
->dentries
.size() << dendl
;
405 for (ceph::unordered_map
<string
, Dentry
*>::iterator it
= in
->dir
->dentries
.begin();
406 it
!= in
->dir
->dentries
.end();
408 ldout(cct
, 1) << " " << in
->ino
<< " dn " << it
->first
<< " " << it
->second
<< " ref " << it
->second
->ref
<< dendl
;
410 f
->open_object_section("dentry");
414 if (it
->second
->inode
)
415 dump_inode(f
, it
->second
->inode
.get(), did
, false);
420 void Client::dump_cache(Formatter
*f
)
424 ldout(cct
, 1) << __func__
<< dendl
;
427 f
->open_array_section("cache");
430 dump_inode(f
, root
, did
, true);
432 // make a second pass to catch anything disconnected
433 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
434 it
!= inode_map
.end();
436 if (did
.count(it
->second
))
438 dump_inode(f
, it
->second
, did
, true);
445 void Client::dump_status(Formatter
*f
)
447 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
449 ldout(cct
, 1) << __func__
<< dendl
;
451 const epoch_t osd_epoch
452 = objecter
->with_osdmap(std::mem_fn(&OSDMap::get_epoch
));
455 f
->open_object_section("metadata");
456 for (const auto& kv
: metadata
)
457 f
->dump_string(kv
.first
.c_str(), kv
.second
);
460 f
->dump_int("dentry_count", lru
.lru_get_size());
461 f
->dump_int("dentry_pinned_count", lru
.lru_get_num_pinned());
462 f
->dump_int("id", get_nodeid().v
);
463 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr_legacy());
464 f
->dump_object("inst", inst
);
465 f
->dump_object("addr", inst
.addr
);
466 f
->dump_stream("inst_str") << inst
.name
<< " " << inst
.addr
.get_legacy_str();
467 f
->dump_string("addr_str", inst
.addr
.get_legacy_str());
468 f
->dump_int("inode_count", inode_map
.size());
469 f
->dump_int("mds_epoch", mdsmap
->get_epoch());
470 f
->dump_int("osd_epoch", osd_epoch
);
471 f
->dump_int("osd_epoch_barrier", cap_epoch_barrier
);
472 f
->dump_bool("blacklisted", blacklisted
);
479 objectcacher
->start();
481 std::lock_guard l
{client_lock
};
482 ceph_assert(!initialized
);
483 messenger
->add_dispatcher_tail(this);
489 void Client::_finish_init()
492 std::lock_guard l
{client_lock
};
494 PerfCountersBuilder
plb(cct
, "client", l_c_first
, l_c_last
);
495 plb
.add_time_avg(l_c_reply
, "reply", "Latency of receiving a reply on metadata request");
496 plb
.add_time_avg(l_c_lat
, "lat", "Latency of processing a metadata request");
497 plb
.add_time_avg(l_c_wrlat
, "wrlat", "Latency of a file data write operation");
498 plb
.add_time_avg(l_c_read
, "rdlat", "Latency of a file data read operation");
499 plb
.add_time_avg(l_c_fsync
, "fsync", "Latency of a file sync operation");
500 logger
.reset(plb
.create_perf_counters());
501 cct
->get_perfcounters_collection()->add(logger
.get());
504 cct
->_conf
.add_observer(this);
506 AdminSocket
* admin_socket
= cct
->get_admin_socket();
507 int ret
= admin_socket
->register_command("mds_requests",
509 "show in-progress mds requests");
511 lderr(cct
) << "error registering admin socket command: "
512 << cpp_strerror(-ret
) << dendl
;
514 ret
= admin_socket
->register_command("mds_sessions",
516 "show mds session state");
518 lderr(cct
) << "error registering admin socket command: "
519 << cpp_strerror(-ret
) << dendl
;
521 ret
= admin_socket
->register_command("dump_cache",
523 "show in-memory metadata cache contents");
525 lderr(cct
) << "error registering admin socket command: "
526 << cpp_strerror(-ret
) << dendl
;
528 ret
= admin_socket
->register_command("kick_stale_sessions",
530 "kick sessions that were remote reset");
532 lderr(cct
) << "error registering admin socket command: "
533 << cpp_strerror(-ret
) << dendl
;
535 ret
= admin_socket
->register_command("status",
537 "show overall client status");
539 lderr(cct
) << "error registering admin socket command: "
540 << cpp_strerror(-ret
) << dendl
;
543 std::lock_guard l
{client_lock
};
547 void Client::shutdown()
549 ldout(cct
, 1) << __func__
<< dendl
;
551 // If we were not mounted, but were being used for sending
552 // MDS commands, we may have sessions that need closing.
554 std::lock_guard l
{client_lock
};
557 cct
->_conf
.remove_observer(this);
559 cct
->get_admin_socket()->unregister_commands(&m_command_hook
);
561 if (ino_invalidate_cb
) {
562 ldout(cct
, 10) << "shutdown stopping cache invalidator finisher" << dendl
;
563 async_ino_invalidator
.wait_for_empty();
564 async_ino_invalidator
.stop();
567 if (dentry_invalidate_cb
) {
568 ldout(cct
, 10) << "shutdown stopping dentry invalidator finisher" << dendl
;
569 async_dentry_invalidator
.wait_for_empty();
570 async_dentry_invalidator
.stop();
573 if (switch_interrupt_cb
) {
574 ldout(cct
, 10) << "shutdown stopping interrupt finisher" << dendl
;
575 interrupt_finisher
.wait_for_empty();
576 interrupt_finisher
.stop();
580 ldout(cct
, 10) << "shutdown stopping remount finisher" << dendl
;
581 remount_finisher
.wait_for_empty();
582 remount_finisher
.stop();
585 objectcacher
->stop(); // outside of client_lock! this does a join.
587 std::lock_guard l
{client_lock
};
588 ceph_assert(initialized
);
592 objecter_finisher
.wait_for_empty();
593 objecter_finisher
.stop();
596 cct
->get_perfcounters_collection()->remove(logger
.get());
602 // ===================
603 // metadata cache stuff
605 void Client::trim_cache(bool trim_kernel_dcache
)
607 uint64_t max
= cct
->_conf
->client_cache_size
;
608 ldout(cct
, 20) << "trim_cache size " << lru
.lru_get_size() << " max " << max
<< dendl
;
610 while (lru
.lru_get_size() != last
) {
611 last
= lru
.lru_get_size();
613 if (!unmounting
&& lru
.lru_get_size() <= max
) break;
616 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_get_next_expire());
623 if (trim_kernel_dcache
&& lru
.lru_get_size() > max
)
624 _invalidate_kernel_dcache();
627 if (lru
.lru_get_size() == 0 && root
&& root
->get_num_ref() == 0 && inode_map
.size() == 1 + root_parents
.size()) {
628 ldout(cct
, 15) << "trim_cache trimmed root " << root
<< dendl
;
632 while (!root_parents
.empty())
633 root_parents
.erase(root_parents
.begin());
639 void Client::trim_cache_for_reconnect(MetaSession
*s
)
641 mds_rank_t mds
= s
->mds_num
;
642 ldout(cct
, 20) << __func__
<< " mds." << mds
<< dendl
;
645 list
<Dentry
*> skipped
;
646 while (lru
.lru_get_size() > 0) {
647 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_expire());
651 if ((dn
->inode
&& dn
->inode
->caps
.count(mds
)) ||
652 dn
->dir
->parent_inode
->caps
.count(mds
)) {
656 skipped
.push_back(dn
);
659 for(list
<Dentry
*>::iterator p
= skipped
.begin(); p
!= skipped
.end(); ++p
)
660 lru
.lru_insert_mid(*p
);
662 ldout(cct
, 20) << __func__
<< " mds." << mds
663 << " trimmed " << trimmed
<< " dentries" << dendl
;
665 if (s
->caps
.size() > 0)
666 _invalidate_kernel_dcache();
// Drop one dentry from the cache: invalidate the parent directory's
// completeness (its contents are no longer fully known) and unlink the
// dentry, releasing both the dir and dentry references.
// NOTE(review): original lines ~670, ~672, ~674-675, ~679 and ~681
// (brace, part of the log statement, closers) were lost in this
// extraction.
669 void Client::trim_dentry(Dentry
*dn
)
671 ldout(cct
, 15) << "trim_dentry unlinking dn " << dn
->name
673 << std::hex
<< dn
->dir
->parent_inode
->ino
<< std::dec
676 Inode
*diri
= dn
->dir
->parent_inode
;
// Bump the release counter so cached readdir results are invalidated.
677 diri
->dir_release_count
++;
678 clear_dir_complete_and_ordered(diri
, true);
680 unlink(dn
, false, false); // drop dir, drop dentry
684 void Client::update_inode_file_size(Inode
*in
, int issued
, uint64_t size
,
685 uint64_t truncate_seq
, uint64_t truncate_size
)
687 uint64_t prior_size
= in
->size
;
689 if (truncate_seq
> in
->truncate_seq
||
690 (truncate_seq
== in
->truncate_seq
&& size
> in
->size
)) {
691 ldout(cct
, 10) << "size " << in
->size
<< " -> " << size
<< dendl
;
693 in
->reported_size
= size
;
694 if (truncate_seq
!= in
->truncate_seq
) {
695 ldout(cct
, 10) << "truncate_seq " << in
->truncate_seq
<< " -> "
696 << truncate_seq
<< dendl
;
697 in
->truncate_seq
= truncate_seq
;
698 in
->oset
.truncate_seq
= truncate_seq
;
700 // truncate cached file data
701 if (prior_size
> size
) {
702 _invalidate_inode_cache(in
, truncate_size
, prior_size
- truncate_size
);
706 // truncate inline data
707 if (in
->inline_version
< CEPH_INLINE_NONE
) {
708 uint32_t len
= in
->inline_data
.length();
710 in
->inline_data
.splice(size
, len
- size
);
713 if (truncate_seq
>= in
->truncate_seq
&&
714 in
->truncate_size
!= truncate_size
) {
716 ldout(cct
, 10) << "truncate_size " << in
->truncate_size
<< " -> "
717 << truncate_size
<< dendl
;
718 in
->truncate_size
= truncate_size
;
719 in
->oset
.truncate_size
= truncate_size
;
721 ldout(cct
, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl
;
726 void Client::update_inode_file_time(Inode
*in
, int issued
, uint64_t time_warp_seq
,
727 utime_t ctime
, utime_t mtime
, utime_t atime
)
729 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << ccap_string(issued
)
730 << " ctime " << ctime
<< " mtime " << mtime
<< dendl
;
732 if (time_warp_seq
> in
->time_warp_seq
)
733 ldout(cct
, 10) << " mds time_warp_seq " << time_warp_seq
734 << " is higher than local time_warp_seq "
735 << in
->time_warp_seq
<< dendl
;
738 // be careful with size, mtime, atime
739 if (issued
& (CEPH_CAP_FILE_EXCL
|
741 CEPH_CAP_FILE_BUFFER
|
743 CEPH_CAP_XATTR_EXCL
)) {
744 ldout(cct
, 30) << "Yay have enough caps to look at our times" << dendl
;
745 if (ctime
> in
->ctime
)
747 if (time_warp_seq
> in
->time_warp_seq
) {
748 //the mds updated times, so take those!
751 in
->time_warp_seq
= time_warp_seq
;
752 } else if (time_warp_seq
== in
->time_warp_seq
) {
754 if (mtime
> in
->mtime
)
756 if (atime
> in
->atime
)
758 } else if (issued
& CEPH_CAP_FILE_EXCL
) {
759 //ignore mds values as we have a higher seq
762 ldout(cct
, 30) << "Don't have enough caps, just taking mds' time values" << dendl
;
763 if (time_warp_seq
>= in
->time_warp_seq
) {
767 in
->time_warp_seq
= time_warp_seq
;
771 ldout(cct
, 0) << "WARNING: " << *in
<< " mds time_warp_seq "
772 << time_warp_seq
<< " is lower than local time_warp_seq "
// Purge fragmap entries whose frag is no longer a leaf of the inode's
// dirfragtree (stale after a fragtree change).
// NOTE(review): the loop has no advance for the keep case — the
// `else ++p;` branch (original lines ~783-784) was lost in this
// extraction; without it the loop cannot terminate on leaf entries.
// Confirm against the pristine file.
778 void Client::_fragmap_remove_non_leaves(Inode
*in
)
780 for (map
<frag_t
,int>::iterator p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
781 if (!in
->dirfragtree
.is_leaf(p
->first
))
// erase(p++): advance before erasing so the iterator stays valid.
782 in
->fragmap
.erase(p
++);
// Purge fragmap entries that still point at an MDS rank that has
// stopped, so future lookups re-resolve the authoritative rank.
// NOTE(review): as with _fragmap_remove_non_leaves, the keep-branch
// advance (`else ++p;`, original lines ~792-793) was lost in this
// extraction — confirm against the pristine file.
787 void Client::_fragmap_remove_stopped_mds(Inode
*in
, mds_rank_t mds
)
789 for (auto p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
790 if (p
->second
== mds
)
// erase(p++): advance before erasing so the iterator stays valid.
791 in
->fragmap
.erase(p
++);
796 Inode
* Client::add_update_inode(InodeStat
*st
, utime_t from
,
797 MetaSession
*session
,
798 const UserPerm
& request_perms
)
801 bool was_new
= false;
802 if (inode_map
.count(st
->vino
)) {
803 in
= inode_map
[st
->vino
];
804 ldout(cct
, 12) << __func__
<< " had " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
806 in
= new Inode(this, st
->vino
, &st
->layout
);
807 inode_map
[st
->vino
] = in
;
809 if (use_faked_inos())
810 _assign_faked_ino(in
);
814 if (use_faked_inos())
815 _assign_faked_root(root
);
818 } else if (!mounted
) {
819 root_parents
[root_ancestor
] = in
;
824 in
->ino
= st
->vino
.ino
;
825 in
->snapid
= st
->vino
.snapid
;
826 in
->mode
= st
->mode
& S_IFMT
;
831 if (in
->is_symlink())
832 in
->symlink
= st
->symlink
;
834 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
835 bool new_version
= false;
836 if (in
->version
== 0 ||
837 ((st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
) &&
838 (in
->version
& ~1) < st
->version
))
842 in
->caps_issued(&issued
);
843 issued
|= in
->caps_dirty();
844 int new_issued
= ~issued
& (int)st
->cap
.caps
;
846 if ((new_version
|| (new_issued
& CEPH_CAP_AUTH_SHARED
)) &&
847 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
851 in
->btime
= st
->btime
;
852 in
->snap_btime
= st
->snap_btime
;
855 if ((new_version
|| (new_issued
& CEPH_CAP_LINK_SHARED
)) &&
856 !(issued
& CEPH_CAP_LINK_EXCL
)) {
857 in
->nlink
= st
->nlink
;
860 if (new_version
|| (new_issued
& CEPH_CAP_ANY_RD
)) {
861 update_inode_file_time(in
, issued
, st
->time_warp_seq
,
862 st
->ctime
, st
->mtime
, st
->atime
);
866 (new_issued
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
))) {
867 in
->layout
= st
->layout
;
868 update_inode_file_size(in
, issued
, st
->size
, st
->truncate_seq
, st
->truncate_size
);
872 if (new_version
|| (new_issued
& CEPH_CAP_FILE_SHARED
)) {
873 in
->dirstat
= st
->dirstat
;
875 // dir_layout/rstat/quota are not tracked by capability, update them only if
876 // the inode stat is from auth mds
877 if (new_version
|| (st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
)) {
878 in
->dir_layout
= st
->dir_layout
;
879 ldout(cct
, 20) << " dir hash is " << (int)in
->dir_layout
.dl_dir_hash
<< dendl
;
880 in
->rstat
= st
->rstat
;
881 in
->quota
= st
->quota
;
882 in
->dir_pin
= st
->dir_pin
;
884 // move me if/when version reflects fragtree changes.
885 if (in
->dirfragtree
!= st
->dirfragtree
) {
886 in
->dirfragtree
= st
->dirfragtree
;
887 _fragmap_remove_non_leaves(in
);
891 if ((in
->xattr_version
== 0 || !(issued
& CEPH_CAP_XATTR_EXCL
)) &&
892 st
->xattrbl
.length() &&
893 st
->xattr_version
> in
->xattr_version
) {
894 auto p
= st
->xattrbl
.cbegin();
895 decode(in
->xattrs
, p
);
896 in
->xattr_version
= st
->xattr_version
;
899 if (st
->inline_version
> in
->inline_version
) {
900 in
->inline_data
= st
->inline_data
;
901 in
->inline_version
= st
->inline_version
;
904 /* always take a newer change attr */
905 if (st
->change_attr
> in
->change_attr
)
906 in
->change_attr
= st
->change_attr
;
908 if (st
->version
> in
->version
)
909 in
->version
= st
->version
;
912 ldout(cct
, 12) << __func__
<< " adding " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
915 return in
; // as with readdir returning indoes in different snaprealms (no caps!)
917 if (in
->snapid
== CEPH_NOSNAP
) {
918 add_update_cap(in
, session
, st
->cap
.cap_id
, st
->cap
.caps
, st
->cap
.wanted
,
919 st
->cap
.seq
, st
->cap
.mseq
, inodeno_t(st
->cap
.realm
),
920 st
->cap
.flags
, request_perms
);
921 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
922 in
->max_size
= st
->max_size
;
923 in
->rstat
= st
->rstat
;
926 // setting I_COMPLETE needs to happen after adding the cap
928 (st
->cap
.caps
& CEPH_CAP_FILE_SHARED
) &&
929 (issued
& CEPH_CAP_FILE_EXCL
) == 0 &&
930 in
->dirstat
.nfiles
== 0 &&
931 in
->dirstat
.nsubdirs
== 0) {
932 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in
<< dendl
;
933 in
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
935 ldout(cct
, 10) << " dir is open on empty dir " << in
->ino
<< " with "
936 << in
->dir
->dentries
.size() << " entries, marking all dentries null" << dendl
;
937 in
->dir
->readdir_cache
.clear();
938 for (const auto& p
: in
->dir
->dentries
) {
939 unlink(p
.second
, true, true); // keep dir, keep dentry
941 if (in
->dir
->dentries
.empty())
946 in
->snap_caps
|= st
->cap
.caps
;
954 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
956 Dentry
*Client::insert_dentry_inode(Dir
*dir
, const string
& dname
, LeaseStat
*dlease
,
957 Inode
*in
, utime_t from
, MetaSession
*session
,
961 if (dir
->dentries
.count(dname
))
962 dn
= dir
->dentries
[dname
];
964 ldout(cct
, 12) << __func__
<< " '" << dname
<< "' vino " << in
->vino()
965 << " in dir " << dir
->parent_inode
->vino() << " dn " << dn
968 if (dn
&& dn
->inode
) {
969 if (dn
->inode
->vino() == in
->vino()) {
971 ldout(cct
, 12) << " had dentry " << dname
972 << " with correct vino " << dn
->inode
->vino()
975 ldout(cct
, 12) << " had dentry " << dname
976 << " with WRONG vino " << dn
->inode
->vino()
978 unlink(dn
, true, true); // keep dir, keep dentry
982 if (!dn
|| !dn
->inode
) {
983 InodeRef
tmp_ref(in
);
985 if (old_dentry
->dir
!= dir
) {
986 Inode
*old_diri
= old_dentry
->dir
->parent_inode
;
987 old_diri
->dir_ordered_count
++;
988 clear_dir_complete_and_ordered(old_diri
, false);
990 unlink(old_dentry
, dir
== old_dentry
->dir
, false); // drop dentry, keep dir open if its the same dir
992 Inode
*diri
= dir
->parent_inode
;
993 diri
->dir_ordered_count
++;
994 clear_dir_complete_and_ordered(diri
, false);
995 dn
= link(dir
, dname
, in
, dn
);
998 update_dentry_lease(dn
, dlease
, from
, session
);
1002 void Client::update_dentry_lease(Dentry
*dn
, LeaseStat
*dlease
, utime_t from
, MetaSession
*session
)
1004 utime_t dttl
= from
;
1005 dttl
+= (float)dlease
->duration_ms
/ 1000.0;
1009 if (dlease
->mask
& CEPH_LEASE_VALID
) {
1010 if (dttl
> dn
->lease_ttl
) {
1011 ldout(cct
, 10) << "got dentry lease on " << dn
->name
1012 << " dur " << dlease
->duration_ms
<< "ms ttl " << dttl
<< dendl
;
1013 dn
->lease_ttl
= dttl
;
1014 dn
->lease_mds
= session
->mds_num
;
1015 dn
->lease_seq
= dlease
->seq
;
1016 dn
->lease_gen
= session
->cap_gen
;
1019 dn
->cap_shared_gen
= dn
->dir
->parent_inode
->shared_gen
;
1024 * update MDS location cache for a single inode
1026 void Client::update_dir_dist(Inode
*in
, DirStat
*dst
)
1029 ldout(cct
, 20) << "got dirfrag map for " << in
->ino
<< " frag " << dst
->frag
<< " to mds " << dst
->auth
<< dendl
;
1030 if (dst
->auth
>= 0) {
1031 in
->fragmap
[dst
->frag
] = dst
->auth
;
1033 in
->fragmap
.erase(dst
->frag
);
1035 if (!in
->dirfragtree
.is_leaf(dst
->frag
)) {
1036 in
->dirfragtree
.force_to_leaf(cct
, dst
->frag
);
1037 _fragmap_remove_non_leaves(in
);
1041 in
->dir_replicated
= !dst
->dist
.empty(); // FIXME that's just one frag!
// Invalidate cached completeness state on directory inode |diri|:
// when |complete| both I_COMPLETE and I_DIR_ORDERED are cleared,
// otherwise only I_DIR_ORDERED, and any cached readdir results are
// dropped.
// NOTE(review): several structural lines were lost in this extraction —
// original ~1045 ({), ~1047 (presumably `if (complete) {`), ~1050 and
// ~1054-1056 (closers and, presumably, a `if (diri->dir)` guard before
// touching readdir_cache).  Confirm against the pristine file.
1044 void Client::clear_dir_complete_and_ordered(Inode
*diri
, bool complete
)
1046 if (diri
->flags
& I_COMPLETE
) {
1048 ldout(cct
, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
1049 diri
->flags
&= ~(I_COMPLETE
| I_DIR_ORDERED
);
1051 if (diri
->flags
& I_DIR_ORDERED
) {
1052 ldout(cct
, 10) << " clearing I_DIR_ORDERED on " << *diri
<< dendl
;
1053 diri
->flags
&= ~I_DIR_ORDERED
;
// Cached readdir results are no longer trustworthy either.
1057 diri
->dir
->readdir_cache
.clear();
1062 * insert results from readdir or lssnap into the metadata cache.
1064 void Client::insert_readdir_results(MetaRequest
*request
, MetaSession
*session
, Inode
*diri
) {
1066 auto& reply
= request
->reply
;
1067 ConnectionRef con
= request
->reply
->get_connection();
1069 if(session
->mds_features
.test(CEPHFS_FEATURE_REPLY_ENCODING
)) {
1070 features
= (uint64_t)-1;
1073 features
= con
->get_features();
1076 dir_result_t
*dirp
= request
->dirp
;
1079 // the extra buffer list is only set for readdir and lssnap replies
1080 auto p
= reply
->get_extra_bl().cbegin();
1083 if (request
->head
.op
== CEPH_MDS_OP_LSSNAP
) {
1085 diri
= open_snapdir(diri
);
1088 // only open dir if we're actually adding stuff to it!
1089 Dir
*dir
= diri
->open_dir();
1093 DirStat
dst(p
, features
);
1099 bool end
= ((unsigned)flags
& CEPH_READDIR_FRAG_END
);
1100 bool hash_order
= ((unsigned)flags
& CEPH_READDIR_HASH_ORDER
);
1102 frag_t fg
= (unsigned)request
->head
.args
.readdir
.frag
;
1103 unsigned readdir_offset
= dirp
->next_offset
;
1104 string readdir_start
= dirp
->last_name
;
1105 ceph_assert(!readdir_start
.empty() || readdir_offset
== 2);
1107 unsigned last_hash
= 0;
1109 if (!readdir_start
.empty()) {
1110 last_hash
= ceph_frag_value(diri
->hash_dentry_name(readdir_start
));
1111 } else if (flags
& CEPH_READDIR_OFFSET_HASH
) {
1112 /* mds understands offset_hash */
1113 last_hash
= (unsigned)request
->head
.args
.readdir
.offset_hash
;
1117 if (fg
!= dst
.frag
) {
1118 ldout(cct
, 10) << "insert_trace got new frag " << fg
<< " -> " << dst
.frag
<< dendl
;
1122 readdir_start
.clear();
1123 dirp
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
, false);
1127 ldout(cct
, 10) << __func__
<< " " << numdn
<< " readdir items, end=" << end
1128 << ", hash_order=" << hash_order
1129 << ", readdir_start " << readdir_start
1130 << ", last_hash " << last_hash
1131 << ", next_offset " << readdir_offset
<< dendl
;
1133 if (diri
->snapid
!= CEPH_SNAPDIR
&&
1134 fg
.is_leftmost() && readdir_offset
== 2 &&
1135 !(hash_order
&& last_hash
)) {
1136 dirp
->release_count
= diri
->dir_release_count
;
1137 dirp
->ordered_count
= diri
->dir_ordered_count
;
1138 dirp
->start_shared_gen
= diri
->shared_gen
;
1139 dirp
->cache_index
= 0;
1142 dirp
->buffer_frag
= fg
;
1144 _readdir_drop_dirp_buffer(dirp
);
1145 dirp
->buffer
.reserve(numdn
);
1149 for (unsigned i
=0; i
<numdn
; i
++) {
1151 dlease
.decode(p
, features
);
1152 InodeStat
ist(p
, features
);
1154 ldout(cct
, 15) << "" << i
<< ": '" << dname
<< "'" << dendl
;
1156 Inode
*in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1159 if (diri
->dir
->dentries
.count(dname
)) {
1160 Dentry
*olddn
= diri
->dir
->dentries
[dname
];
1161 if (olddn
->inode
!= in
) {
1162 // replace incorrect dentry
1163 unlink(olddn
, true, true); // keep dir, dentry
1164 dn
= link(dir
, dname
, in
, olddn
);
1165 ceph_assert(dn
== olddn
);
1173 dn
= link(dir
, dname
, in
, NULL
);
1176 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
1178 unsigned hash
= ceph_frag_value(diri
->hash_dentry_name(dname
));
1179 if (hash
!= last_hash
)
1182 dn
->offset
= dir_result_t::make_fpos(hash
, readdir_offset
++, true);
1184 dn
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
++, false);
1186 // add to readdir cache
1187 if (dirp
->release_count
== diri
->dir_release_count
&&
1188 dirp
->ordered_count
== diri
->dir_ordered_count
&&
1189 dirp
->start_shared_gen
== diri
->shared_gen
) {
1190 if (dirp
->cache_index
== dir
->readdir_cache
.size()) {
1192 ceph_assert(!dirp
->inode
->is_complete_and_ordered());
1193 dir
->readdir_cache
.reserve(dirp
->cache_index
+ numdn
);
1195 dir
->readdir_cache
.push_back(dn
);
1196 } else if (dirp
->cache_index
< dir
->readdir_cache
.size()) {
1197 if (dirp
->inode
->is_complete_and_ordered())
1198 ceph_assert(dir
->readdir_cache
[dirp
->cache_index
] == dn
);
1200 dir
->readdir_cache
[dirp
->cache_index
] = dn
;
1202 ceph_abort_msg("unexpected readdir buffer idx");
1204 dirp
->cache_index
++;
1206 // add to cached result list
1207 dirp
->buffer
.push_back(dir_result_t::dentry(dn
->offset
, dname
, in
));
1208 ldout(cct
, 15) << __func__
<< " " << hex
<< dn
->offset
<< dec
<< ": '" << dname
<< "' -> " << in
->ino
<< dendl
;
1212 dirp
->last_name
= dname
;
1214 dirp
->next_offset
= 2;
1216 dirp
->next_offset
= readdir_offset
;
1218 if (dir
->is_empty())
1225 * insert a trace from a MDS reply into the cache.
1227 Inode
* Client::insert_trace(MetaRequest
*request
, MetaSession
*session
)
1229 auto& reply
= request
->reply
;
1230 int op
= request
->get_op();
1232 ldout(cct
, 10) << "insert_trace from " << request
->sent_stamp
<< " mds." << session
->mds_num
1233 << " is_target=" << (int)reply
->head
.is_target
1234 << " is_dentry=" << (int)reply
->head
.is_dentry
1237 auto p
= reply
->get_trace_bl().cbegin();
1238 if (request
->got_unsafe
) {
1239 ldout(cct
, 10) << "insert_trace -- already got unsafe; ignoring" << dendl
;
1240 ceph_assert(p
.end());
1245 ldout(cct
, 10) << "insert_trace -- no trace" << dendl
;
1247 Dentry
*d
= request
->dentry();
1249 Inode
*diri
= d
->dir
->parent_inode
;
1250 diri
->dir_release_count
++;
1251 clear_dir_complete_and_ordered(diri
, true);
1254 if (d
&& reply
->get_result() == 0) {
1255 if (op
== CEPH_MDS_OP_RENAME
) {
1257 Dentry
*od
= request
->old_dentry();
1258 ldout(cct
, 10) << " unlinking rename src dn " << od
<< " for traceless reply" << dendl
;
1260 unlink(od
, true, true); // keep dir, dentry
1261 } else if (op
== CEPH_MDS_OP_RMDIR
||
1262 op
== CEPH_MDS_OP_UNLINK
) {
1264 ldout(cct
, 10) << " unlinking unlink/rmdir dn " << d
<< " for traceless reply" << dendl
;
1265 unlink(d
, true, true); // keep dir, dentry
1271 ConnectionRef con
= request
->reply
->get_connection();
1273 if (session
->mds_features
.test(CEPHFS_FEATURE_REPLY_ENCODING
)) {
1274 features
= (uint64_t)-1;
1277 features
= con
->get_features();
1279 ldout(cct
, 10) << " features 0x" << hex
<< features
<< dec
<< dendl
;
1282 SnapRealm
*realm
= NULL
;
1283 if (reply
->snapbl
.length())
1284 update_snap_trace(reply
->snapbl
, &realm
);
1286 ldout(cct
, 10) << " hrm "
1287 << " is_target=" << (int)reply
->head
.is_target
1288 << " is_dentry=" << (int)reply
->head
.is_dentry
1297 if (reply
->head
.is_dentry
) {
1298 dirst
.decode(p
, features
);
1299 dst
.decode(p
, features
);
1301 dlease
.decode(p
, features
);
1305 if (reply
->head
.is_target
) {
1306 ist
.decode(p
, features
);
1307 if (cct
->_conf
->client_debug_getattr_caps
) {
1308 unsigned wanted
= 0;
1309 if (op
== CEPH_MDS_OP_GETATTR
|| op
== CEPH_MDS_OP_LOOKUP
)
1310 wanted
= request
->head
.args
.getattr
.mask
;
1311 else if (op
== CEPH_MDS_OP_OPEN
|| op
== CEPH_MDS_OP_CREATE
)
1312 wanted
= request
->head
.args
.open
.mask
;
1314 if ((wanted
& CEPH_CAP_XATTR_SHARED
) &&
1315 !(ist
.xattr_version
> 0 && ist
.xattrbl
.length() > 0))
1316 ceph_abort_msg("MDS reply does not contain xattrs");
1319 in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1324 if (reply
->head
.is_dentry
) {
1325 diri
= add_update_inode(&dirst
, request
->sent_stamp
, session
,
1327 update_dir_dist(diri
, &dst
); // dir stat info is attached to ..
1330 Dir
*dir
= diri
->open_dir();
1331 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
,
1332 (op
== CEPH_MDS_OP_RENAME
) ? request
->old_dentry() : NULL
);
1335 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1336 dn
= diri
->dir
->dentries
[dname
];
1338 diri
->dir_ordered_count
++;
1339 clear_dir_complete_and_ordered(diri
, false);
1340 unlink(dn
, true, true); // keep dir, dentry
1343 if (dlease
.duration_ms
> 0) {
1345 Dir
*dir
= diri
->open_dir();
1346 dn
= link(dir
, dname
, NULL
, NULL
);
1348 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
1351 } else if (op
== CEPH_MDS_OP_LOOKUPSNAP
||
1352 op
== CEPH_MDS_OP_MKSNAP
) {
1353 ldout(cct
, 10) << " faking snap lookup weirdness" << dendl
;
1354 // fake it for snap lookup
1355 vinodeno_t vino
= ist
.vino
;
1356 vino
.snapid
= CEPH_SNAPDIR
;
1357 ceph_assert(inode_map
.count(vino
));
1358 diri
= inode_map
[vino
];
1360 string dname
= request
->path
.last_dentry();
1363 dlease
.duration_ms
= 0;
1366 Dir
*dir
= diri
->open_dir();
1367 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
);
1369 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1370 Dentry
*dn
= diri
->dir
->dentries
[dname
];
1372 unlink(dn
, true, true); // keep dir, dentry
1378 if (op
== CEPH_MDS_OP_READDIR
||
1379 op
== CEPH_MDS_OP_LSSNAP
) {
1380 insert_readdir_results(request
, session
, in
);
1381 } else if (op
== CEPH_MDS_OP_LOOKUPNAME
) {
1382 // hack: return parent inode instead
1386 if (request
->dentry() == NULL
&& in
!= request
->inode()) {
1387 // pin the target inode if its parent dentry is not pinned
1388 request
->set_other_inode(in
);
1393 put_snap_realm(realm
);
1395 request
->target
= in
;
1401 mds_rank_t
Client::choose_target_mds(MetaRequest
*req
, Inode
** phash_diri
)
1403 mds_rank_t mds
= MDS_RANK_NONE
;
1405 bool is_hash
= false;
1410 if (req
->resend_mds
>= 0) {
1411 mds
= req
->resend_mds
;
1412 req
->resend_mds
= -1;
1413 ldout(cct
, 10) << __func__
<< " resend_mds specified as mds." << mds
<< dendl
;
1417 if (cct
->_conf
->client_use_random_mds
)
1423 ldout(cct
, 20) << __func__
<< " starting with req->inode " << *in
<< dendl
;
1424 if (req
->path
.depth()) {
1425 hash
= in
->hash_dentry_name(req
->path
[0]);
1426 ldout(cct
, 20) << __func__
<< " inode dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1427 << " on " << req
->path
[0]
1428 << " => " << hash
<< dendl
;
1433 in
= de
->inode
.get();
1434 ldout(cct
, 20) << __func__
<< " starting with req->dentry inode " << *in
<< dendl
;
1436 in
= de
->dir
->parent_inode
;
1437 hash
= in
->hash_dentry_name(de
->name
);
1438 ldout(cct
, 20) << __func__
<< " dentry dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1439 << " on " << de
->name
1440 << " => " << hash
<< dendl
;
1445 if (in
->snapid
!= CEPH_NOSNAP
) {
1446 ldout(cct
, 10) << __func__
<< " " << *in
<< " is snapped, using nonsnap parent" << dendl
;
1447 while (in
->snapid
!= CEPH_NOSNAP
) {
1448 if (in
->snapid
== CEPH_SNAPDIR
)
1449 in
= in
->snapdir_parent
.get();
1450 else if (!in
->dentries
.empty())
1451 /* In most cases there will only be one dentry, so getting it
1452 * will be the correct action. If there are multiple hard links,
1453 * I think the MDS should be able to redirect as needed*/
1454 in
= in
->get_first_parent()->dir
->parent_inode
;
1456 ldout(cct
, 10) << "got unlinked inode, can't look at parent" << dendl
;
1463 ldout(cct
, 20) << __func__
<< " " << *in
<< " is_hash=" << is_hash
1464 << " hash=" << hash
<< dendl
;
1466 if (is_hash
&& S_ISDIR(in
->mode
) && !in
->fragmap
.empty()) {
1467 frag_t fg
= in
->dirfragtree
[hash
];
1468 if (in
->fragmap
.count(fg
)) {
1469 mds
= in
->fragmap
[fg
];
1472 } else if (in
->auth_cap
) {
1473 mds
= in
->auth_cap
->session
->mds_num
;
1476 ldout(cct
, 10) << __func__
<< " from dirfragtree hash" << dendl
;
1481 if (in
->auth_cap
&& req
->auth_is_best()) {
1482 mds
= in
->auth_cap
->session
->mds_num
;
1483 } else if (!in
->caps
.empty()) {
1484 mds
= in
->caps
.begin()->second
.session
->mds_num
;
1488 ldout(cct
, 10) << __func__
<< " from caps on inode " << *in
<< dendl
;
1495 mds
= _get_random_up_mds();
1496 ldout(cct
, 10) << "did not get mds through better means, so chose random mds " << mds
<< dendl
;
1500 ldout(cct
, 20) << "mds is " << mds
<< dendl
;
1505 void Client::connect_mds_targets(mds_rank_t mds
)
1507 ldout(cct
, 10) << __func__
<< " for mds." << mds
<< dendl
;
1508 ceph_assert(mds_sessions
.count(mds
));
1509 const MDSMap::mds_info_t
& info
= mdsmap
->get_mds_info(mds
);
1510 for (set
<mds_rank_t
>::const_iterator q
= info
.export_targets
.begin();
1511 q
!= info
.export_targets
.end();
1513 if (mds_sessions
.count(*q
) == 0 &&
1514 mdsmap
->is_clientreplay_or_active_or_stopping(*q
)) {
1515 ldout(cct
, 10) << "check_mds_sessions opening mds." << mds
1516 << " export target mds." << *q
<< dendl
;
1517 _open_mds_session(*q
);
// Dump (via Formatter) this client's identity plus a summary of every MDS
// session it holds.  NOTE(review): this is a mangled extract -- the
// embedded numbering shows orig lines 1532-1535 are elided here,
// presumably the per-session dump and section closes; confirm against the
// full file.
1522 void Client::dump_mds_sessions(Formatter
*f
)
// Our global client id.
1524 f
->dump_int("id", get_nodeid().v
);
// Messenger identity/address, dumped as object, string, and bare addr.
1525 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr_legacy());
1526 f
->dump_object("inst", inst
);
1527 f
->dump_stream("inst_str") << inst
;
1528 f
->dump_stream("addr_str") << inst
.addr
;
// One object per open MDS session.
1529 f
->open_array_section("sessions");
1530 for (const auto &p
: mds_sessions
) {
1531 f
->open_object_section("session");
// The mdsmap epoch this snapshot was taken against.
1536 f
->dump_int("mdsmap_epoch", mdsmap
->get_epoch());
// Dump one Formatter object per in-flight MDS request, walking the
// mds_requests map (tid -> MetaRequest*).  NOTE(review): mangled extract;
// the loop increment (orig 1542) and the per-request dump/close lines are
// elided -- confirm against the full file.
1538 void Client::dump_mds_requests(Formatter
*f
)
1540 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
1541 p
!= mds_requests
.end();
1543 f
->open_object_section("request");
1549 int Client::verify_reply_trace(int r
, MetaSession
*session
,
1550 MetaRequest
*request
, const MConstRef
<MClientReply
>& reply
,
1551 InodeRef
*ptarget
, bool *pcreated
,
1552 const UserPerm
& perms
)
1554 // check whether this request actually did the create, and set created flag
1555 bufferlist extra_bl
;
1556 inodeno_t created_ino
;
1557 bool got_created_ino
= false;
1558 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
;
1560 extra_bl
= reply
->get_extra_bl();
1561 if (extra_bl
.length() >= 8) {
1562 if (session
->mds_features
.test(CEPHFS_FEATURE_DELEG_INO
)) {
1563 struct openc_response_t ocres
;
1565 decode(ocres
, extra_bl
);
1566 created_ino
= ocres
.created_ino
;
1568 * The userland cephfs client doesn't have a way to do an async create
1569 * (yet), so just discard delegated_inos for now. Eventually we should
1570 * store them and use them in create calls, even if they are synchronous,
1571 * if only for testing purposes.
1573 ldout(cct
, 10) << "delegated_inos: " << ocres
.delegated_inos
<< dendl
;
1575 // u64 containing number of created ino
1576 decode(created_ino
, extra_bl
);
1578 ldout(cct
, 10) << "make_request created ino " << created_ino
<< dendl
;
1579 got_created_ino
= true;
1583 *pcreated
= got_created_ino
;
1585 if (request
->target
) {
1586 *ptarget
= request
->target
;
1587 ldout(cct
, 20) << "make_request target is " << *ptarget
->get() << dendl
;
1589 if (got_created_ino
&& (p
= inode_map
.find(vinodeno_t(created_ino
, CEPH_NOSNAP
))) != inode_map
.end()) {
1590 (*ptarget
) = p
->second
;
1591 ldout(cct
, 20) << "make_request created, target is " << *ptarget
->get() << dendl
;
1593 // we got a traceless reply, and need to look up what we just
1594 // created. for now, do this by name. someday, do this by the
1595 // ino... which we know! FIXME.
1597 Dentry
*d
= request
->dentry();
1600 ldout(cct
, 10) << "make_request got traceless reply, looking up #"
1601 << d
->dir
->parent_inode
->ino
<< "/" << d
->name
1602 << " got_ino " << got_created_ino
1603 << " ino " << created_ino
1605 r
= _do_lookup(d
->dir
->parent_inode
, d
->name
, request
->regetattr_mask
,
1608 // if the dentry is not linked, just do our best. see #5021.
1609 ceph_abort_msg("how did this happen? i want logs!");
1612 Inode
*in
= request
->inode();
1613 ldout(cct
, 10) << "make_request got traceless reply, forcing getattr on #"
1614 << in
->ino
<< dendl
;
1615 r
= _getattr(in
, request
->regetattr_mask
, perms
, true);
1619 // verify ino returned in reply and trace_dist are the same
1620 if (got_created_ino
&&
1621 created_ino
.val
!= target
->ino
.val
) {
1622 ldout(cct
, 5) << "create got ino " << created_ino
<< " but then failed on lookup; EINTR?" << dendl
;
1626 ptarget
->swap(target
);
1638 * Blocking helper to make an MDS request.
1640 * If the ptarget flag is set, behavior changes slightly: the caller
1641 * expects to get a pointer to the inode we are creating or operating
1642 * on. As a result, we will follow up any traceless mutation reply
1643 * with a getattr or lookup to transparently handle a traceless reply
1644 * from the MDS (as when the MDS restarts and the client has to replay
1647 * @param request the MetaRequest to execute
1648 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1649 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1650 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1651 * @param use_mds [optional] prefer a specific mds (-1 for default)
1652 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1654 int Client::make_request(MetaRequest
*request
,
1655 const UserPerm
& perms
,
1656 InodeRef
*ptarget
, bool *pcreated
,
1662 // assign a unique tid
1663 ceph_tid_t tid
= ++last_tid
;
1664 request
->set_tid(tid
);
1667 request
->op_stamp
= ceph_clock_now();
1670 mds_requests
[tid
] = request
->get();
1671 if (oldest_tid
== 0 && request
->get_op() != CEPH_MDS_OP_SETFILELOCK
)
1674 request
->set_caller_perms(perms
);
1676 if (cct
->_conf
->client_inject_fixed_oldest_tid
) {
1677 ldout(cct
, 20) << __func__
<< " injecting fixed oldest_client_tid(1)" << dendl
;
1678 request
->set_oldest_client_tid(1);
1680 request
->set_oldest_client_tid(oldest_tid
);
1685 request
->resend_mds
= use_mds
;
1687 MetaSession
*session
= NULL
;
1689 if (request
->aborted())
1693 request
->abort(-EBLACKLISTED
);
1698 ceph::condition_variable caller_cond
;
1699 request
->caller_cond
= &caller_cond
;
1702 Inode
*hash_diri
= NULL
;
1703 mds_rank_t mds
= choose_target_mds(request
, &hash_diri
);
1704 int mds_state
= (mds
== MDS_RANK_NONE
) ? MDSMap::STATE_NULL
: mdsmap
->get_state(mds
);
1705 if (mds_state
!= MDSMap::STATE_ACTIVE
&& mds_state
!= MDSMap::STATE_STOPPING
) {
1706 if (mds_state
== MDSMap::STATE_NULL
&& mds
>= mdsmap
->get_max_mds()) {
1708 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, remove it from fragmap" << dendl
;
1709 _fragmap_remove_stopped_mds(hash_diri
, mds
);
1711 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, trying a random mds" << dendl
;
1712 request
->resend_mds
= _get_random_up_mds();
1715 ldout(cct
, 10) << " target mds." << mds
<< " not active, waiting for new mdsmap" << dendl
;
1716 wait_on_list(waiting_for_mdsmap
);
1722 if (!have_open_session(mds
)) {
1723 session
= _get_or_open_mds_session(mds
);
1726 if (session
->state
== MetaSession::STATE_OPENING
) {
1727 ldout(cct
, 10) << "waiting for session to mds." << mds
<< " to open" << dendl
;
1728 wait_on_context_list(session
->waiting_for_open
);
1729 // Abort requests on REJECT from MDS
1730 if (rejected_by_mds
.count(mds
)) {
1731 request
->abort(-EPERM
);
1737 if (!have_open_session(mds
))
1740 session
= &mds_sessions
.at(mds
);
1744 send_request(request
, session
);
1747 ldout(cct
, 20) << "awaiting reply|forward|kick on " << &caller_cond
<< dendl
;
1748 request
->kick
= false;
1749 std::unique_lock l
{client_lock
, std::adopt_lock
};
1750 caller_cond
.wait(l
, [request
] {
1751 return (request
->reply
|| // reply
1752 request
->resend_mds
>= 0 || // forward
1756 request
->caller_cond
= nullptr;
1758 // did we get a reply?
1763 if (!request
->reply
) {
1764 ceph_assert(request
->aborted());
1765 ceph_assert(!request
->got_unsafe
);
1766 r
= request
->get_abort_code();
1767 request
->item
.remove_myself();
1768 unregister_request(request
);
1769 put_request(request
);
1774 auto reply
= std::move(request
->reply
);
1775 r
= reply
->get_result();
1777 request
->success
= true;
1779 // kick dispatcher (we've got it!)
1780 ceph_assert(request
->dispatch_cond
);
1781 request
->dispatch_cond
->notify_all();
1782 ldout(cct
, 20) << "sendrecv kickback on tid " << tid
<< " " << request
->dispatch_cond
<< dendl
;
1783 request
->dispatch_cond
= 0;
1785 if (r
>= 0 && ptarget
)
1786 r
= verify_reply_trace(r
, session
, request
, reply
, ptarget
, pcreated
, perms
);
1789 *pdirbl
= reply
->get_extra_bl();
1792 utime_t lat
= ceph_clock_now();
1793 lat
-= request
->sent_stamp
;
1794 ldout(cct
, 20) << "lat " << lat
<< dendl
;
1795 logger
->tinc(l_c_lat
, lat
);
1796 logger
->tinc(l_c_reply
, lat
);
1798 put_request(request
);
// Remove a finished request from mds_requests and, if it carried the
// oldest advertised tid, advance oldest_tid to the next pending tid that
// is not a SETFILELOCK op (the same exclusion make_request applies when
// initializing oldest_tid).  NOTE(review): mangled extract; the loop/else
// arms around orig lines 1807-1811 and the tail (put_request etc.) are
// elided -- confirm against the full file.
1802 void Client::unregister_request(MetaRequest
*req
)
1804 mds_requests
.erase(req
->tid
);
// Only recompute when the departing request was the oldest one.
1805 if (req
->tid
== oldest_tid
) {
1806 map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.upper_bound(oldest_tid
);
// No remaining candidates (presumably oldest_tid is reset in the elided
// lines).
1808 if (p
== mds_requests
.end()) {
// Skip SETFILELOCK requests when picking the new oldest tid.
1812 if (p
->second
->get_op() != CEPH_MDS_OP_SETFILELOCK
) {
1813 oldest_tid
= p
->first
;
// Drop one reference on the request; on the final reference (_put()
// returns true) release the "other inode" the request pinned and, for
// namespace-removing ops, try to trim that inode from cache so its caps
// return to the MDS promptly.  NOTE(review): mangled extract; the
// declarations of `op`/`other_in` and surrounding lines (orig 1823-1832)
// are elided -- confirm against the full file.
1822 void Client::put_request(MetaRequest
*request
)
1824 if (request
->_put()) {
// Only inspect the op code for requests that completed successfully.
1826 if (request
->success
)
1827 op
= request
->get_op();
// Take over the request's other_inode reference (e.g. a removed dir).
1829 request
->take_other_inode(&other_in
);
// These ops may have just unlinked the inode; trim it eagerly.
1833 (op
== CEPH_MDS_OP_RMDIR
||
1834 op
== CEPH_MDS_OP_RENAME
||
1835 op
== CEPH_MDS_OP_RMSNAP
)) {
1836 _try_to_trim_inode(other_in
.get(), false);
// Encode a cap release for inode `in` into req->cap_releases, destined
// for `mds`.  `drop` is the cap mask we are willing to give up, `unless`
// suppresses the drop while those caps are issued, and `force` appears to
// force a release record regardless (used by encode_dentry_release with
// force=1).  Returns a "released" indicator per the exit log line; the
// actual return statements and several interior lines (orig 1844, 1848,
// 1858, 1860-1863, 1865, 1867, 1872-1873, 1875-1876, 1879-1880) are
// elided in this extract -- NOTE(review): confirm against the full file.
1841 int Client::encode_inode_release(Inode
*in
, MetaRequest
*req
,
1842 mds_rank_t mds
, int drop
,
1843 int unless
, int force
)
1845 ldout(cct
, 20) << __func__
<< " enter(in:" << *in
<< ", req:" << req
1846 << " mds:" << mds
<< ", drop:" << drop
<< ", unless:" << unless
1847 << ", have:" << ", force:" << force
<< ")" << dendl
;
// Only relevant if we actually hold a cap for this inode from `mds`.
1849 auto it
= in
->caps
.find(mds
);
1850 if (it
!= in
->caps
.end()) {
1851 Cap
&cap
= it
->second
;
// Never drop caps that are dirty or currently in use.
1852 drop
&= ~(in
->dirty_caps
| get_caps_used(in
));
// Drop only when the caps are issued and `unless` doesn't veto it.
1853 if ((drop
& cap
.issued
) &&
1854 !(unless
& cap
.issued
)) {
1855 ldout(cct
, 25) << "Dropping caps. Initial " << ccap_string(cap
.issued
) << dendl
;
// Strip the dropped bits from both issued and implemented masks.
1856 cap
.issued
&= ~drop
;
1857 cap
.implemented
&= ~drop
;
1859 ldout(cct
, 25) << "Now have: " << ccap_string(cap
.issued
) << dendl
;
// Build the wire-format release record from the (updated) cap state.
1864 ceph_mds_request_release rel
;
1866 rel
.cap_id
= cap
.cap_id
;
1868 rel
.issue_seq
= cap
.issue_seq
;
1869 rel
.mseq
= cap
.mseq
;
1870 rel
.caps
= cap
.implemented
;
1871 rel
.wanted
= cap
.wanted
;
// Queue the release on the request; empty dname (dentry releases amend
// this record later -- see encode_dentry_release).
1874 req
->cap_releases
.push_back(MClientRequest::Release(rel
,""));
1877 ldout(cct
, 25) << __func__
<< " exit(in:" << *in
<< ") released:"
1878 << released
<< dendl
;
// Encode a dentry lease release for `dn` into `req`, piggybacking on an
// inode cap release of the dentry's parent directory (note force=1 in the
// encode_inode_release call).  NOTE(review): mangled extract; the
// declaration of `released` and its guarding condition (orig 1887-1888)
// are elided -- confirm against the full file.
1882 void Client::encode_dentry_release(Dentry
*dn
, MetaRequest
*req
,
1883 mds_rank_t mds
, int drop
, int unless
)
1885 ldout(cct
, 20) << __func__
<< " enter(dn:"
1886 << dn
<< ")" << dendl
;
// Force a release record for the parent dir inode so there is something
// to attach the dentry name to.
1889 released
= encode_inode_release(dn
->dir
->parent_inode
, req
,
1890 mds
, drop
, unless
, 1);
// If we hold this dentry's lease from the same MDS, hand it back now by
// amending the release record just pushed.
1891 if (released
&& dn
->lease_mds
== mds
) {
1892 ldout(cct
, 25) << "preemptively releasing dn to mds" << dendl
;
1893 auto& rel
= req
->cap_releases
.back();
1894 rel
.item
.dname_len
= dn
->name
.length();
1895 rel
.item
.dname_seq
= dn
->lease_seq
;
1896 rel
.dname
= dn
->name
;
1898 ldout(cct
, 25) << __func__
<< " exit(dn:"
1899 << dn
<< ")" << dendl
;
// Encode cap/lease releases for every inode and dentry the request has
// flagged for dropping (inode, old_inode, other_inode, dentry,
// old_dentry), targeting `mds`.  Called while building the outgoing
// request (see send_request).  NOTE(review): mangled extract; the
// `req->inode_unless` argument line of the first call (orig 1916-1917)
// and the braces are elided -- confirm against the full file.
1904 * This requires the MClientRequest *request member to be set.
1905 * It will error out horribly without one.
1906 * Additionally, if you set any *drop member, you'd better have
1907 * set the corresponding dentry!
1909 void Client::encode_cap_releases(MetaRequest
*req
, mds_rank_t mds
)
1911 ldout(cct
, 20) << __func__
<< " enter (req: "
1912 << req
<< ", mds: " << mds
<< ")" << dendl
;
// Primary inode.
1913 if (req
->inode_drop
&& req
->inode())
1914 encode_inode_release(req
->inode(), req
,
1915 mds
, req
->inode_drop
,
1918 if (req
->old_inode_drop
&& req
->old_inode())
1919 encode_inode_release(req
->old_inode(), req
,
1920 mds
, req
->old_inode_drop
,
1921 req
->old_inode_unless
);
1922 if (req
->other_inode_drop
&& req
->other_inode())
1923 encode_inode_release(req
->other_inode(), req
,
1924 mds
, req
->other_inode_drop
,
1925 req
->other_inode_unless
);
// Dentries (these amend the matching inode release records).
1927 if (req
->dentry_drop
&& req
->dentry())
1928 encode_dentry_release(req
->dentry(), req
,
1929 mds
, req
->dentry_drop
,
1930 req
->dentry_unless
);
1932 if (req
->old_dentry_drop
&& req
->old_dentry())
1933 encode_dentry_release(req
->old_dentry(), req
,
1934 mds
, req
->old_dentry_drop
,
1935 req
->old_dentry_unless
);
1936 ldout(cct
, 25) << __func__
<< " exit (req: "
1937 << req
<< ", mds " << mds
<<dendl
;
1940 bool Client::have_open_session(mds_rank_t mds
)
1942 const auto &it
= mds_sessions
.find(mds
);
1943 return it
!= mds_sessions
.end() &&
1944 (it
->second
.state
== MetaSession::STATE_OPEN
||
1945 it
->second
.state
== MetaSession::STATE_STALE
);
1948 MetaSession
*Client::_get_mds_session(mds_rank_t mds
, Connection
*con
)
1950 const auto &it
= mds_sessions
.find(mds
);
1951 if (it
== mds_sessions
.end() || it
->second
.con
!= con
) {
1958 MetaSession
*Client::_get_or_open_mds_session(mds_rank_t mds
)
1960 auto it
= mds_sessions
.find(mds
);
1961 return it
== mds_sessions
.end() ? _open_mds_session(mds
) : &it
->second
;
1965 * Populate a map of strings with client-identifying metadata,
1966 * such as the hostname. Call this once at initialization.
1968 void Client::populate_metadata(const std::string
&mount_root
)
1974 metadata
["hostname"] = u
.nodename
;
1975 ldout(cct
, 20) << __func__
<< " read hostname '" << u
.nodename
<< "'" << dendl
;
1977 ldout(cct
, 1) << __func__
<< " failed to read hostname (" << cpp_strerror(r
) << ")" << dendl
;
1980 metadata
["pid"] = stringify(getpid());
1982 // Ceph entity id (the '0' in "client.0")
1983 metadata
["entity_id"] = cct
->_conf
->name
.get_id();
1985 // Our mount position
1986 if (!mount_root
.empty()) {
1987 metadata
["root"] = mount_root
;
1991 metadata
["ceph_version"] = pretty_version_to_str();
1992 metadata
["ceph_sha1"] = git_version_to_str();
1994 // Apply any metadata from the user's configured overrides
1995 std::vector
<std::string
> tokens
;
1996 get_str_vec(cct
->_conf
->client_metadata
, ",", tokens
);
1997 for (const auto &i
: tokens
) {
1998 auto eqpos
= i
.find("=");
1999 // Throw out anything that isn't of the form "<str>=<str>"
2000 if (eqpos
== 0 || eqpos
== std::string::npos
|| eqpos
== i
.size()) {
2001 lderr(cct
) << "Invalid metadata keyval pair: '" << i
<< "'" << dendl
;
2004 metadata
[i
.substr(0, eqpos
)] = i
.substr(eqpos
+ 1);
2009 * Optionally add or override client metadata fields.
// Thread-safe setter for the client-metadata map that is sent to the MDS
// at session open; warns when an existing key is being overwritten.
// NOTE(review): mangled extract; the actual store (orig ~2022, presumably
// metadata[k] = v) is elided -- confirm against the full file.
2011 void Client::update_metadata(std::string
const &k
, std::string
const &v
)
// Serialize against other client operations; must be called after init.
2013 std::lock_guard
l(client_lock
);
2014 ceph_assert(initialized
);
2016 auto it
= metadata
.find(k
);
// Log loudly when clobbering an existing value.
2017 if (it
!= metadata
.end()) {
2018 ldout(cct
, 1) << __func__
<< " warning, overriding metadata field '" << k
2019 << "' from '" << it
->second
<< "' to '" << v
<< "'" << dendl
;
2025 MetaSession
*Client::_open_mds_session(mds_rank_t mds
)
2027 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
2028 auto addrs
= mdsmap
->get_addrs(mds
);
2029 auto em
= mds_sessions
.emplace(std::piecewise_construct
,
2030 std::forward_as_tuple(mds
),
2031 std::forward_as_tuple(mds
, messenger
->connect_to_mds(addrs
), addrs
));
2032 ceph_assert(em
.second
); /* not already present */
2033 MetaSession
*session
= &em
.first
->second
;
2035 // Maybe skip sending a request to open if this MDS daemon
2036 // has previously sent us a REJECT.
2037 if (rejected_by_mds
.count(mds
)) {
2038 if (rejected_by_mds
[mds
] == session
->addrs
) {
2039 ldout(cct
, 4) << __func__
<< " mds." << mds
<< " skipping "
2040 "because we were rejected" << dendl
;
2043 ldout(cct
, 4) << __func__
<< " mds." << mds
<< " old inst "
2044 "rejected us, trying with new inst" << dendl
;
2045 rejected_by_mds
.erase(mds
);
2049 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REQUEST_OPEN
);
2050 m
->metadata
= metadata
;
2051 m
->supported_features
= feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED
);
2052 session
->con
->send_message2(std::move(m
));
2056 void Client::_close_mds_session(MetaSession
*s
)
2058 ldout(cct
, 2) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2059 s
->state
= MetaSession::STATE_CLOSING
;
2060 s
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
2063 void Client::_closed_mds_session(MetaSession
*s
)
2065 ldout(cct
, 5) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2066 s
->state
= MetaSession::STATE_CLOSED
;
2067 s
->con
->mark_down();
2068 signal_context_list(s
->waiting_for_open
);
2069 mount_cond
.notify_all();
2070 remove_session_caps(s
);
2071 kick_requests_closed(s
);
2072 mds_sessions
.erase(s
->mds_num
);
2075 void Client::handle_client_session(const MConstRef
<MClientSession
>& m
)
2077 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2078 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << from
<< dendl
;
2080 MetaSession
*session
= _get_mds_session(from
, m
->get_connection().get());
2082 ldout(cct
, 10) << " discarding session message from sessionless mds " << m
->get_source_inst() << dendl
;
2086 switch (m
->get_op()) {
2087 case CEPH_SESSION_OPEN
:
2089 feature_bitset_t
missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED
);
2090 missing_features
-= m
->supported_features
;
2091 if (!missing_features
.empty()) {
2092 lderr(cct
) << "mds." << from
<< " lacks required features '"
2093 << missing_features
<< "', closing session " << dendl
;
2094 rejected_by_mds
[session
->mds_num
] = session
->addrs
;
2095 _close_mds_session(session
);
2096 _closed_mds_session(session
);
2099 session
->mds_features
= std::move(m
->supported_features
);
2101 renew_caps(session
);
2102 session
->state
= MetaSession::STATE_OPEN
;
2104 mount_cond
.notify_all();
2106 connect_mds_targets(from
);
2107 signal_context_list(session
->waiting_for_open
);
2111 case CEPH_SESSION_CLOSE
:
2112 _closed_mds_session(session
);
2115 case CEPH_SESSION_RENEWCAPS
:
2116 if (session
->cap_renew_seq
== m
->get_seq()) {
2117 bool was_stale
= ceph_clock_now() >= session
->cap_ttl
;
2119 session
->last_cap_renew_request
+ mdsmap
->get_session_timeout();
2121 wake_up_session_caps(session
, false);
2125 case CEPH_SESSION_STALE
:
2126 // invalidate session caps/leases
2128 session
->cap_ttl
= ceph_clock_now();
2129 session
->cap_ttl
-= 1;
2130 renew_caps(session
);
2133 case CEPH_SESSION_RECALL_STATE
:
2134 trim_caps(session
, m
->get_max_caps());
2137 case CEPH_SESSION_FLUSHMSG
:
2138 /* flush cap release */
2139 if (auto& m
= session
->release
; m
) {
2140 session
->con
->send_message2(std::move(m
));
2142 session
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_FLUSHMSG_ACK
, m
->get_seq()));
2145 case CEPH_SESSION_FORCE_RO
:
2146 force_session_readonly(session
);
2149 case CEPH_SESSION_REJECT
:
2151 std::string_view error_str
;
2152 auto it
= m
->metadata
.find("error_string");
2153 if (it
!= m
->metadata
.end())
2154 error_str
= it
->second
;
2156 error_str
= "unknown error";
2157 lderr(cct
) << "mds." << from
<< " rejected us (" << error_str
<< ")" << dendl
;
2159 rejected_by_mds
[session
->mds_num
] = session
->addrs
;
2160 _closed_mds_session(session
);
2169 bool Client::_any_stale_sessions() const
2171 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
2173 for (const auto &p
: mds_sessions
) {
2174 if (p
.second
.state
== MetaSession::STATE_STALE
) {
// Force-close every session currently in STALE state.  The for-header has
// no increment because _closed_mds_session() erases the session from
// mds_sessions; the iterator advance (orig 2188, elided in this extract)
// presumably happens before any potential erase -- NOTE(review): confirm
// against the full file.
2182 void Client::_kick_stale_sessions()
2184 ldout(cct
, 1) << __func__
<< dendl
;
2186 for (auto it
= mds_sessions
.begin(); it
!= mds_sessions
.end(); ) {
2187 MetaSession
&s
= it
->second
;
2189 if (s
.state
== MetaSession::STATE_STALE
)
2190 _closed_mds_session(&s
);
2194 void Client::send_request(MetaRequest
*request
, MetaSession
*session
,
2195 bool drop_cap_releases
)
2198 mds_rank_t mds
= session
->mds_num
;
2199 ldout(cct
, 10) << __func__
<< " rebuilding request " << request
->get_tid()
2200 << " for mds." << mds
<< dendl
;
2201 auto r
= build_client_request(request
);
2202 if (request
->dentry()) {
2203 r
->set_dentry_wanted();
2205 if (request
->got_unsafe
) {
2206 r
->set_replayed_op();
2207 if (request
->target
)
2208 r
->head
.ino
= request
->target
->ino
;
2210 encode_cap_releases(request
, mds
);
2211 if (drop_cap_releases
) // we haven't send cap reconnect yet, drop cap releases
2212 request
->cap_releases
.clear();
2214 r
->releases
.swap(request
->cap_releases
);
2216 r
->set_mdsmap_epoch(mdsmap
->get_epoch());
2217 if (r
->head
.op
== CEPH_MDS_OP_SETXATTR
) {
2218 objecter
->with_osdmap([r
](const OSDMap
& o
) {
2219 r
->set_osdmap_epoch(o
.get_epoch());
2223 if (request
->mds
== -1) {
2224 request
->sent_stamp
= ceph_clock_now();
2225 ldout(cct
, 20) << __func__
<< " set sent_stamp to " << request
->sent_stamp
<< dendl
;
2229 Inode
*in
= request
->inode();
2231 auto it
= in
->caps
.find(mds
);
2232 if (it
!= in
->caps
.end()) {
2233 request
->sent_on_mseq
= it
->second
.mseq
;
2237 session
->requests
.push_back(&request
->item
);
2239 ldout(cct
, 10) << __func__
<< " " << *r
<< " to mds." << mds
<< dendl
;
2240 session
->con
->send_message2(std::move(r
));
2243 ref_t
<MClientRequest
> Client::build_client_request(MetaRequest
*request
)
2245 auto req
= make_message
<MClientRequest
>(request
->get_op());
2246 req
->set_tid(request
->tid
);
2247 req
->set_stamp(request
->op_stamp
);
2248 memcpy(&req
->head
, &request
->head
, sizeof(ceph_mds_request_head
));
2250 // if the filepath's haven't been set, set them!
2251 if (request
->path
.empty()) {
2252 Inode
*in
= request
->inode();
2253 Dentry
*de
= request
->dentry();
2255 in
->make_nosnap_relative_path(request
->path
);
2258 de
->inode
->make_nosnap_relative_path(request
->path
);
2260 de
->dir
->parent_inode
->make_nosnap_relative_path(request
->path
);
2261 request
->path
.push_dentry(de
->name
);
2263 else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2264 << " No path, inode, or appropriately-endowed dentry given!"
2266 } else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2267 << " No path, inode, or dentry given!"
2270 req
->set_filepath(request
->get_filepath());
2271 req
->set_filepath2(request
->get_filepath2());
2272 req
->set_data(request
->data
);
2273 req
->set_retry_attempt(request
->retry_attempt
++);
2274 req
->head
.num_fwd
= request
->num_fwd
;
2276 int gid_count
= request
->perms
.get_gids(&_gids
);
2277 req
->set_gid_list(gid_count
, _gids
);
2283 void Client::handle_client_request_forward(const MConstRef
<MClientRequestForward
>& fwd
)
2285 mds_rank_t mds
= mds_rank_t(fwd
->get_source().num());
2286 MetaSession
*session
= _get_mds_session(mds
, fwd
->get_connection().get());
2290 ceph_tid_t tid
= fwd
->get_tid();
2292 if (mds_requests
.count(tid
) == 0) {
2293 ldout(cct
, 10) << __func__
<< " no pending request on tid " << tid
<< dendl
;
2297 MetaRequest
*request
= mds_requests
[tid
];
2298 ceph_assert(request
);
2300 // reset retry counter
2301 request
->retry_attempt
= 0;
2303 // request not forwarded, or dest mds has no session.
2305 ldout(cct
, 10) << __func__
<< " tid " << tid
2306 << " fwd " << fwd
->get_num_fwd()
2307 << " to mds." << fwd
->get_dest_mds()
2308 << ", resending to " << fwd
->get_dest_mds()
2312 request
->item
.remove_myself();
2313 request
->num_fwd
= fwd
->get_num_fwd();
2314 request
->resend_mds
= fwd
->get_dest_mds();
2315 request
->caller_cond
->notify_all();
2318 bool Client::is_dir_operation(MetaRequest
*req
)
2320 int op
= req
->get_op();
2321 if (op
== CEPH_MDS_OP_MKNOD
|| op
== CEPH_MDS_OP_LINK
||
2322 op
== CEPH_MDS_OP_UNLINK
|| op
== CEPH_MDS_OP_RENAME
||
2323 op
== CEPH_MDS_OP_MKDIR
|| op
== CEPH_MDS_OP_RMDIR
||
2324 op
== CEPH_MDS_OP_SYMLINK
|| op
== CEPH_MDS_OP_CREATE
)
2329 void Client::handle_client_reply(const MConstRef
<MClientReply
>& reply
)
2331 mds_rank_t mds_num
= mds_rank_t(reply
->get_source().num());
2332 MetaSession
*session
= _get_mds_session(mds_num
, reply
->get_connection().get());
2337 ceph_tid_t tid
= reply
->get_tid();
2338 bool is_safe
= reply
->is_safe();
2340 if (mds_requests
.count(tid
) == 0) {
2341 lderr(cct
) << __func__
<< " no pending request on tid " << tid
2342 << " safe is:" << is_safe
<< dendl
;
2345 MetaRequest
*request
= mds_requests
.at(tid
);
2347 ldout(cct
, 20) << __func__
<< " got a reply. Safe:" << is_safe
2348 << " tid " << tid
<< dendl
;
2350 if (request
->got_unsafe
&& !is_safe
) {
2351 //duplicate response
2352 ldout(cct
, 0) << "got a duplicate reply on tid " << tid
<< " from mds "
2353 << mds_num
<< " safe:" << is_safe
<< dendl
;
2357 if (-ESTALE
== reply
->get_result()) { // see if we can get to proper MDS
2358 ldout(cct
, 20) << "got ESTALE on tid " << request
->tid
2359 << " from mds." << request
->mds
<< dendl
;
2360 request
->send_to_auth
= true;
2361 request
->resend_mds
= choose_target_mds(request
);
2362 Inode
*in
= request
->inode();
2363 std::map
<mds_rank_t
, Cap
>::const_iterator it
;
2364 if (request
->resend_mds
>= 0 &&
2365 request
->resend_mds
== request
->mds
&&
2367 (it
= in
->caps
.find(request
->resend_mds
)) != in
->caps
.end() ||
2368 request
->sent_on_mseq
== it
->second
.mseq
)) {
2369 ldout(cct
, 20) << "have to return ESTALE" << dendl
;
2371 request
->caller_cond
->notify_all();
2376 ceph_assert(!request
->reply
);
2377 request
->reply
= reply
;
2378 insert_trace(request
, session
);
2380 // Handle unsafe reply
2382 request
->got_unsafe
= true;
2383 session
->unsafe_requests
.push_back(&request
->unsafe_item
);
2384 if (is_dir_operation(request
)) {
2385 Inode
*dir
= request
->inode();
2387 dir
->unsafe_ops
.push_back(&request
->unsafe_dir_item
);
2389 if (request
->target
) {
2390 InodeRef
&in
= request
->target
;
2391 in
->unsafe_ops
.push_back(&request
->unsafe_target_item
);
2395 // Only signal the caller once (on the first reply):
2396 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2397 if (!is_safe
|| !request
->got_unsafe
) {
2398 ceph::condition_variable cond
;
2399 request
->dispatch_cond
= &cond
;
2402 ldout(cct
, 20) << __func__
<< " signalling caller " << (void*)request
->caller_cond
<< dendl
;
2403 request
->caller_cond
->notify_all();
2405 // wake for kick back
2406 std::unique_lock l
{client_lock
, std::adopt_lock
};
2407 cond
.wait(l
, [tid
, request
, &cond
, this] {
2408 if (request
->dispatch_cond
) {
2409 ldout(cct
, 20) << "handle_client_reply awaiting kickback on tid "
2410 << tid
<< " " << &cond
<< dendl
;
2412 return !request
->dispatch_cond
;
2418 // the filesystem change is committed to disk
2419 // we're done, clean up
2420 if (request
->got_unsafe
) {
2421 request
->unsafe_item
.remove_myself();
2422 request
->unsafe_dir_item
.remove_myself();
2423 request
->unsafe_target_item
.remove_myself();
2424 signal_cond_list(request
->waitfor_safe
);
2426 request
->item
.remove_myself();
2427 unregister_request(request
);
2430 mount_cond
.notify_all();
2433 void Client::_handle_full_flag(int64_t pool
)
2435 ldout(cct
, 1) << __func__
<< ": FULL: cancelling outstanding operations "
2436 << "on " << pool
<< dendl
;
2437 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2438 // to do this rather than blocking, because otherwise when we fill up we
2439 // potentially lock caps forever on files with dirty pages, and we need
2440 // to be able to release those caps to the MDS so that it can delete files
2441 // and free up space.
2442 epoch_t cancelled_epoch
= objecter
->op_cancel_writes(-ENOSPC
, pool
);
2444 // For all inodes with layouts in this pool and a pending flush write op
2445 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2446 // from ObjectCacher so that it doesn't re-issue the write in response to
2447 // the ENOSPC error.
2448 // Fortunately since we're cancelling everything in a given pool, we don't
2449 // need to know which ops belong to which ObjectSet, we can just blow all
2450 // the un-flushed cached data away and mark any dirty inodes' async_err
2451 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2452 // affecting this pool, and all the objectsets we're purging were also
2454 for (unordered_map
<vinodeno_t
,Inode
*>::iterator i
= inode_map
.begin();
2455 i
!= inode_map
.end(); ++i
)
2457 Inode
*inode
= i
->second
;
2458 if (inode
->oset
.dirty_or_tx
2459 && (pool
== -1 || inode
->layout
.pool_id
== pool
)) {
2460 ldout(cct
, 4) << __func__
<< ": FULL: inode 0x" << std::hex
<< i
->first
<< std::dec
2461 << " has dirty objects, purging and setting ENOSPC" << dendl
;
2462 objectcacher
->purge_set(&inode
->oset
);
2463 inode
->set_async_err(-ENOSPC
);
2467 if (cancelled_epoch
!= (epoch_t
)-1) {
2468 set_cap_epoch_barrier(cancelled_epoch
);
2472 void Client::handle_osd_map(const MConstRef
<MOSDMap
>& m
)
2474 std::set
<entity_addr_t
> new_blacklists
;
2475 objecter
->consume_blacklist_events(&new_blacklists
);
2477 const auto myaddrs
= messenger
->get_myaddrs();
2478 bool new_blacklist
= false;
2479 bool prenautilus
= objecter
->with_osdmap(
2480 [&](const OSDMap
& o
) {
2481 return o
.require_osd_release
< ceph_release_t::nautilus
;
2484 for (auto a
: myaddrs
.v
) {
2485 // blacklist entries are always TYPE_ANY for nautilus+
2486 a
.set_type(entity_addr_t::TYPE_ANY
);
2487 if (new_blacklists
.count(a
)) {
2488 new_blacklist
= true;
2492 // ...except pre-nautilus, they were TYPE_LEGACY
2493 a
.set_type(entity_addr_t::TYPE_LEGACY
);
2494 if (new_blacklists
.count(a
)) {
2495 new_blacklist
= true;
2501 if (new_blacklist
) {
2502 auto epoch
= objecter
->with_osdmap([](const OSDMap
&o
){
2503 return o
.get_epoch();
2505 lderr(cct
) << "I was blacklisted at osd epoch " << epoch
<< dendl
;
2508 _abort_mds_sessions(-EBLACKLISTED
);
2510 // Since we know all our OSD ops will fail, cancel them all preemtively,
2511 // so that on an unhealthy cluster we can umount promptly even if e.g.
2512 // some PGs were inaccessible.
2513 objecter
->op_cancel_writes(-EBLACKLISTED
);
2515 } else if (blacklisted
) {
2516 // Handle case where we were blacklisted but no longer are
2517 blacklisted
= objecter
->with_osdmap([myaddrs
](const OSDMap
&o
){
2518 return o
.is_blacklisted(myaddrs
);});
2521 // Always subscribe to next osdmap for blacklisted client
2522 // until this client is not blacklisted.
2524 objecter
->maybe_request_map();
2527 if (objecter
->osdmap_full_flag()) {
2528 _handle_full_flag(-1);
2530 // Accumulate local list of full pools so that I can drop
2531 // the objecter lock before re-entering objecter in
2533 std::vector
<int64_t> full_pools
;
2535 objecter
->with_osdmap([&full_pools
](const OSDMap
&o
) {
2536 for (const auto& kv
: o
.get_pools()) {
2537 if (kv
.second
.has_flag(pg_pool_t::FLAG_FULL
)) {
2538 full_pools
.push_back(kv
.first
);
2543 for (auto p
: full_pools
)
2544 _handle_full_flag(p
);
2546 // Subscribe to subsequent maps to watch for the full flag going
2547 // away. For the global full flag objecter does this for us, but
2548 // it pays no attention to the per-pool full flag so in this branch
2549 // we do it ourselves.
2550 if (!full_pools
.empty()) {
2551 objecter
->maybe_request_map();
2557 // ------------------------
2558 // incoming messages
2561 bool Client::ms_dispatch2(const MessageRef
&m
)
2563 std::lock_guard
l(client_lock
);
2565 ldout(cct
, 10) << "inactive, discarding " << *m
<< dendl
;
2569 switch (m
->get_type()) {
2570 // mounting and mds sessions
2571 case CEPH_MSG_MDS_MAP
:
2572 handle_mds_map(ref_cast
<MMDSMap
>(m
));
2574 case CEPH_MSG_FS_MAP
:
2575 handle_fs_map(ref_cast
<MFSMap
>(m
));
2577 case CEPH_MSG_FS_MAP_USER
:
2578 handle_fs_map_user(ref_cast
<MFSMapUser
>(m
));
2580 case CEPH_MSG_CLIENT_SESSION
:
2581 handle_client_session(ref_cast
<MClientSession
>(m
));
2584 case CEPH_MSG_OSD_MAP
:
2585 handle_osd_map(ref_cast
<MOSDMap
>(m
));
2589 case CEPH_MSG_CLIENT_REQUEST_FORWARD
:
2590 handle_client_request_forward(ref_cast
<MClientRequestForward
>(m
));
2592 case CEPH_MSG_CLIENT_REPLY
:
2593 handle_client_reply(ref_cast
<MClientReply
>(m
));
2597 case CEPH_MSG_CLIENT_RECLAIM_REPLY
:
2598 handle_client_reclaim_reply(ref_cast
<MClientReclaimReply
>(m
));
2601 case CEPH_MSG_CLIENT_SNAP
:
2602 handle_snap(ref_cast
<MClientSnap
>(m
));
2604 case CEPH_MSG_CLIENT_CAPS
:
2605 handle_caps(ref_cast
<MClientCaps
>(m
));
2607 case CEPH_MSG_CLIENT_LEASE
:
2608 handle_lease(ref_cast
<MClientLease
>(m
));
2610 case MSG_COMMAND_REPLY
:
2611 if (m
->get_source().type() == CEPH_ENTITY_TYPE_MDS
) {
2612 handle_command_reply(ref_cast
<MCommandReply
>(m
));
2617 case CEPH_MSG_CLIENT_QUOTA
:
2618 handle_quota(ref_cast
<MClientQuota
>(m
));
2627 ldout(cct
, 10) << "unmounting: trim pass, size was " << lru
.lru_get_size()
2628 << "+" << inode_map
.size() << dendl
;
2629 long unsigned size
= lru
.lru_get_size() + inode_map
.size();
2631 if (size
< lru
.lru_get_size() + inode_map
.size()) {
2632 ldout(cct
, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl
;
2633 mount_cond
.notify_all();
2635 ldout(cct
, 10) << "unmounting: trim pass, size still " << lru
.lru_get_size()
2636 << "+" << inode_map
.size() << dendl
;
2643 void Client::handle_fs_map(const MConstRef
<MFSMap
>& m
)
2645 fsmap
.reset(new FSMap(m
->get_fsmap()));
2647 signal_cond_list(waiting_for_fsmap
);
2649 monclient
->sub_got("fsmap", fsmap
->get_epoch());
2652 void Client::handle_fs_map_user(const MConstRef
<MFSMapUser
>& m
)
2654 fsmap_user
.reset(new FSMapUser
);
2655 *fsmap_user
= m
->get_fsmap();
2657 monclient
->sub_got("fsmap.user", fsmap_user
->get_epoch());
2658 signal_cond_list(waiting_for_fsmap
);
2661 void Client::handle_mds_map(const MConstRef
<MMDSMap
>& m
)
2663 mds_gid_t old_inc
, new_inc
;
2664 if (m
->get_epoch() <= mdsmap
->get_epoch()) {
2665 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch()
2666 << " is identical to or older than our "
2667 << mdsmap
->get_epoch() << dendl
;
2671 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch() << dendl
;
2673 std::unique_ptr
<MDSMap
> oldmap(new MDSMap
);
2674 oldmap
.swap(mdsmap
);
2676 mdsmap
->decode(m
->get_encoded());
2678 // Cancel any commands for missing or laggy GIDs
2679 std::list
<ceph_tid_t
> cancel_ops
;
2680 auto &commands
= command_table
.get_commands();
2681 for (const auto &i
: commands
) {
2682 auto &op
= i
.second
;
2683 const mds_gid_t op_mds_gid
= op
.mds_gid
;
2684 if (mdsmap
->is_dne_gid(op_mds_gid
) || mdsmap
->is_laggy_gid(op_mds_gid
)) {
2685 ldout(cct
, 1) << __func__
<< ": cancelling command op " << i
.first
<< dendl
;
2686 cancel_ops
.push_back(i
.first
);
2688 std::ostringstream ss
;
2689 ss
<< "MDS " << op_mds_gid
<< " went away";
2690 *(op
.outs
) = ss
.str();
2692 op
.con
->mark_down();
2694 op
.on_finish
->complete(-ETIMEDOUT
);
2699 for (std::list
<ceph_tid_t
>::iterator i
= cancel_ops
.begin();
2700 i
!= cancel_ops
.end(); ++i
) {
2701 command_table
.erase(*i
);
2705 for (auto p
= mds_sessions
.begin(); p
!= mds_sessions
.end(); ) {
2706 mds_rank_t mds
= p
->first
;
2707 MetaSession
*session
= &p
->second
;
2710 int oldstate
= oldmap
->get_state(mds
);
2711 int newstate
= mdsmap
->get_state(mds
);
2712 if (!mdsmap
->is_up(mds
)) {
2713 session
->con
->mark_down();
2714 } else if (mdsmap
->get_addrs(mds
) != session
->addrs
) {
2715 old_inc
= oldmap
->get_incarnation(mds
);
2716 new_inc
= mdsmap
->get_incarnation(mds
);
2717 if (old_inc
!= new_inc
) {
2718 ldout(cct
, 1) << "mds incarnation changed from "
2719 << old_inc
<< " to " << new_inc
<< dendl
;
2720 oldstate
= MDSMap::STATE_NULL
;
2722 session
->con
->mark_down();
2723 session
->addrs
= mdsmap
->get_addrs(mds
);
2724 // When new MDS starts to take over, notify kernel to trim unused entries
2725 // in its dcache/icache. Hopefully, the kernel will release some unused
2726 // inodes before the new MDS enters reconnect state.
2727 trim_cache_for_reconnect(session
);
2728 } else if (oldstate
== newstate
)
2729 continue; // no change
2731 session
->mds_state
= newstate
;
2732 if (newstate
== MDSMap::STATE_RECONNECT
) {
2733 session
->con
= messenger
->connect_to_mds(session
->addrs
);
2734 send_reconnect(session
);
2735 } else if (newstate
> MDSMap::STATE_RECONNECT
) {
2736 if (oldstate
< MDSMap::STATE_RECONNECT
) {
2737 ldout(cct
, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl
;
2738 _closed_mds_session(session
);
2741 if (newstate
>= MDSMap::STATE_ACTIVE
) {
2742 if (oldstate
< MDSMap::STATE_ACTIVE
) {
2743 // kick new requests
2744 kick_requests(session
);
2745 kick_flushing_caps(session
);
2746 signal_context_list(session
->waiting_for_open
);
2747 wake_up_session_caps(session
, true);
2749 connect_mds_targets(mds
);
2751 } else if (newstate
== MDSMap::STATE_NULL
&&
2752 mds
>= mdsmap
->get_max_mds()) {
2753 _closed_mds_session(session
);
2757 // kick any waiting threads
2758 signal_cond_list(waiting_for_mdsmap
);
2760 monclient
->sub_got("mdsmap", mdsmap
->get_epoch());
2763 void Client::send_reconnect(MetaSession
*session
)
2765 mds_rank_t mds
= session
->mds_num
;
2766 ldout(cct
, 10) << __func__
<< " to mds." << mds
<< dendl
;
2768 // trim unused caps to reduce MDS's cache rejoin time
2769 trim_cache_for_reconnect(session
);
2771 session
->readonly
= false;
2773 session
->release
.reset();
2775 // reset my cap seq number
2777 //connect to the mds' offload targets
2778 connect_mds_targets(mds
);
2779 //make sure unsafe requests get saved
2780 resend_unsafe_requests(session
);
2782 early_kick_flushing_caps(session
);
2784 auto m
= make_message
<MClientReconnect
>();
2785 bool allow_multi
= session
->mds_features
.test(CEPHFS_FEATURE_MULTI_RECONNECT
);
2787 // i have an open session.
2788 ceph::unordered_set
<inodeno_t
> did_snaprealm
;
2789 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
= inode_map
.begin();
2790 p
!= inode_map
.end();
2792 Inode
*in
= p
->second
;
2793 auto it
= in
->caps
.find(mds
);
2794 if (it
!= in
->caps
.end()) {
2796 m
->get_approx_size() >=
2797 static_cast<size_t>((std::numeric_limits
<int>::max() >> 1))) {
2799 session
->con
->send_message2(std::move(m
));
2801 m
= make_message
<MClientReconnect
>();
2804 Cap
&cap
= it
->second
;
2805 ldout(cct
, 10) << " caps on " << p
->first
2806 << " " << ccap_string(cap
.issued
)
2807 << " wants " << ccap_string(in
->caps_wanted())
2810 in
->make_long_path(path
);
2811 ldout(cct
, 10) << " path " << path
<< dendl
;
2814 _encode_filelocks(in
, flockbl
);
2816 cap
.seq
= 0; // reset seq.
2817 cap
.issue_seq
= 0; // reset seq.
2818 cap
.mseq
= 0; // reset seq.
2819 // cap gen should catch up with session cap_gen
2820 if (cap
.gen
< session
->cap_gen
) {
2821 cap
.gen
= session
->cap_gen
;
2822 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
2824 cap
.issued
= cap
.implemented
;
2826 snapid_t snap_follows
= 0;
2827 if (!in
->cap_snaps
.empty())
2828 snap_follows
= in
->cap_snaps
.begin()->first
;
2830 m
->add_cap(p
->first
.ino
,
2832 path
.get_ino(), path
.get_path(), // ino
2833 in
->caps_wanted(), // wanted
2834 cap
.issued
, // issued
2839 if (did_snaprealm
.count(in
->snaprealm
->ino
) == 0) {
2840 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
2841 m
->add_snaprealm(in
->snaprealm
->ino
, in
->snaprealm
->seq
, in
->snaprealm
->parent
);
2842 did_snaprealm
.insert(in
->snaprealm
->ino
);
2848 m
->set_encoding_version(0); // use connection features to choose encoding
2849 session
->con
->send_message2(std::move(m
));
2851 mount_cond
.notify_all();
2853 if (session
->reclaim_state
== MetaSession::RECLAIMING
)
2854 signal_cond_list(waiting_for_reclaim
);
2858 void Client::kick_requests(MetaSession
*session
)
2860 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
2861 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2862 p
!= mds_requests
.end();
2864 MetaRequest
*req
= p
->second
;
2865 if (req
->got_unsafe
)
2867 if (req
->aborted()) {
2868 if (req
->caller_cond
) {
2870 req
->caller_cond
->notify_all();
2874 if (req
->retry_attempt
> 0)
2875 continue; // new requests only
2876 if (req
->mds
== session
->mds_num
) {
2877 send_request(p
->second
, session
);
2882 void Client::resend_unsafe_requests(MetaSession
*session
)
2884 for (xlist
<MetaRequest
*>::iterator iter
= session
->unsafe_requests
.begin();
2887 send_request(*iter
, session
);
2889 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2890 // process completed requests in clientreplay stage.
2891 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2892 p
!= mds_requests
.end();
2894 MetaRequest
*req
= p
->second
;
2895 if (req
->got_unsafe
)
2899 if (req
->retry_attempt
== 0)
2900 continue; // old requests only
2901 if (req
->mds
== session
->mds_num
)
2902 send_request(req
, session
, true);
2906 void Client::wait_unsafe_requests()
2908 list
<MetaRequest
*> last_unsafe_reqs
;
2909 for (const auto &p
: mds_sessions
) {
2910 const MetaSession
&s
= p
.second
;
2911 if (!s
.unsafe_requests
.empty()) {
2912 MetaRequest
*req
= s
.unsafe_requests
.back();
2914 last_unsafe_reqs
.push_back(req
);
2918 for (list
<MetaRequest
*>::iterator p
= last_unsafe_reqs
.begin();
2919 p
!= last_unsafe_reqs
.end();
2921 MetaRequest
*req
= *p
;
2922 if (req
->unsafe_item
.is_on_list())
2923 wait_on_list(req
->waitfor_safe
);
2928 void Client::kick_requests_closed(MetaSession
*session
)
2930 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
2931 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2932 p
!= mds_requests
.end(); ) {
2933 MetaRequest
*req
= p
->second
;
2935 if (req
->mds
== session
->mds_num
) {
2936 if (req
->caller_cond
) {
2938 req
->caller_cond
->notify_all();
2940 req
->item
.remove_myself();
2941 if (req
->got_unsafe
) {
2942 lderr(cct
) << __func__
<< " removing unsafe request " << req
->get_tid() << dendl
;
2943 req
->unsafe_item
.remove_myself();
2944 if (is_dir_operation(req
)) {
2945 Inode
*dir
= req
->inode();
2947 dir
->set_async_err(-EIO
);
2948 lderr(cct
) << "kick_requests_closed drop req of inode(dir) : "
2949 << dir
->ino
<< " " << req
->get_tid() << dendl
;
2950 req
->unsafe_dir_item
.remove_myself();
2953 InodeRef
&in
= req
->target
;
2954 in
->set_async_err(-EIO
);
2955 lderr(cct
) << "kick_requests_closed drop req of inode : "
2956 << in
->ino
<< " " << req
->get_tid() << dendl
;
2957 req
->unsafe_target_item
.remove_myself();
2959 signal_cond_list(req
->waitfor_safe
);
2960 unregister_request(req
);
2964 ceph_assert(session
->requests
.empty());
2965 ceph_assert(session
->unsafe_requests
.empty());
2975 void Client::got_mds_push(MetaSession
*s
)
2978 ldout(cct
, 10) << " mds." << s
->mds_num
<< " seq now " << s
->seq
<< dendl
;
2979 if (s
->state
== MetaSession::STATE_CLOSING
) {
2980 s
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
2984 void Client::handle_lease(const MConstRef
<MClientLease
>& m
)
2986 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
2988 ceph_assert(m
->get_action() == CEPH_MDS_LEASE_REVOKE
);
2990 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
2991 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
2996 got_mds_push(session
);
2998 ceph_seq_t seq
= m
->get_seq();
3001 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
3002 if (inode_map
.count(vino
) == 0) {
3003 ldout(cct
, 10) << " don't have vino " << vino
<< dendl
;
3006 in
= inode_map
[vino
];
3008 if (m
->get_mask() & CEPH_LEASE_VALID
) {
3009 if (!in
->dir
|| in
->dir
->dentries
.count(m
->dname
) == 0) {
3010 ldout(cct
, 10) << " don't have dir|dentry " << m
->get_ino() << "/" << m
->dname
<<dendl
;
3013 Dentry
*dn
= in
->dir
->dentries
[m
->dname
];
3014 ldout(cct
, 10) << " revoked DN lease on " << dn
<< dendl
;
3020 auto reply
= make_message
<MClientLease
>(CEPH_MDS_LEASE_RELEASE
, seq
,
3021 m
->get_mask(), m
->get_ino(),
3022 m
->get_first(), m
->get_last(), m
->dname
);
3023 m
->get_connection()->send_message2(std::move(reply
));
3027 void Client::put_inode(Inode
*in
, int n
)
3029 ldout(cct
, 10) << __func__
<< " on " << *in
<< dendl
;
3030 int left
= in
->_put(n
);
3033 remove_all_caps(in
);
3035 ldout(cct
, 10) << __func__
<< " deleting " << *in
<< dendl
;
3036 bool unclean
= objectcacher
->release_set(&in
->oset
);
3037 ceph_assert(!unclean
);
3038 inode_map
.erase(in
->vino());
3039 if (use_faked_inos())
3040 _release_faked_ino(in
);
3045 while (!root_parents
.empty())
3046 root_parents
.erase(root_parents
.begin());
3053 void Client::close_dir(Dir
*dir
)
3055 Inode
*in
= dir
->parent_inode
;
3056 ldout(cct
, 15) << __func__
<< " dir " << dir
<< " on " << in
<< dendl
;
3057 ceph_assert(dir
->is_empty());
3058 ceph_assert(in
->dir
== dir
);
3059 ceph_assert(in
->dentries
.size() < 2); // dirs can't be hard-linked
3060 if (!in
->dentries
.empty())
3061 in
->get_first_parent()->put(); // unpin dentry
3065 put_inode(in
); // unpin inode
3069 * Don't call this with in==NULL, use get_or_create for that
3070 * leave dn set to default NULL unless you're trying to add
3071 * a new inode to a pre-created Dentry
3073 Dentry
* Client::link(Dir
*dir
, const string
& name
, Inode
*in
, Dentry
*dn
)
3076 // create a new Dentry
3077 dn
= new Dentry(dir
, name
);
3079 lru
.lru_insert_mid(dn
); // mid or top?
3081 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
3082 << " dn " << dn
<< " (new dn)" << dendl
;
3084 ceph_assert(!dn
->inode
);
3085 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
3086 << " dn " << dn
<< " (old dn)" << dendl
;
3089 if (in
) { // link to inode
3091 // only one parent for directories!
3092 if (in
->is_dir() && !in
->dentries
.empty()) {
3093 tmp_ref
= in
; // prevent unlink below from freeing the inode.
3094 Dentry
*olddn
= in
->get_first_parent();
3095 ceph_assert(olddn
->dir
!= dir
|| olddn
->name
!= name
);
3096 Inode
*old_diri
= olddn
->dir
->parent_inode
;
3097 old_diri
->dir_release_count
++;
3098 clear_dir_complete_and_ordered(old_diri
, true);
3099 unlink(olddn
, true, true); // keep dir, dentry
3103 ldout(cct
, 20) << "link inode " << in
<< " parents now " << in
->dentries
<< dendl
;
3109 void Client::unlink(Dentry
*dn
, bool keepdir
, bool keepdentry
)
3111 InodeRef
in(dn
->inode
);
3112 ldout(cct
, 15) << "unlink dir " << dn
->dir
->parent_inode
<< " '" << dn
->name
<< "' dn " << dn
3113 << " inode " << dn
->inode
<< dendl
;
3115 // unlink from inode
3118 ldout(cct
, 20) << "unlink inode " << in
<< " parents now " << in
->dentries
<< dendl
;
3124 ldout(cct
, 15) << "unlink removing '" << dn
->name
<< "' dn " << dn
<< dendl
;
3134 if (dir
->is_empty() && !keepdir
)
3140 * For asynchronous flushes, check for errors from the IO and
3141 * update the inode if necessary
3143 class C_Client_FlushComplete
: public Context
{
3148 C_Client_FlushComplete(Client
*c
, Inode
*in
) : client(c
), inode(in
) { }
3149 void finish(int r
) override
{
3150 ceph_assert(ceph_mutex_is_locked_by_me(client
->client_lock
));
3152 client_t
const whoami
= client
->whoami
; // For the benefit of ldout prefix
3153 ldout(client
->cct
, 1) << "I/O error from flush on inode " << inode
3154 << " 0x" << std::hex
<< inode
->ino
<< std::dec
3155 << ": " << r
<< "(" << cpp_strerror(r
) << ")" << dendl
;
3156 inode
->set_async_err(r
);
3166 void Client::get_cap_ref(Inode
*in
, int cap
)
3168 if ((cap
& CEPH_CAP_FILE_BUFFER
) &&
3169 in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] == 0) {
3170 ldout(cct
, 5) << __func__
<< " got first FILE_BUFFER ref on " << *in
<< dendl
;
3173 if ((cap
& CEPH_CAP_FILE_CACHE
) &&
3174 in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
3175 ldout(cct
, 5) << __func__
<< " got first FILE_CACHE ref on " << *in
<< dendl
;
3178 in
->get_cap_ref(cap
);
3181 void Client::put_cap_ref(Inode
*in
, int cap
)
3183 int last
= in
->put_cap_ref(cap
);
3186 int drop
= last
& ~in
->caps_issued();
3187 if (in
->snapid
== CEPH_NOSNAP
) {
3188 if ((last
& CEPH_CAP_FILE_WR
) &&
3189 !in
->cap_snaps
.empty() &&
3190 in
->cap_snaps
.rbegin()->second
.writing
) {
3191 ldout(cct
, 10) << __func__
<< " finishing pending cap_snap on " << *in
<< dendl
;
3192 in
->cap_snaps
.rbegin()->second
.writing
= 0;
3193 finish_cap_snap(in
, in
->cap_snaps
.rbegin()->second
, get_caps_used(in
));
3194 signal_cond_list(in
->waitfor_caps
); // wake up blocked sync writers
3196 if (last
& CEPH_CAP_FILE_BUFFER
) {
3197 for (auto &p
: in
->cap_snaps
)
3198 p
.second
.dirty_data
= 0;
3199 signal_cond_list(in
->waitfor_commit
);
3200 ldout(cct
, 5) << __func__
<< " dropped last FILE_BUFFER ref on " << *in
<< dendl
;
3204 if (last
& CEPH_CAP_FILE_CACHE
) {
3205 ldout(cct
, 5) << __func__
<< " dropped last FILE_CACHE ref on " << *in
<< dendl
;
3211 put_inode(in
, put_nref
);
3215 int Client::get_caps(Inode
*in
, int need
, int want
, int *phave
, loff_t endoff
)
3217 int r
= check_pool_perm(in
, need
);
3222 int file_wanted
= in
->caps_file_wanted();
3223 if ((file_wanted
& need
) != need
) {
3224 ldout(cct
, 10) << "get_caps " << *in
<< " need " << ccap_string(need
)
3225 << " file_wanted " << ccap_string(file_wanted
) << ", EBADF "
3231 int have
= in
->caps_issued(&implemented
);
3233 bool waitfor_caps
= false;
3234 bool waitfor_commit
= false;
3236 if (have
& need
& CEPH_CAP_FILE_WR
) {
3238 (endoff
>= (loff_t
)in
->max_size
||
3239 endoff
> (loff_t
)(in
->size
<< 1)) &&
3240 endoff
> (loff_t
)in
->wanted_max_size
) {
3241 ldout(cct
, 10) << "wanted_max_size " << in
->wanted_max_size
<< " -> " << endoff
<< dendl
;
3242 in
->wanted_max_size
= endoff
;
3246 if (endoff
>= 0 && endoff
> (loff_t
)in
->max_size
) {
3247 ldout(cct
, 10) << "waiting on max_size, endoff " << endoff
<< " max_size " << in
->max_size
<< " on " << *in
<< dendl
;
3248 waitfor_caps
= true;
3250 if (!in
->cap_snaps
.empty()) {
3251 if (in
->cap_snaps
.rbegin()->second
.writing
) {
3252 ldout(cct
, 10) << "waiting on cap_snap write to complete" << dendl
;
3253 waitfor_caps
= true;
3255 for (auto &p
: in
->cap_snaps
) {
3256 if (p
.second
.dirty_data
) {
3257 waitfor_commit
= true;
3261 if (waitfor_commit
) {
3262 _flush(in
, new C_Client_FlushComplete(this, in
));
3263 ldout(cct
, 10) << "waiting for WRBUFFER to get dropped" << dendl
;
3268 if (!waitfor_caps
&& !waitfor_commit
) {
3269 if ((have
& need
) == need
) {
3270 int revoking
= implemented
& ~have
;
3271 ldout(cct
, 10) << "get_caps " << *in
<< " have " << ccap_string(have
)
3272 << " need " << ccap_string(need
) << " want " << ccap_string(want
)
3273 << " revoking " << ccap_string(revoking
)
3275 if ((revoking
& want
) == 0) {
3276 *phave
= need
| (have
& want
);
3277 in
->get_cap_ref(need
);
3281 ldout(cct
, 10) << "waiting for caps " << *in
<< " need " << ccap_string(need
) << " want " << ccap_string(want
) << dendl
;
3282 waitfor_caps
= true;
3285 if ((need
& CEPH_CAP_FILE_WR
) && in
->auth_cap
&&
3286 in
->auth_cap
->session
->readonly
)
3289 if (in
->flags
& I_CAP_DROPPED
) {
3290 int mds_wanted
= in
->caps_mds_wanted();
3291 if ((mds_wanted
& need
) != need
) {
3292 int ret
= _renew_caps(in
);
3297 if (!(file_wanted
& ~mds_wanted
))
3298 in
->flags
&= ~I_CAP_DROPPED
;
3302 wait_on_list(in
->waitfor_caps
);
3303 else if (waitfor_commit
)
3304 wait_on_list(in
->waitfor_commit
);
3308 int Client::get_caps_used(Inode
*in
)
3310 unsigned used
= in
->caps_used();
3311 if (!(used
& CEPH_CAP_FILE_CACHE
) &&
3312 !objectcacher
->set_is_empty(&in
->oset
))
3313 used
|= CEPH_CAP_FILE_CACHE
;
3317 void Client::cap_delay_requeue(Inode
*in
)
3319 ldout(cct
, 10) << __func__
<< " on " << *in
<< dendl
;
3320 in
->hold_caps_until
= ceph_clock_now();
3321 in
->hold_caps_until
+= cct
->_conf
->client_caps_release_delay
;
3322 delayed_list
.push_back(&in
->delay_cap_item
);
3325 void Client::send_cap(Inode
*in
, MetaSession
*session
, Cap
*cap
,
3326 int flags
, int used
, int want
, int retain
,
3327 int flush
, ceph_tid_t flush_tid
)
3329 int held
= cap
->issued
| cap
->implemented
;
3330 int revoking
= cap
->implemented
& ~cap
->issued
;
3331 retain
&= ~revoking
;
3332 int dropping
= cap
->issued
& ~retain
;
3333 int op
= CEPH_CAP_OP_UPDATE
;
3335 ldout(cct
, 10) << __func__
<< " " << *in
3336 << " mds." << session
->mds_num
<< " seq " << cap
->seq
3337 << " used " << ccap_string(used
)
3338 << " want " << ccap_string(want
)
3339 << " flush " << ccap_string(flush
)
3340 << " retain " << ccap_string(retain
)
3341 << " held "<< ccap_string(held
)
3342 << " revoking " << ccap_string(revoking
)
3343 << " dropping " << ccap_string(dropping
)
3346 if (cct
->_conf
->client_inject_release_failure
&& revoking
) {
3347 const int would_have_issued
= cap
->issued
& retain
;
3348 const int would_have_implemented
= cap
->implemented
& (cap
->issued
| used
);
3350 // - tell the server we think issued is whatever they issued plus whatever we implemented
3351 // - leave what we have implemented in place
3352 ldout(cct
, 20) << __func__
<< " injecting failure to release caps" << dendl
;
3353 cap
->issued
= cap
->issued
| cap
->implemented
;
3355 // Make an exception for revoking xattr caps: we are injecting
3356 // failure to release other caps, but allow xattr because client
3357 // will block on xattr ops if it can't release these to MDS (#9800)
3358 const int xattr_mask
= CEPH_CAP_XATTR_SHARED
| CEPH_CAP_XATTR_EXCL
;
3359 cap
->issued
^= xattr_mask
& revoking
;
3360 cap
->implemented
^= xattr_mask
& revoking
;
3362 ldout(cct
, 20) << __func__
<< " issued " << ccap_string(cap
->issued
) << " vs " << ccap_string(would_have_issued
) << dendl
;
3363 ldout(cct
, 20) << __func__
<< " implemented " << ccap_string(cap
->implemented
) << " vs " << ccap_string(would_have_implemented
) << dendl
;
3366 cap
->issued
&= retain
;
3367 cap
->implemented
&= cap
->issued
| used
;
3370 snapid_t follows
= 0;
3373 follows
= in
->snaprealm
->get_snap_context().seq
;
3375 auto m
= make_message
<MClientCaps
>(op
,
3378 cap
->cap_id
, cap
->seq
,
3384 m
->caller_uid
= in
->cap_dirtier_uid
;
3385 m
->caller_gid
= in
->cap_dirtier_gid
;
3387 m
->head
.issue_seq
= cap
->issue_seq
;
3388 m
->set_tid(flush_tid
);
3390 m
->head
.uid
= in
->uid
;
3391 m
->head
.gid
= in
->gid
;
3392 m
->head
.mode
= in
->mode
;
3394 m
->head
.nlink
= in
->nlink
;
3396 if (flush
& CEPH_CAP_XATTR_EXCL
) {
3397 encode(in
->xattrs
, m
->xattrbl
);
3398 m
->head
.xattr_version
= in
->xattr_version
;
3402 m
->max_size
= in
->max_size
;
3403 m
->truncate_seq
= in
->truncate_seq
;
3404 m
->truncate_size
= in
->truncate_size
;
3405 m
->mtime
= in
->mtime
;
3406 m
->atime
= in
->atime
;
3407 m
->ctime
= in
->ctime
;
3408 m
->btime
= in
->btime
;
3409 m
->time_warp_seq
= in
->time_warp_seq
;
3410 m
->change_attr
= in
->change_attr
;
3412 if (!(flags
& MClientCaps::FLAG_PENDING_CAPSNAP
) &&
3413 !in
->cap_snaps
.empty() &&
3414 in
->cap_snaps
.rbegin()->second
.flush_tid
== 0)
3415 flags
|= MClientCaps::FLAG_PENDING_CAPSNAP
;
3418 if (flush
& CEPH_CAP_FILE_WR
) {
3419 m
->inline_version
= in
->inline_version
;
3420 m
->inline_data
= in
->inline_data
;
3423 in
->reported_size
= in
->size
;
3424 m
->set_snap_follows(follows
);
3426 if (cap
== in
->auth_cap
) {
3427 m
->set_max_size(in
->wanted_max_size
);
3428 in
->requested_max_size
= in
->wanted_max_size
;
3429 ldout(cct
, 15) << "auth cap, setting max_size = " << in
->requested_max_size
<< dendl
;
3432 if (!session
->flushing_caps_tids
.empty())
3433 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
3435 session
->con
->send_message2(std::move(m
));
3438 static bool is_max_size_approaching(Inode
*in
)
3440 /* mds will adjust max size according to the reported size */
3441 if (in
->flushing_caps
& CEPH_CAP_FILE_WR
)
3443 if (in
->size
>= in
->max_size
)
3445 /* half of previous max_size increment has been used */
3446 if (in
->max_size
> in
->reported_size
&&
3447 (in
->size
<< 1) >= in
->max_size
+ in
->reported_size
)
3452 static int adjust_caps_used_for_lazyio(int used
, int issued
, int implemented
)
3454 if (!(used
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
)))
3456 if (!(implemented
& CEPH_CAP_FILE_LAZYIO
))
3459 if (issued
& CEPH_CAP_FILE_LAZYIO
) {
3460 if (!(issued
& CEPH_CAP_FILE_CACHE
)) {
3461 used
&= ~CEPH_CAP_FILE_CACHE
;
3462 used
|= CEPH_CAP_FILE_LAZYIO
;
3464 if (!(issued
& CEPH_CAP_FILE_BUFFER
)) {
3465 used
&= ~CEPH_CAP_FILE_BUFFER
;
3466 used
|= CEPH_CAP_FILE_LAZYIO
;
3469 if (!(implemented
& CEPH_CAP_FILE_CACHE
)) {
3470 used
&= ~CEPH_CAP_FILE_CACHE
;
3471 used
|= CEPH_CAP_FILE_LAZYIO
;
3473 if (!(implemented
& CEPH_CAP_FILE_BUFFER
)) {
3474 used
&= ~CEPH_CAP_FILE_BUFFER
;
3475 used
|= CEPH_CAP_FILE_LAZYIO
;
3484 * Examine currently used and wanted versus held caps. Release, flush or ack
3485 * revoked caps to the MDS as appropriate.
3487 * @param in the inode to check
3488 * @param flags flags to apply to cap check
3490 void Client::check_caps(Inode
*in
, unsigned flags
)
3492 unsigned wanted
= in
->caps_wanted();
3493 unsigned used
= get_caps_used(in
);
3497 int issued
= in
->caps_issued(&implemented
);
3498 int revoking
= implemented
& ~issued
;
3500 int orig_used
= used
;
3501 used
= adjust_caps_used_for_lazyio(used
, issued
, implemented
);
3503 int retain
= wanted
| used
| CEPH_CAP_PIN
;
3504 if (!unmounting
&& in
->nlink
> 0) {
3506 retain
|= CEPH_CAP_ANY
;
3507 } else if (in
->is_dir() &&
3508 (issued
& CEPH_CAP_FILE_SHARED
) &&
3509 (in
->flags
& I_COMPLETE
)) {
3510 // we do this here because we don't want to drop to Fs (and then
3511 // drop the Fs if we do a create!) if that alone makes us send lookups
3512 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3513 wanted
= CEPH_CAP_ANY_SHARED
| CEPH_CAP_FILE_EXCL
;
3516 retain
|= CEPH_CAP_ANY_SHARED
;
3517 // keep RD only if we didn't have the file open RW,
3518 // because then the mds would revoke it anyway to
3519 // journal max_size=0.
3520 if (in
->max_size
== 0)
3521 retain
|= CEPH_CAP_ANY_RD
;
3525 ldout(cct
, 10) << __func__
<< " on " << *in
3526 << " wanted " << ccap_string(wanted
)
3527 << " used " << ccap_string(used
)
3528 << " issued " << ccap_string(issued
)
3529 << " revoking " << ccap_string(revoking
)
3530 << " flags=" << flags
3533 if (in
->snapid
!= CEPH_NOSNAP
)
3534 return; //snap caps last forever, can't write
3536 if (in
->caps
.empty())
3537 return; // guard if at end of func
3539 if (!(orig_used
& CEPH_CAP_FILE_BUFFER
) &&
3540 (revoking
& used
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
))) {
3542 used
&= ~(CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
);
3546 for (auto &p
: in
->caps
) {
3547 mds_rank_t mds
= p
.first
;
3548 Cap
&cap
= p
.second
;
3550 MetaSession
*session
= &mds_sessions
.at(mds
);
3553 if (in
->auth_cap
&& &cap
!= in
->auth_cap
)
3554 cap_used
&= ~in
->auth_cap
->issued
;
3556 revoking
= cap
.implemented
& ~cap
.issued
;
3558 ldout(cct
, 10) << " cap mds." << mds
3559 << " issued " << ccap_string(cap
.issued
)
3560 << " implemented " << ccap_string(cap
.implemented
)
3561 << " revoking " << ccap_string(revoking
) << dendl
;
3563 if (in
->wanted_max_size
> in
->max_size
&&
3564 in
->wanted_max_size
> in
->requested_max_size
&&
3565 &cap
== in
->auth_cap
)
3568 /* approaching file_max? */
3569 if ((cap
.issued
& CEPH_CAP_FILE_WR
) &&
3570 &cap
== in
->auth_cap
&&
3571 is_max_size_approaching(in
)) {
3572 ldout(cct
, 10) << "size " << in
->size
<< " approaching max_size " << in
->max_size
3573 << ", reported " << in
->reported_size
<< dendl
;
3577 /* completed revocation? */
3578 if (revoking
&& (revoking
& cap_used
) == 0) {
3579 ldout(cct
, 10) << "completed revocation of " << ccap_string(cap
.implemented
& ~cap
.issued
) << dendl
;
3583 /* want more caps from mds? */
3584 if (wanted
& ~(cap
.wanted
| cap
.issued
))
3587 if (!revoking
&& unmounting
&& (cap_used
== 0))
3590 if ((cap
.issued
& ~retain
) == 0 && // and we don't have anything we wouldn't like
3591 !in
->dirty_caps
) // and we have no dirty caps
3594 if (!(flags
& CHECK_CAPS_NODELAY
)) {
3595 ldout(cct
, 10) << "delaying cap release" << dendl
;
3596 cap_delay_requeue(in
);
3601 if (&cap
== in
->auth_cap
) {
3602 if (in
->flags
& I_KICK_FLUSH
) {
3603 ldout(cct
, 20) << " reflushing caps (check_caps) on " << *in
3604 << " to mds." << mds
<< dendl
;
3605 kick_flushing_caps(in
, session
);
3607 if (!in
->cap_snaps
.empty() &&
3608 in
->cap_snaps
.rbegin()->second
.flush_tid
== 0)
3613 ceph_tid_t flush_tid
;
3614 if (in
->auth_cap
== &cap
&& in
->dirty_caps
) {
3615 flushing
= mark_caps_flushing(in
, &flush_tid
);
3621 int msg_flags
= (flags
& CHECK_CAPS_SYNCHRONOUS
) ? MClientCaps::FLAG_SYNC
: 0;
3622 send_cap(in
, session
, &cap
, msg_flags
, cap_used
, wanted
, retain
,
3623 flushing
, flush_tid
);
3628 void Client::queue_cap_snap(Inode
*in
, SnapContext
& old_snapc
)
3630 int used
= get_caps_used(in
);
3631 int dirty
= in
->caps_dirty();
3632 ldout(cct
, 10) << __func__
<< " " << *in
<< " snapc " << old_snapc
<< " used " << ccap_string(used
) << dendl
;
3634 if (in
->cap_snaps
.size() &&
3635 in
->cap_snaps
.rbegin()->second
.writing
) {
3636 ldout(cct
, 10) << __func__
<< " already have pending cap_snap on " << *in
<< dendl
;
3638 } else if (in
->caps_dirty() ||
3639 (used
& CEPH_CAP_FILE_WR
) ||
3640 (dirty
& CEPH_CAP_ANY_WR
)) {
3641 const auto &capsnapem
= in
->cap_snaps
.emplace(std::piecewise_construct
, std::make_tuple(old_snapc
.seq
), std::make_tuple(in
));
3642 ceph_assert(capsnapem
.second
); /* element inserted */
3643 CapSnap
&capsnap
= capsnapem
.first
->second
;
3644 capsnap
.context
= old_snapc
;
3645 capsnap
.issued
= in
->caps_issued();
3646 capsnap
.dirty
= in
->caps_dirty();
3648 capsnap
.dirty_data
= (used
& CEPH_CAP_FILE_BUFFER
);
3650 capsnap
.uid
= in
->uid
;
3651 capsnap
.gid
= in
->gid
;
3652 capsnap
.mode
= in
->mode
;
3653 capsnap
.btime
= in
->btime
;
3654 capsnap
.xattrs
= in
->xattrs
;
3655 capsnap
.xattr_version
= in
->xattr_version
;
3656 capsnap
.cap_dirtier_uid
= in
->cap_dirtier_uid
;
3657 capsnap
.cap_dirtier_gid
= in
->cap_dirtier_gid
;
3659 if (used
& CEPH_CAP_FILE_WR
) {
3660 ldout(cct
, 10) << __func__
<< " WR used on " << *in
<< dendl
;
3661 capsnap
.writing
= 1;
3663 finish_cap_snap(in
, capsnap
, used
);
3666 ldout(cct
, 10) << __func__
<< " not dirty|writing on " << *in
<< dendl
;
3670 void Client::finish_cap_snap(Inode
*in
, CapSnap
&capsnap
, int used
)
3672 ldout(cct
, 10) << __func__
<< " " << *in
<< " capsnap " << (void *)&capsnap
<< " used " << ccap_string(used
) << dendl
;
3673 capsnap
.size
= in
->size
;
3674 capsnap
.mtime
= in
->mtime
;
3675 capsnap
.atime
= in
->atime
;
3676 capsnap
.ctime
= in
->ctime
;
3677 capsnap
.time_warp_seq
= in
->time_warp_seq
;
3678 capsnap
.change_attr
= in
->change_attr
;
3679 capsnap
.dirty
|= in
->caps_dirty();
3681 /* Only reset it if it wasn't set before */
3682 if (capsnap
.cap_dirtier_uid
== -1) {
3683 capsnap
.cap_dirtier_uid
= in
->cap_dirtier_uid
;
3684 capsnap
.cap_dirtier_gid
= in
->cap_dirtier_gid
;
3687 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
3688 capsnap
.inline_data
= in
->inline_data
;
3689 capsnap
.inline_version
= in
->inline_version
;
3692 if (used
& CEPH_CAP_FILE_BUFFER
) {
3693 ldout(cct
, 10) << __func__
<< " " << *in
<< " cap_snap " << &capsnap
<< " used " << used
3694 << " WRBUFFER, delaying" << dendl
;
3696 capsnap
.dirty_data
= 0;
3701 void Client::_flushed_cap_snap(Inode
*in
, snapid_t seq
)
3703 ldout(cct
, 10) << __func__
<< " seq " << seq
<< " on " << *in
<< dendl
;
3704 in
->cap_snaps
.at(seq
).dirty_data
= 0;
3708 void Client::send_flush_snap(Inode
*in
, MetaSession
*session
,
3709 snapid_t follows
, CapSnap
& capsnap
)
3711 auto m
= make_message
<MClientCaps
>(CEPH_CAP_OP_FLUSHSNAP
,
3712 in
->ino
, in
->snaprealm
->ino
, 0,
3713 in
->auth_cap
->mseq
, cap_epoch_barrier
);
3714 m
->caller_uid
= capsnap
.cap_dirtier_uid
;
3715 m
->caller_gid
= capsnap
.cap_dirtier_gid
;
3717 m
->set_client_tid(capsnap
.flush_tid
);
3718 m
->head
.snap_follows
= follows
;
3720 m
->head
.caps
= capsnap
.issued
;
3721 m
->head
.dirty
= capsnap
.dirty
;
3723 m
->head
.uid
= capsnap
.uid
;
3724 m
->head
.gid
= capsnap
.gid
;
3725 m
->head
.mode
= capsnap
.mode
;
3726 m
->btime
= capsnap
.btime
;
3728 m
->size
= capsnap
.size
;
3730 m
->head
.xattr_version
= capsnap
.xattr_version
;
3731 encode(capsnap
.xattrs
, m
->xattrbl
);
3733 m
->ctime
= capsnap
.ctime
;
3734 m
->btime
= capsnap
.btime
;
3735 m
->mtime
= capsnap
.mtime
;
3736 m
->atime
= capsnap
.atime
;
3737 m
->time_warp_seq
= capsnap
.time_warp_seq
;
3738 m
->change_attr
= capsnap
.change_attr
;
3740 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
3741 m
->inline_version
= in
->inline_version
;
3742 m
->inline_data
= in
->inline_data
;
3745 ceph_assert(!session
->flushing_caps_tids
.empty());
3746 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
3748 session
->con
->send_message2(std::move(m
));
3751 void Client::flush_snaps(Inode
*in
)
3753 ldout(cct
, 10) << "flush_snaps on " << *in
<< dendl
;
3754 ceph_assert(in
->cap_snaps
.size());
3757 ceph_assert(in
->auth_cap
);
3758 MetaSession
*session
= in
->auth_cap
->session
;
3760 for (auto &p
: in
->cap_snaps
) {
3761 CapSnap
&capsnap
= p
.second
;
3762 // only do new flush
3763 if (capsnap
.flush_tid
> 0)
3766 ldout(cct
, 10) << "flush_snaps mds." << session
->mds_num
3767 << " follows " << p
.first
3768 << " size " << capsnap
.size
3769 << " mtime " << capsnap
.mtime
3770 << " dirty_data=" << capsnap
.dirty_data
3771 << " writing=" << capsnap
.writing
3772 << " on " << *in
<< dendl
;
3773 if (capsnap
.dirty_data
|| capsnap
.writing
)
3776 capsnap
.flush_tid
= ++last_flush_tid
;
3777 session
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
3778 in
->flushing_cap_tids
[capsnap
.flush_tid
] = 0;
3779 if (!in
->flushing_cap_item
.is_on_list())
3780 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
3782 send_flush_snap(in
, session
, p
.first
, capsnap
);
3786 void Client::wait_on_list(list
<ceph::condition_variable
*>& ls
)
3788 ceph::condition_variable cond
;
3789 ls
.push_back(&cond
);
3790 std::unique_lock l
{client_lock
, std::adopt_lock
};
3796 void Client::signal_cond_list(list
<ceph::condition_variable
*>& ls
)
3798 for (auto cond
: ls
) {
3803 void Client::wait_on_context_list(list
<Context
*>& ls
)
3805 ceph::condition_variable cond
;
3808 ls
.push_back(new C_Cond(cond
, &done
, &r
));
3809 std::unique_lock l
{client_lock
, std::adopt_lock
};
3810 cond
.wait(l
, [&done
] { return done
;});
3814 void Client::signal_context_list(list
<Context
*>& ls
)
3816 while (!ls
.empty()) {
3817 ls
.front()->complete(0);
3822 void Client::wake_up_session_caps(MetaSession
*s
, bool reconnect
)
3824 for (const auto &cap
: s
->caps
) {
3825 auto &in
= cap
->inode
;
3827 in
.requested_max_size
= 0;
3828 in
.wanted_max_size
= 0;
3830 if (cap
->gen
< s
->cap_gen
) {
3831 // mds did not re-issue stale cap.
3832 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
3833 // make sure mds knows what we want.
3834 if (in
.caps_file_wanted() & ~cap
->wanted
)
3835 in
.flags
|= I_CAP_DROPPED
;
3838 signal_cond_list(in
.waitfor_caps
);
3843 // flush dirty data (from objectcache)
3845 class C_Client_CacheInvalidate
: public Context
{
3849 int64_t offset
, length
;
3851 C_Client_CacheInvalidate(Client
*c
, Inode
*in
, int64_t off
, int64_t len
) :
3852 client(c
), offset(off
), length(len
) {
3853 if (client
->use_faked_inos())
3854 ino
= vinodeno_t(in
->faked_ino
, CEPH_NOSNAP
);
3858 void finish(int r
) override
{
3859 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
3860 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
3861 client
->_async_invalidate(ino
, offset
, length
);
3865 void Client::_async_invalidate(vinodeno_t ino
, int64_t off
, int64_t len
)
3869 ldout(cct
, 10) << __func__
<< " " << ino
<< " " << off
<< "~" << len
<< dendl
;
3870 ino_invalidate_cb(callback_handle
, ino
, off
, len
);
3873 void Client::_schedule_invalidate_callback(Inode
*in
, int64_t off
, int64_t len
) {
3875 if (ino_invalidate_cb
)
3876 // we queue the invalidate, which calls the callback and decrements the ref
3877 async_ino_invalidator
.queue(new C_Client_CacheInvalidate(this, in
, off
, len
));
3880 void Client::_invalidate_inode_cache(Inode
*in
)
3882 ldout(cct
, 10) << __func__
<< " " << *in
<< dendl
;
3884 // invalidate our userspace inode cache
3885 if (cct
->_conf
->client_oc
) {
3886 objectcacher
->release_set(&in
->oset
);
3887 if (!objectcacher
->set_is_empty(&in
->oset
))
3888 lderr(cct
) << "failed to invalidate cache for " << *in
<< dendl
;
3891 _schedule_invalidate_callback(in
, 0, 0);
3894 void Client::_invalidate_inode_cache(Inode
*in
, int64_t off
, int64_t len
)
3896 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
3898 // invalidate our userspace inode cache
3899 if (cct
->_conf
->client_oc
) {
3900 vector
<ObjectExtent
> ls
;
3901 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, len
, in
->truncate_size
, ls
);
3902 objectcacher
->discard_writeback(&in
->oset
, ls
, nullptr);
3905 _schedule_invalidate_callback(in
, off
, len
);
3908 bool Client::_release(Inode
*in
)
3910 ldout(cct
, 20) << "_release " << *in
<< dendl
;
3911 if (in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
3912 _invalidate_inode_cache(in
);
3918 bool Client::_flush(Inode
*in
, Context
*onfinish
)
3920 ldout(cct
, 10) << "_flush " << *in
<< dendl
;
3922 if (!in
->oset
.dirty_or_tx
) {
3923 ldout(cct
, 10) << " nothing to flush" << dendl
;
3924 onfinish
->complete(0);
3928 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
3929 ldout(cct
, 8) << __func__
<< ": FULL, purging for ENOSPC" << dendl
;
3930 objectcacher
->purge_set(&in
->oset
);
3932 onfinish
->complete(-ENOSPC
);
3937 return objectcacher
->flush_set(&in
->oset
, onfinish
);
3940 void Client::_flush_range(Inode
*in
, int64_t offset
, uint64_t size
)
3942 ceph_assert(ceph_mutex_is_locked(client_lock
));
3943 if (!in
->oset
.dirty_or_tx
) {
3944 ldout(cct
, 10) << " nothing to flush" << dendl
;
3948 C_SaferCond
onflush("Client::_flush_range flock");
3949 bool ret
= objectcacher
->file_flush(&in
->oset
, &in
->layout
, in
->snaprealm
->get_snap_context(),
3950 offset
, size
, &onflush
);
3953 client_lock
.unlock();
3959 void Client::flush_set_callback(ObjectCacher::ObjectSet
*oset
)
3961 // std::lock_guard l(client_lock);
3962 ceph_assert(ceph_mutex_is_locked(client_lock
)); // will be called via dispatch() -> objecter -> ...
3963 Inode
*in
= static_cast<Inode
*>(oset
->parent
);
3968 void Client::_flushed(Inode
*in
)
3970 ldout(cct
, 10) << "_flushed " << *in
<< dendl
;
3972 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
3977 // checks common to add_update_cap, handle_cap_grant
3978 void Client::check_cap_issue(Inode
*in
, unsigned issued
)
3980 unsigned had
= in
->caps_issued();
3982 if ((issued
& CEPH_CAP_FILE_CACHE
) &&
3983 !(had
& CEPH_CAP_FILE_CACHE
))
3986 if ((issued
& CEPH_CAP_FILE_SHARED
) &&
3987 !(had
& CEPH_CAP_FILE_SHARED
)) {
3991 clear_dir_complete_and_ordered(in
, true);
3995 void Client::add_update_cap(Inode
*in
, MetaSession
*mds_session
, uint64_t cap_id
,
3996 unsigned issued
, unsigned wanted
, unsigned seq
, unsigned mseq
,
3997 inodeno_t realm
, int flags
, const UserPerm
& cap_perms
)
3999 if (!in
->is_any_caps()) {
4000 ceph_assert(in
->snaprealm
== 0);
4001 in
->snaprealm
= get_snap_realm(realm
);
4002 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4003 ldout(cct
, 15) << __func__
<< " first one, opened snaprealm " << in
->snaprealm
<< dendl
;
4005 ceph_assert(in
->snaprealm
);
4006 if ((flags
& CEPH_CAP_FLAG_AUTH
) &&
4007 realm
!= inodeno_t(-1) && in
->snaprealm
->ino
!= realm
) {
4008 in
->snaprealm_item
.remove_myself();
4009 auto oldrealm
= in
->snaprealm
;
4010 in
->snaprealm
= get_snap_realm(realm
);
4011 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4012 put_snap_realm(oldrealm
);
4016 mds_rank_t mds
= mds_session
->mds_num
;
4017 const auto &capem
= in
->caps
.emplace(std::piecewise_construct
, std::forward_as_tuple(mds
), std::forward_as_tuple(*in
, mds_session
));
4018 Cap
&cap
= capem
.first
->second
;
4019 if (!capem
.second
) {
4020 if (cap
.gen
< mds_session
->cap_gen
)
4021 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
4024 * auth mds of the inode changed. we received the cap export
4025 * message, but still haven't received the cap import message.
4026 * handle_cap_export() updated the new auth MDS' cap.
4028 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4029 * a message that was send before the cap import message. So
4030 * don't remove caps.
4032 if (ceph_seq_cmp(seq
, cap
.seq
) <= 0) {
4033 if (&cap
!= in
->auth_cap
)
4034 ldout(cct
, 0) << "WARNING: " << "inode " << *in
<< " caps on mds." << mds
<< " != auth_cap." << dendl
;
4036 ceph_assert(cap
.cap_id
== cap_id
);
4039 issued
|= cap
.issued
;
4040 flags
|= CEPH_CAP_FLAG_AUTH
;
4044 check_cap_issue(in
, issued
);
4046 if (flags
& CEPH_CAP_FLAG_AUTH
) {
4047 if (in
->auth_cap
!= &cap
&&
4048 (!in
->auth_cap
|| ceph_seq_cmp(in
->auth_cap
->mseq
, mseq
) < 0)) {
4049 if (in
->auth_cap
&& in
->flushing_cap_item
.is_on_list()) {
4050 ldout(cct
, 10) << __func__
<< " changing auth cap: "
4051 << "add myself to new auth MDS' flushing caps list" << dendl
;
4052 adjust_session_flushing_caps(in
, in
->auth_cap
->session
, mds_session
);
4054 in
->auth_cap
= &cap
;
4058 unsigned old_caps
= cap
.issued
;
4059 cap
.cap_id
= cap_id
;
4060 cap
.issued
= issued
;
4061 cap
.implemented
|= issued
;
4062 if (ceph_seq_cmp(mseq
, cap
.mseq
) > 0)
4063 cap
.wanted
= wanted
;
4065 cap
.wanted
|= wanted
;
4067 cap
.issue_seq
= seq
;
4069 cap
.gen
= mds_session
->cap_gen
;
4070 cap
.latest_perms
= cap_perms
;
4071 ldout(cct
, 10) << __func__
<< " issued " << ccap_string(old_caps
) << " -> " << ccap_string(cap
.issued
)
4072 << " from mds." << mds
4076 if ((issued
& ~old_caps
) && in
->auth_cap
== &cap
) {
4077 // non-auth MDS is revoking the newly grant caps ?
4078 for (auto &p
: in
->caps
) {
4079 if (&p
.second
== &cap
)
4081 if (p
.second
.implemented
& ~p
.second
.issued
& issued
) {
4082 check_caps(in
, CHECK_CAPS_NODELAY
);
4088 if (issued
& ~old_caps
)
4089 signal_cond_list(in
->waitfor_caps
);
4092 void Client::remove_cap(Cap
*cap
, bool queue_release
)
4094 auto &in
= cap
->inode
;
4095 MetaSession
*session
= cap
->session
;
4096 mds_rank_t mds
= cap
->session
->mds_num
;
4098 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " on " << in
<< dendl
;
4100 if (queue_release
) {
4101 session
->enqueue_cap_release(
4109 if (in
.auth_cap
== cap
) {
4110 if (in
.flushing_cap_item
.is_on_list()) {
4111 ldout(cct
, 10) << " removing myself from flushing_cap list" << dendl
;
4112 in
.flushing_cap_item
.remove_myself();
4116 size_t n
= in
.caps
.erase(mds
);
4117 ceph_assert(n
== 1);
4120 if (!in
.is_any_caps()) {
4121 ldout(cct
, 15) << __func__
<< " last one, closing snaprealm " << in
.snaprealm
<< dendl
;
4122 in
.snaprealm_item
.remove_myself();
4123 put_snap_realm(in
.snaprealm
);
4128 void Client::remove_all_caps(Inode
*in
)
4130 while (!in
->caps
.empty())
4131 remove_cap(&in
->caps
.begin()->second
, true);
4134 void Client::remove_session_caps(MetaSession
*s
)
4136 ldout(cct
, 10) << __func__
<< " mds." << s
->mds_num
<< dendl
;
4138 while (s
->caps
.size()) {
4139 Cap
*cap
= *s
->caps
.begin();
4140 InodeRef
in(&cap
->inode
);
4141 bool dirty_caps
= false;
4142 if (in
->auth_cap
== cap
) {
4143 dirty_caps
= in
->dirty_caps
| in
->flushing_caps
;
4144 in
->wanted_max_size
= 0;
4145 in
->requested_max_size
= 0;
4147 if (cap
->wanted
| cap
->issued
)
4148 in
->flags
|= I_CAP_DROPPED
;
4149 remove_cap(cap
, false);
4150 in
->cap_snaps
.clear();
4152 lderr(cct
) << __func__
<< " still has dirty|flushing caps on " << *in
<< dendl
;
4153 if (in
->flushing_caps
) {
4154 num_flushing_caps
--;
4155 in
->flushing_cap_tids
.clear();
4157 in
->flushing_caps
= 0;
4158 in
->mark_caps_clean();
4159 put_inode(in
.get());
4161 signal_cond_list(in
->waitfor_caps
);
4163 s
->flushing_caps_tids
.clear();
4164 sync_cond
.notify_all();
4167 int Client::_do_remount(bool retry_on_error
)
4169 uint64_t max_retries
= g_conf().get_val
<uint64_t>("mds_max_retries_on_remount_failure");
4172 int r
= remount_cb(callback_handle
);
4174 retries_on_invalidate
= 0;
4177 client_t whoami
= get_nodeid();
4180 "failed to remount (to trim kernel dentries): "
4181 "errno = " << e
<< " (" << strerror(e
) << ")" << dendl
;
4184 "failed to remount (to trim kernel dentries): "
4185 "return code = " << r
<< dendl
;
4188 (cct
->_conf
.get_val
<bool>("client_die_on_failed_remount") ||
4189 cct
->_conf
.get_val
<bool>("client_die_on_failed_dentry_invalidate")) &&
4190 !(retry_on_error
&& (++retries_on_invalidate
< max_retries
));
4191 if (should_abort
&& !unmounting
) {
4192 lderr(cct
) << "failed to remount for kernel dentry trimming; quitting!" << dendl
;
4199 class C_Client_Remount
: public Context
{
4203 explicit C_Client_Remount(Client
*c
) : client(c
) {}
4204 void finish(int r
) override
{
4205 ceph_assert(r
== 0);
4206 client
->_do_remount(true);
4210 void Client::_invalidate_kernel_dcache()
4214 if (can_invalidate_dentries
) {
4215 if (dentry_invalidate_cb
&& root
->dir
) {
4216 for (ceph::unordered_map
<string
, Dentry
*>::iterator p
= root
->dir
->dentries
.begin();
4217 p
!= root
->dir
->dentries
.end();
4219 if (p
->second
->inode
)
4220 _schedule_invalidate_dentry_callback(p
->second
, false);
4223 } else if (remount_cb
) {
4225 // when remounting a file system, linux kernel trims all unused dentries in the fs
4226 remount_finisher
.queue(new C_Client_Remount(this));
4230 void Client::_trim_negative_child_dentries(InodeRef
& in
)
4236 if (dir
&& dir
->dentries
.size() == dir
->num_null_dentries
) {
4237 for (auto p
= dir
->dentries
.begin(); p
!= dir
->dentries
.end(); ) {
4238 Dentry
*dn
= p
->second
;
4240 ceph_assert(!dn
->inode
);
4241 if (dn
->lru_is_expireable())
4242 unlink(dn
, true, false); // keep dir, drop dentry
4244 if (dir
->dentries
.empty()) {
4249 if (in
->flags
& I_SNAPDIR_OPEN
) {
4250 InodeRef snapdir
= open_snapdir(in
.get());
4251 _trim_negative_child_dentries(snapdir
);
4255 void Client::trim_caps(MetaSession
*s
, uint64_t max
)
4257 mds_rank_t mds
= s
->mds_num
;
4258 size_t caps_size
= s
->caps
.size();
4259 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " max " << max
4260 << " caps " << caps_size
<< dendl
;
4262 uint64_t trimmed
= 0;
4263 auto p
= s
->caps
.begin();
4264 std::set
<Dentry
*> to_trim
; /* this avoids caps other than the one we're
4265 * looking at from getting deleted during traversal. */
4266 while ((caps_size
- trimmed
) > max
&& !p
.end()) {
4268 InodeRef
in(&cap
->inode
);
4270 // Increment p early because it will be invalidated if cap
4271 // is deleted inside remove_cap
4274 if (in
->caps
.size() > 1 && cap
!= in
->auth_cap
) {
4275 int mine
= cap
->issued
| cap
->implemented
;
4276 int oissued
= in
->auth_cap
? in
->auth_cap
->issued
: 0;
4277 // disposable non-auth cap
4278 if (!(get_caps_used(in
.get()) & ~oissued
& mine
)) {
4279 ldout(cct
, 20) << " removing unused, unneeded non-auth cap on " << *in
<< dendl
;
4280 cap
= (remove_cap(cap
, true), nullptr);
4284 ldout(cct
, 20) << " trying to trim dentries for " << *in
<< dendl
;
4285 _trim_negative_child_dentries(in
);
4287 auto q
= in
->dentries
.begin();
4288 while (q
!= in
->dentries
.end()) {
4291 if (dn
->lru_is_expireable()) {
4292 if (can_invalidate_dentries
&&
4293 dn
->dir
->parent_inode
->ino
== MDS_INO_ROOT
) {
4294 // Only issue one of these per DN for inodes in root: handle
4295 // others more efficiently by calling for root-child DNs at
4296 // the end of this function.
4297 _schedule_invalidate_dentry_callback(dn
, true);
4299 ldout(cct
, 20) << " queueing dentry for trimming: " << dn
->name
<< dendl
;
4302 ldout(cct
, 20) << " not expirable: " << dn
->name
<< dendl
;
4306 if (all
&& in
->ino
!= MDS_INO_ROOT
) {
4307 ldout(cct
, 20) << __func__
<< " counting as trimmed: " << *in
<< dendl
;
4312 ldout(cct
, 20) << " trimming queued dentries: " << dendl
;
4313 for (const auto &dn
: to_trim
) {
4318 caps_size
= s
->caps
.size();
4319 if (caps_size
> (size_t)max
)
4320 _invalidate_kernel_dcache();
4323 void Client::force_session_readonly(MetaSession
*s
)
4326 for (xlist
<Cap
*>::iterator p
= s
->caps
.begin(); !p
.end(); ++p
) {
4327 auto &in
= (*p
)->inode
;
4328 if (in
.caps_wanted() & CEPH_CAP_FILE_WR
)
4329 signal_cond_list(in
.waitfor_caps
);
4333 int Client::mark_caps_flushing(Inode
*in
, ceph_tid_t
* ptid
)
4335 MetaSession
*session
= in
->auth_cap
->session
;
4337 int flushing
= in
->dirty_caps
;
4338 ceph_assert(flushing
);
4340 ceph_tid_t flush_tid
= ++last_flush_tid
;
4341 in
->flushing_cap_tids
[flush_tid
] = flushing
;
4343 if (!in
->flushing_caps
) {
4344 ldout(cct
, 10) << __func__
<< " " << ccap_string(flushing
) << " " << *in
<< dendl
;
4345 num_flushing_caps
++;
4347 ldout(cct
, 10) << __func__
<< " (more) " << ccap_string(flushing
) << " " << *in
<< dendl
;
4350 in
->flushing_caps
|= flushing
;
4351 in
->mark_caps_clean();
4353 if (!in
->flushing_cap_item
.is_on_list())
4354 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4355 session
->flushing_caps_tids
.insert(flush_tid
);
4361 void Client::adjust_session_flushing_caps(Inode
*in
, MetaSession
*old_s
, MetaSession
*new_s
)
4363 for (auto &p
: in
->cap_snaps
) {
4364 CapSnap
&capsnap
= p
.second
;
4365 if (capsnap
.flush_tid
> 0) {
4366 old_s
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
4367 new_s
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
4370 for (map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4371 it
!= in
->flushing_cap_tids
.end();
4373 old_s
->flushing_caps_tids
.erase(it
->first
);
4374 new_s
->flushing_caps_tids
.insert(it
->first
);
4376 new_s
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4380 * Flush all caps back to the MDS. Because the callers generally wait on the
4381 * result of this function (syncfs and umount cases), we set
4382 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4384 void Client::flush_caps_sync()
4386 ldout(cct
, 10) << __func__
<< dendl
;
4387 xlist
<Inode
*>::iterator p
= delayed_list
.begin();
4389 unsigned flags
= CHECK_CAPS_NODELAY
;
4393 delayed_list
.pop_front();
4394 if (p
.end() && dirty_list
.empty())
4395 flags
|= CHECK_CAPS_SYNCHRONOUS
;
4396 check_caps(in
, flags
);
4400 p
= dirty_list
.begin();
4402 unsigned flags
= CHECK_CAPS_NODELAY
;
4407 flags
|= CHECK_CAPS_SYNCHRONOUS
;
4408 check_caps(in
, flags
);
4412 void Client::wait_sync_caps(Inode
*in
, ceph_tid_t want
)
4414 while (in
->flushing_caps
) {
4415 map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4416 ceph_assert(it
!= in
->flushing_cap_tids
.end());
4417 if (it
->first
> want
)
4419 ldout(cct
, 10) << __func__
<< " on " << *in
<< " flushing "
4420 << ccap_string(it
->second
) << " want " << want
4421 << " last " << it
->first
<< dendl
;
4422 wait_on_list(in
->waitfor_caps
);
4426 void Client::wait_sync_caps(ceph_tid_t want
)
4429 ldout(cct
, 10) << __func__
<< " want " << want
<< " (last is " << last_flush_tid
<< ", "
4430 << num_flushing_caps
<< " total flushing)" << dendl
;
4431 for (auto &p
: mds_sessions
) {
4432 MetaSession
*s
= &p
.second
;
4433 if (s
->flushing_caps_tids
.empty())
4435 ceph_tid_t oldest_tid
= *s
->flushing_caps_tids
.begin();
4436 if (oldest_tid
<= want
) {
4437 ldout(cct
, 10) << " waiting on mds." << p
.first
<< " tid " << oldest_tid
4438 << " (want " << want
<< ")" << dendl
;
4439 std::unique_lock l
{client_lock
, std::adopt_lock
};
4447 void Client::kick_flushing_caps(Inode
*in
, MetaSession
*session
)
4449 in
->flags
&= ~I_KICK_FLUSH
;
4451 Cap
*cap
= in
->auth_cap
;
4452 ceph_assert(cap
->session
== session
);
4454 ceph_tid_t last_snap_flush
= 0;
4455 for (auto p
= in
->flushing_cap_tids
.rbegin();
4456 p
!= in
->flushing_cap_tids
.rend();
4459 last_snap_flush
= p
->first
;
4464 int wanted
= in
->caps_wanted();
4465 int used
= get_caps_used(in
) | in
->caps_dirty();
4466 auto it
= in
->cap_snaps
.begin();
4467 for (auto& p
: in
->flushing_cap_tids
) {
4469 int msg_flags
= p
.first
< last_snap_flush
? MClientCaps::FLAG_PENDING_CAPSNAP
: 0;
4470 send_cap(in
, session
, cap
, msg_flags
, used
, wanted
, (cap
->issued
| cap
->implemented
),
4473 ceph_assert(it
!= in
->cap_snaps
.end());
4474 ceph_assert(it
->second
.flush_tid
== p
.first
);
4475 send_flush_snap(in
, session
, it
->first
, it
->second
);
4481 void Client::kick_flushing_caps(MetaSession
*session
)
4483 mds_rank_t mds
= session
->mds_num
;
4484 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
4486 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4488 if (in
->flags
& I_KICK_FLUSH
) {
4489 ldout(cct
, 20) << " reflushing caps on " << *in
<< " to mds." << mds
<< dendl
;
4490 kick_flushing_caps(in
, session
);
4495 void Client::early_kick_flushing_caps(MetaSession
*session
)
4497 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4499 Cap
*cap
= in
->auth_cap
;
4502 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4503 // stage. This guarantees that MDS processes the cap flush message before issuing
4504 // the flushing caps to other client.
4505 if ((in
->flushing_caps
& in
->auth_cap
->issued
) == in
->flushing_caps
) {
4506 in
->flags
|= I_KICK_FLUSH
;
4510 ldout(cct
, 20) << " reflushing caps (early_kick) on " << *in
4511 << " to mds." << session
->mds_num
<< dendl
;
4512 // send_reconnect() also will reset these sequence numbers. make sure
4513 // sequence numbers in cap flush message match later reconnect message.
4517 cap
->issued
= cap
->implemented
;
4519 kick_flushing_caps(in
, session
);
4523 void SnapRealm::build_snap_context()
4525 set
<snapid_t
> snaps
;
4526 snapid_t max_seq
= seq
;
4528 // start with prior_parents?
4529 for (unsigned i
=0; i
<prior_parent_snaps
.size(); i
++)
4530 snaps
.insert(prior_parent_snaps
[i
]);
4532 // current parent's snaps
4534 const SnapContext
& psnapc
= pparent
->get_snap_context();
4535 for (unsigned i
=0; i
<psnapc
.snaps
.size(); i
++)
4536 if (psnapc
.snaps
[i
] >= parent_since
)
4537 snaps
.insert(psnapc
.snaps
[i
]);
4538 if (psnapc
.seq
> max_seq
)
4539 max_seq
= psnapc
.seq
;
4543 for (unsigned i
=0; i
<my_snaps
.size(); i
++)
4544 snaps
.insert(my_snaps
[i
]);
4547 cached_snap_context
.seq
= max_seq
;
4548 cached_snap_context
.snaps
.resize(0);
4549 cached_snap_context
.snaps
.reserve(snaps
.size());
4550 for (set
<snapid_t
>::reverse_iterator p
= snaps
.rbegin(); p
!= snaps
.rend(); ++p
)
4551 cached_snap_context
.snaps
.push_back(*p
);
4554 void Client::invalidate_snaprealm_and_children(SnapRealm
*realm
)
4559 while (!q
.empty()) {
4563 ldout(cct
, 10) << __func__
<< " " << *realm
<< dendl
;
4564 realm
->invalidate_cache();
4566 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
4567 p
!= realm
->pchildren
.end();
4573 SnapRealm
*Client::get_snap_realm(inodeno_t r
)
4575 SnapRealm
*realm
= snap_realms
[r
];
4577 snap_realms
[r
] = realm
= new SnapRealm(r
);
4578 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
4583 SnapRealm
*Client::get_snap_realm_maybe(inodeno_t r
)
4585 if (snap_realms
.count(r
) == 0) {
4586 ldout(cct
, 20) << __func__
<< " " << r
<< " fail" << dendl
;
4589 SnapRealm
*realm
= snap_realms
[r
];
4590 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
4595 void Client::put_snap_realm(SnapRealm
*realm
)
4597 ldout(cct
, 20) << __func__
<< " " << realm
->ino
<< " " << realm
4598 << " " << realm
->nref
<< " -> " << (realm
->nref
- 1) << dendl
;
4599 if (--realm
->nref
== 0) {
4600 snap_realms
.erase(realm
->ino
);
4601 if (realm
->pparent
) {
4602 realm
->pparent
->pchildren
.erase(realm
);
4603 put_snap_realm(realm
->pparent
);
4609 bool Client::adjust_realm_parent(SnapRealm
*realm
, inodeno_t parent
)
4611 if (realm
->parent
!= parent
) {
4612 ldout(cct
, 10) << __func__
<< " " << *realm
4613 << " " << realm
->parent
<< " -> " << parent
<< dendl
;
4614 realm
->parent
= parent
;
4615 if (realm
->pparent
) {
4616 realm
->pparent
->pchildren
.erase(realm
);
4617 put_snap_realm(realm
->pparent
);
4619 realm
->pparent
= get_snap_realm(parent
);
4620 realm
->pparent
->pchildren
.insert(realm
);
4626 static bool has_new_snaps(const SnapContext
& old_snapc
,
4627 const SnapContext
& new_snapc
)
4629 return !new_snapc
.snaps
.empty() && new_snapc
.snaps
[0] > old_snapc
.seq
;
4633 void Client::update_snap_trace(const bufferlist
& bl
, SnapRealm
**realm_ret
, bool flush
)
4635 SnapRealm
*first_realm
= NULL
;
4636 ldout(cct
, 10) << __func__
<< " len " << bl
.length() << dendl
;
4638 map
<SnapRealm
*, SnapContext
> dirty_realms
;
4640 auto p
= bl
.cbegin();
4644 SnapRealm
*realm
= get_snap_realm(info
.ino());
4646 bool invalidate
= false;
4648 if (info
.seq() > realm
->seq
) {
4649 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq() << " > " << realm
->seq
4653 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4654 // flush me + children
4657 while (!q
.empty()) {
4658 SnapRealm
*realm
= q
.front();
4661 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
4662 p
!= realm
->pchildren
.end();
4666 if (dirty_realms
.count(realm
) == 0) {
4668 dirty_realms
[realm
] = realm
->get_snap_context();
4674 realm
->seq
= info
.seq();
4675 realm
->created
= info
.created();
4676 realm
->parent_since
= info
.parent_since();
4677 realm
->prior_parent_snaps
= info
.prior_parent_snaps
;
4678 realm
->my_snaps
= info
.my_snaps
;
4682 // _always_ verify parent
4683 if (adjust_realm_parent(realm
, info
.parent()))
4687 invalidate_snaprealm_and_children(realm
);
4688 ldout(cct
, 15) << __func__
<< " " << *realm
<< " self|parent updated" << dendl
;
4689 ldout(cct
, 15) << " snapc " << realm
->get_snap_context() << dendl
;
4691 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq()
4692 << " <= " << realm
->seq
<< " and same parent, SKIPPING" << dendl
;
4696 first_realm
= realm
;
4698 put_snap_realm(realm
);
4701 for (map
<SnapRealm
*, SnapContext
>::iterator q
= dirty_realms
.begin();
4702 q
!= dirty_realms
.end();
4704 SnapRealm
*realm
= q
->first
;
4705 // if there are new snaps ?
4706 if (has_new_snaps(q
->second
, realm
->get_snap_context())) {
4707 ldout(cct
, 10) << " flushing caps on " << *realm
<< dendl
;
4708 xlist
<Inode
*>::iterator r
= realm
->inodes_with_caps
.begin();
4712 queue_cap_snap(in
, q
->second
);
4715 ldout(cct
, 10) << " no new snap on " << *realm
<< dendl
;
4717 put_snap_realm(realm
);
4721 *realm_ret
= first_realm
;
4723 put_snap_realm(first_realm
);
4726 void Client::handle_snap(const MConstRef
<MClientSnap
>& m
)
4728 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
4729 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4730 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4735 got_mds_push(session
);
4737 map
<Inode
*, SnapContext
> to_move
;
4738 SnapRealm
*realm
= 0;
4740 if (m
->head
.op
== CEPH_SNAP_OP_SPLIT
) {
4741 ceph_assert(m
->head
.split
);
4743 auto p
= m
->bl
.cbegin();
4745 ceph_assert(info
.ino() == m
->head
.split
);
4747 // flush, then move, ino's.
4748 realm
= get_snap_realm(info
.ino());
4749 ldout(cct
, 10) << " splitting off " << *realm
<< dendl
;
4750 for (auto& ino
: m
->split_inos
) {
4751 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
4752 if (inode_map
.count(vino
)) {
4753 Inode
*in
= inode_map
[vino
];
4754 if (!in
->snaprealm
|| in
->snaprealm
== realm
)
4756 if (in
->snaprealm
->created
> info
.created()) {
4757 ldout(cct
, 10) << " NOT moving " << *in
<< " from _newer_ realm "
4758 << *in
->snaprealm
<< dendl
;
4761 ldout(cct
, 10) << " moving " << *in
<< " from " << *in
->snaprealm
<< dendl
;
4764 in
->snaprealm_item
.remove_myself();
4765 to_move
[in
] = in
->snaprealm
->get_snap_context();
4766 put_snap_realm(in
->snaprealm
);
4770 // move child snaprealms, too
4771 for (auto& child_realm
: m
->split_realms
) {
4772 ldout(cct
, 10) << "adjusting snaprealm " << child_realm
<< " parent" << dendl
;
4773 SnapRealm
*child
= get_snap_realm_maybe(child_realm
);
4776 adjust_realm_parent(child
, realm
->ino
);
4777 put_snap_realm(child
);
4781 update_snap_trace(m
->bl
, NULL
, m
->head
.op
!= CEPH_SNAP_OP_DESTROY
);
4784 for (auto p
= to_move
.begin(); p
!= to_move
.end(); ++p
) {
4785 Inode
*in
= p
->first
;
4786 in
->snaprealm
= realm
;
4787 realm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4789 // queue for snap writeback
4790 if (has_new_snaps(p
->second
, realm
->get_snap_context()))
4791 queue_cap_snap(in
, p
->second
);
4793 put_snap_realm(realm
);
4797 void Client::handle_quota(const MConstRef
<MClientQuota
>& m
)
4799 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4800 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4805 got_mds_push(session
);
4807 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << mds
<< dendl
;
4809 vinodeno_t
vino(m
->ino
, CEPH_NOSNAP
);
4810 if (inode_map
.count(vino
)) {
4812 in
= inode_map
[vino
];
4815 in
->quota
= m
->quota
;
4816 in
->rstat
= m
->rstat
;
4821 void Client::handle_caps(const MConstRef
<MClientCaps
>& m
)
4823 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4824 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4829 if (m
->osd_epoch_barrier
&& !objecter
->have_map(m
->osd_epoch_barrier
)) {
4830 // Pause RADOS operations until we see the required epoch
4831 objecter
->set_epoch_barrier(m
->osd_epoch_barrier
);
4834 if (m
->osd_epoch_barrier
> cap_epoch_barrier
) {
4835 // Record the barrier so that we will transmit it to MDS when releasing
4836 set_cap_epoch_barrier(m
->osd_epoch_barrier
);
4839 got_mds_push(session
);
4842 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
4843 if (auto it
= inode_map
.find(vino
); it
!= inode_map
.end()) {
4846 if (m
->get_op() == CEPH_CAP_OP_IMPORT
) {
4847 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< " on IMPORT, immediately releasing" << dendl
;
4848 session
->enqueue_cap_release(
4855 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< ", dropping" << dendl
;
4858 // in case the mds is waiting on e.g. a revocation
4859 flush_cap_releases();
4863 switch (m
->get_op()) {
4864 case CEPH_CAP_OP_EXPORT
: return handle_cap_export(session
, in
, m
);
4865 case CEPH_CAP_OP_FLUSHSNAP_ACK
: return handle_cap_flushsnap_ack(session
, in
, m
);
4866 case CEPH_CAP_OP_IMPORT
: /* no return */ handle_cap_import(session
, in
, m
);
4869 if (auto it
= in
->caps
.find(mds
); it
!= in
->caps
.end()) {
4870 Cap
&cap
= in
->caps
.at(mds
);
4872 switch (m
->get_op()) {
4873 case CEPH_CAP_OP_TRUNC
: return handle_cap_trunc(session
, in
, m
);
4874 case CEPH_CAP_OP_IMPORT
:
4875 case CEPH_CAP_OP_REVOKE
:
4876 case CEPH_CAP_OP_GRANT
: return handle_cap_grant(session
, in
, &cap
, m
);
4877 case CEPH_CAP_OP_FLUSH_ACK
: return handle_cap_flush_ack(session
, in
, &cap
, m
);
4880 ldout(cct
, 5) << __func__
<< " don't have " << *in
<< " cap on mds." << mds
<< dendl
;
4885 void Client::handle_cap_import(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
4887 mds_rank_t mds
= session
->mds_num
;
4889 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
4890 << " IMPORT from mds." << mds
<< dendl
;
4892 const mds_rank_t peer_mds
= mds_rank_t(m
->peer
.mds
);
4895 if (auto it
= in
->caps
.find(peer_mds
); m
->peer
.cap_id
&& it
!= in
->caps
.end()) {
4897 cap_perms
= cap
->latest_perms
;
4901 SnapRealm
*realm
= NULL
;
4902 update_snap_trace(m
->snapbl
, &realm
);
4904 add_update_cap(in
, session
, m
->get_cap_id(),
4905 m
->get_caps(), m
->get_wanted(), m
->get_seq(), m
->get_mseq(),
4906 m
->get_realm(), CEPH_CAP_FLAG_AUTH
, cap_perms
);
4908 if (cap
&& cap
->cap_id
== m
->peer
.cap_id
) {
4909 remove_cap(cap
, (m
->peer
.flags
& CEPH_CAP_FLAG_RELEASE
));
4913 put_snap_realm(realm
);
4915 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
4916 // reflush any/all caps (if we are now the auth_cap)
4917 kick_flushing_caps(in
, session
);
4921 void Client::handle_cap_export(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
4923 mds_rank_t mds
= session
->mds_num
;
4925 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
4926 << " EXPORT from mds." << mds
<< dendl
;
4928 auto it
= in
->caps
.find(mds
);
4929 if (it
!= in
->caps
.end()) {
4930 Cap
&cap
= it
->second
;
4931 if (cap
.cap_id
== m
->get_cap_id()) {
4932 if (m
->peer
.cap_id
) {
4933 const auto peer_mds
= mds_rank_t(m
->peer
.mds
);
4934 MetaSession
*tsession
= _get_or_open_mds_session(peer_mds
);
4935 auto it
= in
->caps
.find(peer_mds
);
4936 if (it
!= in
->caps
.end()) {
4937 Cap
&tcap
= it
->second
;
4938 if (tcap
.cap_id
== m
->peer
.cap_id
&&
4939 ceph_seq_cmp(tcap
.seq
, m
->peer
.seq
) < 0) {
4940 tcap
.cap_id
= m
->peer
.cap_id
;
4941 tcap
.seq
= m
->peer
.seq
- 1;
4942 tcap
.issue_seq
= tcap
.seq
;
4943 tcap
.issued
|= cap
.issued
;
4944 tcap
.implemented
|= cap
.issued
;
4945 if (&cap
== in
->auth_cap
)
4946 in
->auth_cap
= &tcap
;
4947 if (in
->auth_cap
== &tcap
&& in
->flushing_cap_item
.is_on_list())
4948 adjust_session_flushing_caps(in
, session
, tsession
);
4951 add_update_cap(in
, tsession
, m
->peer
.cap_id
, cap
.issued
, 0,
4952 m
->peer
.seq
- 1, m
->peer
.mseq
, (uint64_t)-1,
4953 &cap
== in
->auth_cap
? CEPH_CAP_FLAG_AUTH
: 0,
4957 if (cap
.wanted
| cap
.issued
)
4958 in
->flags
|= I_CAP_DROPPED
;
4961 remove_cap(&cap
, false);
4966 void Client::handle_cap_trunc(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
4968 mds_rank_t mds
= session
->mds_num
;
4969 ceph_assert(in
->caps
.count(mds
));
4971 ldout(cct
, 10) << __func__
<< " on ino " << *in
4972 << " size " << in
->size
<< " -> " << m
->get_size()
4976 in
->caps_issued(&issued
);
4977 issued
|= in
->caps_dirty();
4978 update_inode_file_size(in
, issued
, m
->get_size(),
4979 m
->get_truncate_seq(), m
->get_truncate_size());
4982 void Client::handle_cap_flush_ack(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
4984 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
4985 int dirty
= m
->get_dirty();
4989 auto it
= in
->flushing_cap_tids
.begin();
4990 if (it
->first
< flush_ack_tid
) {
4991 ldout(cct
, 0) << __func__
<< " mds." << session
->mds_num
4992 << " got unexpected flush ack tid " << flush_ack_tid
4993 << " expected is " << it
->first
<< dendl
;
4995 for (; it
!= in
->flushing_cap_tids
.end(); ) {
5001 if (it
->first
== flush_ack_tid
)
5002 cleaned
= it
->second
;
5003 if (it
->first
<= flush_ack_tid
) {
5004 session
->flushing_caps_tids
.erase(it
->first
);
5005 in
->flushing_cap_tids
.erase(it
++);
5009 cleaned
&= ~it
->second
;
5015 ldout(cct
, 5) << __func__
<< " mds." << session
->mds_num
5016 << " cleaned " << ccap_string(cleaned
) << " on " << *in
5017 << " with " << ccap_string(dirty
) << dendl
;
5020 signal_cond_list(in
->waitfor_caps
);
5021 if (session
->flushing_caps_tids
.empty() ||
5022 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5023 sync_cond
.notify_all();
5027 in
->cap_dirtier_uid
= -1;
5028 in
->cap_dirtier_gid
= -1;
5032 ldout(cct
, 10) << " tid " << m
->get_client_tid() << " != any cap bit tids" << dendl
;
5034 if (in
->flushing_caps
) {
5035 ldout(cct
, 5) << " flushing_caps " << ccap_string(in
->flushing_caps
)
5036 << " -> " << ccap_string(in
->flushing_caps
& ~cleaned
) << dendl
;
5037 in
->flushing_caps
&= ~cleaned
;
5038 if (in
->flushing_caps
== 0) {
5039 ldout(cct
, 10) << " " << *in
<< " !flushing" << dendl
;
5040 num_flushing_caps
--;
5041 if (in
->flushing_cap_tids
.empty())
5042 in
->flushing_cap_item
.remove_myself();
5044 if (!in
->caps_dirty())
5051 void Client::handle_cap_flushsnap_ack(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5053 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
5054 mds_rank_t mds
= session
->mds_num
;
5055 ceph_assert(in
->caps
.count(mds
));
5056 snapid_t follows
= m
->get_snap_follows();
5058 if (auto it
= in
->cap_snaps
.find(follows
); it
!= in
->cap_snaps
.end()) {
5059 auto& capsnap
= it
->second
;
5060 if (flush_ack_tid
!= capsnap
.flush_tid
) {
5061 ldout(cct
, 10) << " tid " << flush_ack_tid
<< " != " << capsnap
.flush_tid
<< dendl
;
5063 InodeRef
tmp_ref(in
);
5064 ldout(cct
, 5) << __func__
<< " mds." << mds
<< " flushed snap follows " << follows
5065 << " on " << *in
<< dendl
;
5066 session
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
5067 in
->flushing_cap_tids
.erase(capsnap
.flush_tid
);
5068 if (in
->flushing_caps
== 0 && in
->flushing_cap_tids
.empty())
5069 in
->flushing_cap_item
.remove_myself();
5070 in
->cap_snaps
.erase(it
);
5072 signal_cond_list(in
->waitfor_caps
);
5073 if (session
->flushing_caps_tids
.empty() ||
5074 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5075 sync_cond
.notify_all();
5078 ldout(cct
, 5) << __func__
<< " DUP(?) mds." << mds
<< " flushed snap follows " << follows
5079 << " on " << *in
<< dendl
;
5080 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
5084 class C_Client_DentryInvalidate
: public Context
{
5091 C_Client_DentryInvalidate(Client
*c
, Dentry
*dn
, bool del
) :
5092 client(c
), name(dn
->name
) {
5093 if (client
->use_faked_inos()) {
5094 dirino
.ino
= dn
->dir
->parent_inode
->faked_ino
;
5096 ino
.ino
= dn
->inode
->faked_ino
;
5098 dirino
= dn
->dir
->parent_inode
->vino();
5100 ino
= dn
->inode
->vino();
5103 ino
.ino
= inodeno_t();
5105 void finish(int r
) override
{
5106 // _async_dentry_invalidate is responsible for its own locking
5107 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
5108 client
->_async_dentry_invalidate(dirino
, ino
, name
);
5112 void Client::_async_dentry_invalidate(vinodeno_t dirino
, vinodeno_t ino
, string
& name
)
5116 ldout(cct
, 10) << __func__
<< " '" << name
<< "' ino " << ino
5117 << " in dir " << dirino
<< dendl
;
5118 dentry_invalidate_cb(callback_handle
, dirino
, ino
, name
);
5121 void Client::_schedule_invalidate_dentry_callback(Dentry
*dn
, bool del
)
5123 if (dentry_invalidate_cb
&& dn
->inode
->ll_ref
> 0)
5124 async_dentry_invalidator
.queue(new C_Client_DentryInvalidate(this, dn
, del
));
5127 void Client::_try_to_trim_inode(Inode
*in
, bool sched_inval
)
5129 int ref
= in
->get_num_ref();
5130 ldout(cct
, 5) << __func__
<< " in " << *in
<<dendl
;
5132 if (in
->dir
&& !in
->dir
->dentries
.empty()) {
5133 for (auto p
= in
->dir
->dentries
.begin();
5134 p
!= in
->dir
->dentries
.end(); ) {
5135 Dentry
*dn
= p
->second
;
5137 /* rmsnap removes whole subtree, need trim inodes recursively.
5138 * we don't need to invalidate dentries recursively. because
5139 * invalidating a directory dentry effectively invalidate
5141 if (in
->snapid
!= CEPH_NOSNAP
&& dn
->inode
&& dn
->inode
->is_dir())
5142 _try_to_trim_inode(dn
->inode
.get(), false);
5144 if (dn
->lru_is_expireable())
5145 unlink(dn
, true, false); // keep dir, drop dentry
5147 if (in
->dir
->dentries
.empty()) {
5153 if (ref
> 0 && (in
->flags
& I_SNAPDIR_OPEN
)) {
5154 InodeRef snapdir
= open_snapdir(in
);
5155 _try_to_trim_inode(snapdir
.get(), false);
5160 auto q
= in
->dentries
.begin();
5161 while (q
!= in
->dentries
.end()) {
5164 if( in
->ll_ref
> 0 && sched_inval
) {
5165 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5166 // so in->dentries doesn't always reflect the state of kernel's dcache.
5167 _schedule_invalidate_dentry_callback(dn
, true);
5169 unlink(dn
, true, true);
5174 void Client::handle_cap_grant(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
5176 mds_rank_t mds
= session
->mds_num
;
5177 int used
= get_caps_used(in
);
5178 int wanted
= in
->caps_wanted();
5180 const unsigned new_caps
= m
->get_caps();
5181 const bool was_stale
= session
->cap_gen
> cap
->gen
;
5182 ldout(cct
, 5) << __func__
<< " on in " << m
->get_ino()
5183 << " mds." << mds
<< " seq " << m
->get_seq()
5184 << " caps now " << ccap_string(new_caps
)
5185 << " was " << ccap_string(cap
->issued
)
5186 << (was_stale
? " (stale)" : "") << dendl
;
5189 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
5190 cap
->seq
= m
->get_seq();
5191 cap
->gen
= session
->cap_gen
;
5193 check_cap_issue(in
, new_caps
);
5197 in
->caps_issued(&issued
);
5198 issued
|= in
->caps_dirty();
5200 if ((new_caps
& CEPH_CAP_AUTH_SHARED
) &&
5201 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
5202 in
->mode
= m
->head
.mode
;
5203 in
->uid
= m
->head
.uid
;
5204 in
->gid
= m
->head
.gid
;
5205 in
->btime
= m
->btime
;
5207 bool deleted_inode
= false;
5208 if ((new_caps
& CEPH_CAP_LINK_SHARED
) &&
5209 !(issued
& CEPH_CAP_LINK_EXCL
)) {
5210 in
->nlink
= m
->head
.nlink
;
5211 if (in
->nlink
== 0 &&
5212 (new_caps
& (CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
)))
5213 deleted_inode
= true;
5215 if (!(issued
& CEPH_CAP_XATTR_EXCL
) &&
5216 m
->xattrbl
.length() &&
5217 m
->head
.xattr_version
> in
->xattr_version
) {
5218 auto p
= m
->xattrbl
.cbegin();
5219 decode(in
->xattrs
, p
);
5220 in
->xattr_version
= m
->head
.xattr_version
;
5223 if ((new_caps
& CEPH_CAP_FILE_SHARED
) && m
->dirstat_is_valid()) {
5224 in
->dirstat
.nfiles
= m
->get_nfiles();
5225 in
->dirstat
.nsubdirs
= m
->get_nsubdirs();
5228 if (new_caps
& CEPH_CAP_ANY_RD
) {
5229 update_inode_file_time(in
, issued
, m
->get_time_warp_seq(),
5230 m
->get_ctime(), m
->get_mtime(), m
->get_atime());
5233 if (new_caps
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
)) {
5234 in
->layout
= m
->get_layout();
5235 update_inode_file_size(in
, issued
, m
->get_size(),
5236 m
->get_truncate_seq(), m
->get_truncate_size());
5239 if (m
->inline_version
> in
->inline_version
) {
5240 in
->inline_data
= m
->inline_data
;
5241 in
->inline_version
= m
->inline_version
;
5244 /* always take a newer change attr */
5245 if (m
->get_change_attr() > in
->change_attr
)
5246 in
->change_attr
= m
->get_change_attr();
5249 if (cap
== in
->auth_cap
&&
5250 (new_caps
& CEPH_CAP_ANY_FILE_WR
) &&
5251 (m
->get_max_size() != in
->max_size
)) {
5252 ldout(cct
, 10) << "max_size " << in
->max_size
<< " -> " << m
->get_max_size() << dendl
;
5253 in
->max_size
= m
->get_max_size();
5254 if (in
->max_size
> in
->wanted_max_size
) {
5255 in
->wanted_max_size
= 0;
5256 in
->requested_max_size
= 0;
5261 if ((was_stale
|| m
->get_op() == CEPH_CAP_OP_IMPORT
) &&
5262 (wanted
& ~(cap
->wanted
| new_caps
))) {
5263 // If mds is importing cap, prior cap messages that update 'wanted'
5264 // may get dropped by mds (migrate seq mismatch).
5266 // We don't send cap message to update 'wanted' if what we want are
5267 // already issued. If mds revokes caps, cap message that releases caps
5268 // also tells mds what we want. But if caps got revoked by mds forcedly
5269 // (session stale). We may haven't told mds what we want.
5275 auto revoked
= cap
->issued
& ~new_caps
;
5277 ldout(cct
, 10) << " revocation of " << ccap_string(revoked
) << dendl
;
5278 cap
->issued
= new_caps
;
5279 cap
->implemented
|= new_caps
;
5281 // recall delegations if we're losing caps necessary for them
5282 if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_RD
))
5283 in
->recall_deleg(false);
5284 else if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_WR
))
5285 in
->recall_deleg(true);
5287 used
= adjust_caps_used_for_lazyio(used
, cap
->issued
, cap
->implemented
);
5288 if ((used
& revoked
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
)) &&
5289 !_flush(in
, new C_Client_FlushComplete(this, in
))) {
5290 // waitin' for flush
5291 } else if (used
& revoked
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
)) {
5295 cap
->wanted
= 0; // don't let check_caps skip sending a response to MDS
5298 } else if (cap
->issued
== new_caps
) {
5299 ldout(cct
, 10) << " caps unchanged at " << ccap_string(cap
->issued
) << dendl
;
5301 ldout(cct
, 10) << " grant, new caps are " << ccap_string(new_caps
& ~cap
->issued
) << dendl
;
5302 cap
->issued
= new_caps
;
5303 cap
->implemented
|= new_caps
;
5305 if (cap
== in
->auth_cap
) {
5306 // non-auth MDS is revoking the newly grant caps ?
5307 for (const auto &p
: in
->caps
) {
5308 if (&p
.second
== cap
)
5310 if (p
.second
.implemented
& ~p
.second
.issued
& new_caps
) {
5323 signal_cond_list(in
->waitfor_caps
);
5325 // may drop inode's last ref
5327 _try_to_trim_inode(in
, true);
5330 int Client::inode_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
5332 if (perms
.uid() == 0)
5335 if (perms
.uid() != in
->uid
&& (in
->mode
& S_IRWXG
)) {
5336 int ret
= _posix_acl_permission(in
, perms
, want
);
5341 // check permissions before doing anything else
5342 if (!in
->check_mode(perms
, want
))
5347 int Client::xattr_permission(Inode
*in
, const char *name
, unsigned want
,
5348 const UserPerm
& perms
)
5350 int r
= _getattr_for_perm(in
, perms
);
5355 if (strncmp(name
, "system.", 7) == 0) {
5356 if ((want
& MAY_WRITE
) && (perms
.uid() != 0 && perms
.uid() != in
->uid
))
5359 r
= inode_permission(in
, perms
, want
);
5362 ldout(cct
, 5) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5366 ostream
& operator<<(ostream
&out
, const UserPerm
& perm
) {
5367 out
<< "UserPerm(uid: " << perm
.uid() << ", gid: " << perm
.gid() << ")";
5371 int Client::may_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
5372 const UserPerm
& perms
)
5374 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5375 int r
= _getattr_for_perm(in
, perms
);
5379 if (mask
& CEPH_SETATTR_SIZE
) {
5380 r
= inode_permission(in
, perms
, MAY_WRITE
);
5386 if (mask
& CEPH_SETATTR_UID
) {
5387 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
|| stx
->stx_uid
!= in
->uid
))
5390 if (mask
& CEPH_SETATTR_GID
) {
5391 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
||
5392 (!perms
.gid_in_groups(stx
->stx_gid
) && stx
->stx_gid
!= in
->gid
)))
5396 if (mask
& CEPH_SETATTR_MODE
) {
5397 if (perms
.uid() != 0 && perms
.uid() != in
->uid
)
5400 gid_t i_gid
= (mask
& CEPH_SETATTR_GID
) ? stx
->stx_gid
: in
->gid
;
5401 if (perms
.uid() != 0 && !perms
.gid_in_groups(i_gid
))
5402 stx
->stx_mode
&= ~S_ISGID
;
5405 if (mask
& (CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
|
5406 CEPH_SETATTR_MTIME
| CEPH_SETATTR_ATIME
)) {
5407 if (perms
.uid() != 0 && perms
.uid() != in
->uid
) {
5408 int check_mask
= CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
;
5409 if (!(mask
& CEPH_SETATTR_MTIME_NOW
))
5410 check_mask
|= CEPH_SETATTR_MTIME
;
5411 if (!(mask
& CEPH_SETATTR_ATIME_NOW
))
5412 check_mask
|= CEPH_SETATTR_ATIME
;
5413 if (check_mask
& mask
) {
5416 r
= inode_permission(in
, perms
, MAY_WRITE
);
5424 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5428 int Client::may_open(Inode
*in
, int flags
, const UserPerm
& perms
)
5430 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5433 if ((flags
& O_ACCMODE
) == O_WRONLY
)
5435 else if ((flags
& O_ACCMODE
) == O_RDWR
)
5436 want
= MAY_READ
| MAY_WRITE
;
5437 else if ((flags
& O_ACCMODE
) == O_RDONLY
)
5439 if (flags
& O_TRUNC
)
5443 switch (in
->mode
& S_IFMT
) {
5448 if (want
& MAY_WRITE
) {
5455 r
= _getattr_for_perm(in
, perms
);
5459 r
= inode_permission(in
, perms
, want
);
5461 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5465 int Client::may_lookup(Inode
*dir
, const UserPerm
& perms
)
5467 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5468 int r
= _getattr_for_perm(dir
, perms
);
5472 r
= inode_permission(dir
, perms
, MAY_EXEC
);
5474 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5478 int Client::may_create(Inode
*dir
, const UserPerm
& perms
)
5480 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5481 int r
= _getattr_for_perm(dir
, perms
);
5485 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5487 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5491 int Client::may_delete(Inode
*dir
, const char *name
, const UserPerm
& perms
)
5493 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << "; name " << name
<< "; " << perms
<< dendl
;
5494 int r
= _getattr_for_perm(dir
, perms
);
5498 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5502 /* 'name == NULL' means rmsnap */
5503 if (perms
.uid() != 0 && name
&& (dir
->mode
& S_ISVTX
)) {
5505 r
= _lookup(dir
, name
, CEPH_CAP_AUTH_SHARED
, &otherin
, perms
);
5508 if (dir
->uid
!= perms
.uid() && otherin
->uid
!= perms
.uid())
5512 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5516 int Client::may_hardlink(Inode
*in
, const UserPerm
& perms
)
5518 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5519 int r
= _getattr_for_perm(in
, perms
);
5523 if (perms
.uid() == 0 || perms
.uid() == in
->uid
) {
5529 if (!S_ISREG(in
->mode
))
5532 if (in
->mode
& S_ISUID
)
5535 if ((in
->mode
& (S_ISGID
| S_IXGRP
)) == (S_ISGID
| S_IXGRP
))
5538 r
= inode_permission(in
, perms
, MAY_READ
| MAY_WRITE
);
5540 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5544 int Client::_getattr_for_perm(Inode
*in
, const UserPerm
& perms
)
5546 int mask
= CEPH_STAT_CAP_MODE
;
5548 if (acl_type
!= NO_ACL
) {
5549 mask
|= CEPH_STAT_CAP_XATTR
;
5550 force
= in
->xattr_version
== 0;
5552 return _getattr(in
, mask
, perms
, force
);
5555 vinodeno_t
Client::_get_vino(Inode
*in
)
5557 /* The caller must hold the client lock */
5558 return vinodeno_t(in
->ino
, in
->snapid
);
5562 * Resolve an MDS spec to a list of MDS daemon GIDs.
5564 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5565 * It may be '*' in which case it matches all GIDs.
5567 * If no error is returned, the `targets` vector will be populated with at least
5570 int Client::resolve_mds(
5571 const std::string
&mds_spec
,
5572 std::vector
<mds_gid_t
> *targets
)
5575 ceph_assert(targets
!= nullptr);
5578 std::stringstream ss
;
5579 int role_r
= fsmap
->parse_role(mds_spec
, &role
, ss
);
5581 // We got a role, resolve it to a GID
5582 ldout(cct
, 10) << __func__
<< ": resolved '" << mds_spec
<< "' to role '"
5583 << role
<< "'" << dendl
;
5585 fsmap
->get_filesystem(role
.fscid
)->mds_map
.get_info(role
.rank
).global_id
);
5589 std::string strtol_err
;
5590 long long rank_or_gid
= strict_strtoll(mds_spec
.c_str(), 10, &strtol_err
);
5591 if (strtol_err
.empty()) {
5592 // It is a possible GID
5593 const mds_gid_t mds_gid
= mds_gid_t(rank_or_gid
);
5594 if (fsmap
->gid_exists(mds_gid
)) {
5595 ldout(cct
, 10) << __func__
<< ": validated GID " << mds_gid
<< dendl
;
5596 targets
->push_back(mds_gid
);
5598 lderr(cct
) << __func__
<< ": GID " << mds_gid
<< " not in MDS map"
5602 } else if (mds_spec
== "*") {
5603 // It is a wildcard: use all MDSs
5604 const auto mds_info
= fsmap
->get_mds_info();
5606 if (mds_info
.empty()) {
5607 lderr(cct
) << __func__
<< ": * passed but no MDS daemons found" << dendl
;
5611 for (const auto i
: mds_info
) {
5612 targets
->push_back(i
.first
);
5615 // It did not parse as an integer, it is not a wildcard, it must be a name
5616 const mds_gid_t mds_gid
= fsmap
->find_mds_gid_by_name(mds_spec
);
5618 lderr(cct
) << "MDS ID '" << mds_spec
<< "' not found" << dendl
;
5620 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
5624 ldout(cct
, 10) << __func__
<< ": resolved ID '" << mds_spec
5625 << "' to GID " << mds_gid
<< dendl
;
5626 targets
->push_back(mds_gid
);
5635 * Authenticate with mon and establish global ID
5637 int Client::authenticate()
5639 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
5641 if (monclient
->is_authenticated()) {
5645 client_lock
.unlock();
5646 int r
= monclient
->authenticate(cct
->_conf
->client_mount_timeout
);
5652 whoami
= monclient
->get_global_id();
5653 messenger
->set_myname(entity_name_t::CLIENT(whoami
.v
));
5658 int Client::fetch_fsmap(bool user
)
5661 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5662 // rather than MDSMap because no one MDSMap contains all the daemons, and
5663 // a `tell` can address any daemon.
5664 version_t fsmap_latest
;
5667 monclient
->get_version("fsmap", &fsmap_latest
, NULL
, &cond
);
5668 client_lock
.unlock();
5671 } while (r
== -EAGAIN
);
5674 lderr(cct
) << "Failed to learn FSMap version: " << cpp_strerror(r
) << dendl
;
5678 ldout(cct
, 10) << __func__
<< " learned FSMap version " << fsmap_latest
<< dendl
;
5681 if (!fsmap_user
|| fsmap_user
->get_epoch() < fsmap_latest
) {
5682 monclient
->sub_want("fsmap.user", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
5683 monclient
->renew_subs();
5684 wait_on_list(waiting_for_fsmap
);
5686 ceph_assert(fsmap_user
);
5687 ceph_assert(fsmap_user
->get_epoch() >= fsmap_latest
);
5689 if (!fsmap
|| fsmap
->get_epoch() < fsmap_latest
) {
5690 monclient
->sub_want("fsmap", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
5691 monclient
->renew_subs();
5692 wait_on_list(waiting_for_fsmap
);
5695 ceph_assert(fsmap
->get_epoch() >= fsmap_latest
);
5697 ldout(cct
, 10) << __func__
<< " finished waiting for FSMap version "
5698 << fsmap_latest
<< dendl
;
5704 * @mds_spec one of ID, rank, GID, "*"
5707 int Client::mds_command(
5708 const std::string
&mds_spec
,
5709 const vector
<string
>& cmd
,
5710 const bufferlist
& inbl
,
5715 std::lock_guard
lock(client_lock
);
5726 r
= fetch_fsmap(false);
5731 // Look up MDS target(s) of the command
5732 std::vector
<mds_gid_t
> targets
;
5733 r
= resolve_mds(mds_spec
, &targets
);
5738 // If daemons are laggy, we won't send them commands. If all
5739 // are laggy then we fail.
5740 std::vector
<mds_gid_t
> non_laggy
;
5741 for (const auto gid
: targets
) {
5742 const auto info
= fsmap
->get_info_gid(gid
);
5743 if (!info
.laggy()) {
5744 non_laggy
.push_back(gid
);
5747 if (non_laggy
.size() == 0) {
5748 *outs
= "All targeted MDS daemons are laggy";
5752 if (metadata
.empty()) {
5753 // We are called on an unmounted client, so metadata
5754 // won't be initialized yet.
5755 populate_metadata("");
5758 // Send commands to targets
5759 C_GatherBuilder
gather(cct
, onfinish
);
5760 for (const auto target_gid
: non_laggy
) {
5761 const auto info
= fsmap
->get_info_gid(target_gid
);
5763 // Open a connection to the target MDS
5764 ConnectionRef conn
= messenger
->connect_to_mds(info
.get_addrs());
5766 // Generate MDSCommandOp state
5767 auto &op
= command_table
.start_command();
5769 op
.on_finish
= gather
.new_sub();
5774 op
.mds_gid
= target_gid
;
5777 ldout(cct
, 4) << __func__
<< ": new command op to " << target_gid
5778 << " tid=" << op
.tid
<< cmd
<< dendl
;
5780 // Construct and send MCommand
5781 auto m
= op
.get_message(monclient
->get_fsid());
5782 conn
->send_message2(std::move(m
));
5789 void Client::handle_command_reply(const MConstRef
<MCommandReply
>& m
)
5791 ceph_tid_t
const tid
= m
->get_tid();
5793 ldout(cct
, 10) << __func__
<< ": tid=" << m
->get_tid() << dendl
;
5795 if (!command_table
.exists(tid
)) {
5796 ldout(cct
, 1) << __func__
<< ": unknown tid " << tid
<< ", dropping" << dendl
;
5800 auto &op
= command_table
.get_command(tid
);
5802 *op
.outbl
= m
->get_data();
5809 op
.on_finish
->complete(m
->r
);
5812 command_table
.erase(tid
);
5815 // -------------------
5818 int Client::subscribe_mdsmap(const std::string
&fs_name
)
5820 int r
= authenticate();
5822 lderr(cct
) << "authentication failed: " << cpp_strerror(r
) << dendl
;
5826 std::string resolved_fs_name
;
5827 if (fs_name
.empty()) {
5828 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_fs");
5829 if (resolved_fs_name
.empty())
5830 // Try the backwards compatibility fs name option
5831 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_mds_namespace");
5833 resolved_fs_name
= fs_name
;
5836 std::string want
= "mdsmap";
5837 if (!resolved_fs_name
.empty()) {
5838 r
= fetch_fsmap(true);
5841 fscid
= fsmap_user
->get_fs_cid(resolved_fs_name
);
5842 if (fscid
== FS_CLUSTER_ID_NONE
) {
5846 std::ostringstream oss
;
5847 oss
<< want
<< "." << fscid
;
5850 ldout(cct
, 10) << "Subscribing to map '" << want
<< "'" << dendl
;
5852 monclient
->sub_want(want
, 0, 0);
5853 monclient
->renew_subs();
5858 int Client::mount(const std::string
&mount_root
, const UserPerm
& perms
,
5859 bool require_mds
, const std::string
&fs_name
)
5861 std::lock_guard
lock(client_lock
);
5864 ldout(cct
, 5) << "already mounted" << dendl
;
5870 int r
= subscribe_mdsmap(fs_name
);
5872 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
5876 tick(); // start tick
5880 auto availability
= mdsmap
->is_cluster_available();
5881 if (availability
== MDSMap::STUCK_UNAVAILABLE
) {
5883 ldout(cct
, 10) << "mds cluster unavailable: epoch=" << mdsmap
->get_epoch() << dendl
;
5884 return CEPH_FUSE_NO_MDS_UP
;
5885 } else if (availability
== MDSMap::AVAILABLE
) {
5886 // Continue to mount
5888 } else if (availability
== MDSMap::TRANSIENT_UNAVAILABLE
) {
5889 // Else, wait. MDSMonitor will update the map to bring
5890 // us to a conclusion eventually.
5891 wait_on_list(waiting_for_mdsmap
);
5893 // Unexpected value!
5899 populate_metadata(mount_root
.empty() ? "/" : mount_root
);
5901 filepath
fp(CEPH_INO_ROOT
);
5902 if (!mount_root
.empty()) {
5903 fp
= filepath(mount_root
.c_str());
5906 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
5907 req
->set_filepath(fp
);
5908 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
5909 int res
= make_request(req
, perms
);
5911 if (res
== -EACCES
&& root
) {
5912 ldout(cct
, 1) << __func__
<< " EACCES on parent of mount point; quotas may not work" << dendl
;
5930 if (!cct
->_conf
->client_trace
.empty()) {
5931 traceout
.open(cct
->_conf
->client_trace
.c_str());
5932 if (traceout
.is_open()) {
5933 ldout(cct
, 1) << "opened trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
5935 ldout(cct
, 1) << "FAILED to open trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
5940 ldout(cct, 3) << "op: // client trace data structs" << dendl;
5941 ldout(cct, 3) << "op: struct stat st;" << dendl;
5942 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
5943 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
5944 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
5945 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
5946 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
5947 ldout(cct, 3) << "op: int fd;" << dendl;
5954 void Client::_close_sessions()
5956 while (!mds_sessions
.empty()) {
5957 // send session closes!
5958 for (auto &p
: mds_sessions
) {
5959 if (p
.second
.state
!= MetaSession::STATE_CLOSING
) {
5960 _close_mds_session(&p
.second
);
5964 // wait for sessions to close
5965 ldout(cct
, 2) << "waiting for " << mds_sessions
.size() << " mds sessions to close" << dendl
;
5966 std::unique_lock l
{client_lock
, std::adopt_lock
};
5972 void Client::flush_mdlog_sync()
5974 if (mds_requests
.empty())
5976 for (auto &p
: mds_sessions
) {
5977 flush_mdlog(&p
.second
);
5981 void Client::flush_mdlog(MetaSession
*session
)
5983 // Only send this to Luminous or newer MDS daemons, older daemons
5984 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5985 const uint64_t features
= session
->con
->get_features();
5986 if (HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
5987 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REQUEST_FLUSH_MDLOG
);
5988 session
->con
->send_message2(std::move(m
));
5993 void Client::_abort_mds_sessions(int err
)
5995 for (auto p
= mds_requests
.begin(); p
!= mds_requests
.end(); ) {
5996 auto req
= p
->second
;
5998 // unsafe requests will be removed during close session below.
5999 if (req
->got_unsafe
)
6003 if (req
->caller_cond
) {
6005 req
->caller_cond
->notify_all();
6009 // Process aborts on any requests that were on this waitlist.
6010 // Any requests that were on a waiting_for_open session waitlist
6011 // will get kicked during close session below.
6012 signal_cond_list(waiting_for_mdsmap
);
6014 // Force-close all sessions
6015 while(!mds_sessions
.empty()) {
6016 auto& session
= mds_sessions
.begin()->second
;
6017 _closed_mds_session(&session
);
6021 void Client::_unmount(bool abort
)
6023 std::unique_lock lock
{client_lock
, std::adopt_lock
};
6027 if (abort
|| blacklisted
) {
6028 ldout(cct
, 2) << "unmounting (" << (abort
? "abort)" : "blacklisted)") << dendl
;
6030 ldout(cct
, 2) << "unmounting" << dendl
;
6037 // Abort all mds sessions
6038 _abort_mds_sessions(-ENOTCONN
);
6040 objecter
->op_cancel_writes(-ENOTCONN
);
6042 // flush the mdlog for pending requests, if any
6046 mount_cond
.wait(lock
, [this] {
6047 if (!mds_requests
.empty()) {
6048 ldout(cct
, 10) << "waiting on " << mds_requests
.size() << " requests"
6051 return mds_requests
.empty();
6054 timer
.cancel_event(tick_event
);
6059 // clean up any unclosed files
6060 while (!fd_map
.empty()) {
6061 Fh
*fh
= fd_map
.begin()->second
;
6062 fd_map
.erase(fd_map
.begin());
6063 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *fh
->inode
<< dendl
;
6067 while (!ll_unclosed_fh_set
.empty()) {
6068 set
<Fh
*>::iterator it
= ll_unclosed_fh_set
.begin();
6070 ll_unclosed_fh_set
.erase(fh
);
6071 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *(fh
->inode
) << dendl
;
6075 while (!opened_dirs
.empty()) {
6076 dir_result_t
*dirp
= *opened_dirs
.begin();
6077 ldout(cct
, 0) << " destroyed lost open dir " << dirp
<< " on " << *dirp
->inode
<< dendl
;
6083 mount_cond
.wait(lock
, [this] {
6084 if (unsafe_sync_write
> 0) {
6085 ldout(cct
, 0) << unsafe_sync_write
<< " unsafe_sync_writes, waiting"
6088 return unsafe_sync_write
<= 0;
6091 if (cct
->_conf
->client_oc
) {
6092 // flush/release all buffered data
6093 std::list
<InodeRef
> anchor
;
6094 for (auto& p
: inode_map
) {
6095 Inode
*in
= p
.second
;
6097 ldout(cct
, 0) << "null inode_map entry ino " << p
.first
<< dendl
;
6101 // prevent inode from getting freed
6102 anchor
.emplace_back(in
);
6104 if (abort
|| blacklisted
) {
6105 objectcacher
->purge_set(&in
->oset
);
6106 } else if (!in
->caps
.empty()) {
6108 _flush(in
, new C_Client_FlushComplete(this, in
));
6113 if (abort
|| blacklisted
) {
6114 for (auto p
= dirty_list
.begin(); !p
.end(); ) {
6117 if (in
->dirty_caps
) {
6118 ldout(cct
, 0) << " drop dirty caps on " << *in
<< dendl
;
6119 in
->mark_caps_clean();
6125 wait_sync_caps(last_flush_tid
);
6131 while (lru
.lru_get_size() > 0 ||
6132 !inode_map
.empty()) {
6133 ldout(cct
, 2) << "cache still has " << lru
.lru_get_size()
6134 << "+" << inode_map
.size() << " items"
6135 << ", waiting (for caps to release?)"
6137 if (auto r
= mount_cond
.wait_for(lock
, ceph::make_timespan(5));
6138 r
== std::cv_status::timeout
) {
6142 ceph_assert(lru
.lru_get_size() == 0);
6143 ceph_assert(inode_map
.empty());
6146 if (!cct
->_conf
->client_trace
.empty()) {
6147 ldout(cct
, 1) << "closing trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6156 ldout(cct
, 2) << "unmounted." << dendl
;
6159 void Client::unmount()
6161 std::lock_guard
lock(client_lock
);
6165 void Client::abort_conn()
6167 std::lock_guard
lock(client_lock
);
6171 void Client::flush_cap_releases()
6173 // send any cap releases
6174 for (auto &p
: mds_sessions
) {
6175 auto &session
= p
.second
;
6176 if (session
.release
&& mdsmap
->is_clientreplay_or_active_or_stopping(
6178 if (cct
->_conf
->client_inject_release_failure
) {
6179 ldout(cct
, 20) << __func__
<< " injecting failure to send cap release message" << dendl
;
6181 session
.con
->send_message2(std::move(session
.release
));
6183 session
.release
.reset();
6190 if (cct
->_conf
->client_debug_inject_tick_delay
> 0) {
6191 sleep(cct
->_conf
->client_debug_inject_tick_delay
);
6192 ceph_assert(0 == cct
->_conf
.set_val("client_debug_inject_tick_delay", "0"));
6193 cct
->_conf
.apply_changes(nullptr);
6196 ldout(cct
, 21) << "tick" << dendl
;
6197 tick_event
= timer
.add_event_after(
6198 cct
->_conf
->client_tick_interval
,
6199 new LambdaContext([this](int) {
6200 // Called back via Timer, which takes client_lock for us
6201 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6204 utime_t now
= ceph_clock_now();
6206 if (!mounted
&& !mds_requests
.empty()) {
6207 MetaRequest
*req
= mds_requests
.begin()->second
;
6208 if (req
->op_stamp
+ cct
->_conf
->client_mount_timeout
< now
) {
6209 req
->abort(-ETIMEDOUT
);
6210 if (req
->caller_cond
) {
6212 req
->caller_cond
->notify_all();
6214 signal_cond_list(waiting_for_mdsmap
);
6215 for (auto &p
: mds_sessions
) {
6216 signal_context_list(p
.second
.waiting_for_open
);
6221 if (mdsmap
->get_epoch()) {
6223 utime_t el
= now
- last_cap_renew
;
6224 if (el
> mdsmap
->get_session_timeout() / 3.0)
6227 flush_cap_releases();
6231 xlist
<Inode
*>::iterator p
= delayed_list
.begin();
6235 if (in
->hold_caps_until
> now
)
6237 delayed_list
.pop_front();
6238 check_caps(in
, CHECK_CAPS_NODELAY
);
6244 void Client::renew_caps()
6246 ldout(cct
, 10) << "renew_caps()" << dendl
;
6247 last_cap_renew
= ceph_clock_now();
6249 for (auto &p
: mds_sessions
) {
6250 ldout(cct
, 15) << "renew_caps requesting from mds." << p
.first
<< dendl
;
6251 if (mdsmap
->get_state(p
.first
) >= MDSMap::STATE_REJOIN
)
6252 renew_caps(&p
.second
);
6256 void Client::renew_caps(MetaSession
*session
)
6258 ldout(cct
, 10) << "renew_caps mds." << session
->mds_num
<< dendl
;
6259 session
->last_cap_renew_request
= ceph_clock_now();
6260 uint64_t seq
= ++session
->cap_renew_seq
;
6261 session
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_RENEWCAPS
, seq
));
6265 // ===============================================================
6266 // high level (POSIXy) interface
6268 int Client::_do_lookup(Inode
*dir
, const string
& name
, int mask
,
6269 InodeRef
*target
, const UserPerm
& perms
)
6271 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_LOOKUPSNAP
: CEPH_MDS_OP_LOOKUP
;
6272 MetaRequest
*req
= new MetaRequest(op
);
6274 dir
->make_nosnap_relative_path(path
);
6275 path
.push_dentry(name
);
6276 req
->set_filepath(path
);
6277 req
->set_inode(dir
);
6278 if (cct
->_conf
->client_debug_getattr_caps
&& op
== CEPH_MDS_OP_LOOKUP
)
6279 mask
|= DEBUG_GETATTR_CAPS
;
6280 req
->head
.args
.getattr
.mask
= mask
;
6282 ldout(cct
, 10) << __func__
<< " on " << path
<< dendl
;
6284 int r
= make_request(req
, perms
, target
);
6285 ldout(cct
, 10) << __func__
<< " res is " << r
<< dendl
;
6289 int Client::_lookup(Inode
*dir
, const string
& dname
, int mask
, InodeRef
*target
,
6290 const UserPerm
& perms
)
6295 if (dname
== "..") {
6296 if (dir
->dentries
.empty()) {
6297 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
6298 filepath
path(dir
->ino
);
6299 req
->set_filepath(path
);
6302 int r
= make_request(req
, perms
, &tmptarget
, NULL
, rand() % mdsmap
->get_num_in_mds());
6305 Inode
*tempino
= tmptarget
.get();
6308 ldout(cct
, 8) << __func__
<< " found target " << (*target
)->ino
<< dendl
;
6314 *target
= dir
->get_first_parent()->dir
->parent_inode
; //dirs can't be hard-linked
6323 if (!dir
->is_dir()) {
6328 if (dname
.length() > NAME_MAX
) {
6333 if (dname
== cct
->_conf
->client_snapdir
&&
6334 dir
->snapid
== CEPH_NOSNAP
) {
6335 *target
= open_snapdir(dir
);
6340 dir
->dir
->dentries
.count(dname
)) {
6341 dn
= dir
->dir
->dentries
[dname
];
6343 ldout(cct
, 20) << __func__
<< " have dn " << dname
<< " mds." << dn
->lease_mds
<< " ttl " << dn
->lease_ttl
6344 << " seq " << dn
->lease_seq
6347 if (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)) {
6348 // is dn lease valid?
6349 utime_t now
= ceph_clock_now();
6350 if (dn
->lease_mds
>= 0 &&
6351 dn
->lease_ttl
> now
&&
6352 mds_sessions
.count(dn
->lease_mds
)) {
6353 MetaSession
&s
= mds_sessions
.at(dn
->lease_mds
);
6354 if (s
.cap_ttl
> now
&&
6355 s
.cap_gen
== dn
->lease_gen
) {
6356 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6357 // make trim_caps() behave.
6358 dir
->try_touch_cap(dn
->lease_mds
);
6361 ldout(cct
, 20) << " bad lease, cap_ttl " << s
.cap_ttl
<< ", cap_gen " << s
.cap_gen
6362 << " vs lease_gen " << dn
->lease_gen
<< dendl
;
6365 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
6366 if (dn
->cap_shared_gen
== dir
->shared_gen
&&
6367 (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)))
6369 if (!dn
->inode
&& (dir
->flags
& I_COMPLETE
)) {
6370 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for "
6371 << *dir
<< " dn '" << dname
<< "'" << dendl
;
6376 ldout(cct
, 20) << " no cap on " << dn
->inode
->vino() << dendl
;
6379 // can we conclude ENOENT locally?
6380 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true) &&
6381 (dir
->flags
& I_COMPLETE
)) {
6382 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for " << *dir
<< " dn '" << dname
<< "'" << dendl
;
6387 r
= _do_lookup(dir
, dname
, mask
, target
, perms
);
6392 *target
= dn
->inode
;
6400 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << r
<< dendl
;
6402 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << **target
<< dendl
;
6406 int Client::get_or_create(Inode
*dir
, const char* name
,
6407 Dentry
**pdn
, bool expect_null
)
6410 ldout(cct
, 20) << __func__
<< " " << *dir
<< " name " << name
<< dendl
;
6412 if (dir
->dir
->dentries
.count(name
)) {
6413 Dentry
*dn
= dir
->dir
->dentries
[name
];
6415 // is dn lease valid?
6416 utime_t now
= ceph_clock_now();
6418 dn
->lease_mds
>= 0 &&
6419 dn
->lease_ttl
> now
&&
6420 mds_sessions
.count(dn
->lease_mds
)) {
6421 MetaSession
&s
= mds_sessions
.at(dn
->lease_mds
);
6422 if (s
.cap_ttl
> now
&&
6423 s
.cap_gen
== dn
->lease_gen
) {
6430 // otherwise link up a new one
6431 *pdn
= link(dir
->dir
, name
, NULL
, NULL
);
6438 int Client::path_walk(const filepath
& origpath
, InodeRef
*end
,
6439 const UserPerm
& perms
, bool followsym
, int mask
)
6441 filepath path
= origpath
;
6443 if (origpath
.absolute())
6449 ldout(cct
, 10) << __func__
<< " " << path
<< dendl
;
6454 while (i
< path
.depth() && cur
) {
6456 const string
&dname
= path
[i
];
6457 ldout(cct
, 10) << " " << i
<< " " << *cur
<< " " << dname
<< dendl
;
6458 ldout(cct
, 20) << " (path is " << path
<< ")" << dendl
;
6460 if (cct
->_conf
->client_permissions
) {
6461 int r
= may_lookup(cur
.get(), perms
);
6464 caps
= CEPH_CAP_AUTH_SHARED
;
6467 /* Get extra requested caps on the last component */
6468 if (i
== (path
.depth() - 1))
6470 int r
= _lookup(cur
.get(), dname
, caps
, &next
, perms
);
6473 // only follow trailing symlink if followsym. always follow
6474 // 'directory' symlinks.
6475 if (next
&& next
->is_symlink()) {
6477 ldout(cct
, 20) << " symlink count " << symlinks
<< ", value is '" << next
->symlink
<< "'" << dendl
;
6478 if (symlinks
> MAXSYMLINKS
) {
6482 if (i
< path
.depth() - 1) {
6484 // replace consumed components of path with symlink dir target
6485 filepath
resolved(next
->symlink
.c_str());
6486 resolved
.append(path
.postfixpath(i
+ 1));
6489 if (next
->symlink
[0] == '/') {
6493 } else if (followsym
) {
6494 if (next
->symlink
[0] == '/') {
6495 path
= next
->symlink
.c_str();
6500 filepath
more(next
->symlink
.c_str());
6501 // we need to remove the symlink component from off of the path
6502 // before adding the target that the symlink points to. remain
6503 // at the same position in the path.
6523 int Client::link(const char *relexisting
, const char *relpath
, const UserPerm
& perm
)
6525 std::lock_guard
lock(client_lock
);
6526 tout(cct
) << "link" << std::endl
;
6527 tout(cct
) << relexisting
<< std::endl
;
6528 tout(cct
) << relpath
<< std::endl
;
6533 filepath
existing(relexisting
);
6536 int r
= path_walk(existing
, &in
, perm
, true);
6539 if (std::string(relpath
) == "/") {
6543 filepath
path(relpath
);
6544 string name
= path
.last_dentry();
6547 r
= path_walk(path
, &dir
, perm
, true);
6550 if (cct
->_conf
->client_permissions
) {
6551 if (S_ISDIR(in
->mode
)) {
6555 r
= may_hardlink(in
.get(), perm
);
6558 r
= may_create(dir
.get(), perm
);
6562 r
= _link(in
.get(), dir
.get(), name
.c_str(), perm
);
6566 int Client::unlink(const char *relpath
, const UserPerm
& perm
)
6568 std::lock_guard
lock(client_lock
);
6569 tout(cct
) << __func__
<< std::endl
;
6570 tout(cct
) << relpath
<< std::endl
;
6575 if (std::string(relpath
) == "/")
6578 filepath
path(relpath
);
6579 string name
= path
.last_dentry();
6582 int r
= path_walk(path
, &dir
, perm
);
6585 if (cct
->_conf
->client_permissions
) {
6586 r
= may_delete(dir
.get(), name
.c_str(), perm
);
6590 return _unlink(dir
.get(), name
.c_str(), perm
);
6593 int Client::rename(const char *relfrom
, const char *relto
, const UserPerm
& perm
)
6595 std::lock_guard
lock(client_lock
);
6596 tout(cct
) << __func__
<< std::endl
;
6597 tout(cct
) << relfrom
<< std::endl
;
6598 tout(cct
) << relto
<< std::endl
;
6603 if (std::string(relfrom
) == "/" || std::string(relto
) == "/")
6606 filepath
from(relfrom
);
6608 string fromname
= from
.last_dentry();
6610 string toname
= to
.last_dentry();
6613 InodeRef fromdir
, todir
;
6614 int r
= path_walk(from
, &fromdir
, perm
);
6617 r
= path_walk(to
, &todir
, perm
);
6621 if (cct
->_conf
->client_permissions
) {
6622 int r
= may_delete(fromdir
.get(), fromname
.c_str(), perm
);
6625 r
= may_delete(todir
.get(), toname
.c_str(), perm
);
6626 if (r
< 0 && r
!= -ENOENT
)
6629 r
= _rename(fromdir
.get(), fromname
.c_str(), todir
.get(), toname
.c_str(), perm
);
6636 int Client::mkdir(const char *relpath
, mode_t mode
, const UserPerm
& perm
)
6638 std::lock_guard
lock(client_lock
);
6639 tout(cct
) << __func__
<< std::endl
;
6640 tout(cct
) << relpath
<< std::endl
;
6641 tout(cct
) << mode
<< std::endl
;
6642 ldout(cct
, 10) << __func__
<< ": " << relpath
<< dendl
;
6647 if (std::string(relpath
) == "/")
6650 filepath
path(relpath
);
6651 string name
= path
.last_dentry();
6654 int r
= path_walk(path
, &dir
, perm
);
6657 if (cct
->_conf
->client_permissions
) {
6658 r
= may_create(dir
.get(), perm
);
6662 return _mkdir(dir
.get(), name
.c_str(), mode
, perm
);
6665 int Client::mkdirs(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
6667 std::lock_guard
lock(client_lock
);
6668 ldout(cct
, 10) << "Client::mkdirs " << relpath
<< dendl
;
6669 tout(cct
) << __func__
<< std::endl
;
6670 tout(cct
) << relpath
<< std::endl
;
6671 tout(cct
) << mode
<< std::endl
;
6676 //get through existing parts of path
6677 filepath
path(relpath
);
6679 int r
= 0, caps
= 0;
6682 for (i
=0; i
<path
.depth(); ++i
) {
6683 if (cct
->_conf
->client_permissions
) {
6684 r
= may_lookup(cur
.get(), perms
);
6687 caps
= CEPH_CAP_AUTH_SHARED
;
6689 r
= _lookup(cur
.get(), path
[i
].c_str(), caps
, &next
, perms
);
6694 if (r
!=-ENOENT
) return r
;
6695 ldout(cct
, 20) << __func__
<< " got through " << i
<< " directories on path " << relpath
<< dendl
;
6696 //make new directory at each level
6697 for (; i
<path
.depth(); ++i
) {
6698 if (cct
->_conf
->client_permissions
) {
6699 r
= may_create(cur
.get(), perms
);
6704 r
= _mkdir(cur
.get(), path
[i
].c_str(), mode
, perms
, &next
);
6706 //check proper creation/existence
6707 if(-EEXIST
== r
&& i
< path
.depth() - 1) {
6708 r
= _lookup(cur
.get(), path
[i
].c_str(), CEPH_CAP_AUTH_SHARED
, &next
, perms
);
6712 //move to new dir and continue
6714 ldout(cct
, 20) << __func__
<< ": successfully created directory "
6715 << filepath(cur
->ino
).get_path() << dendl
;
6720 int Client::rmdir(const char *relpath
, const UserPerm
& perms
)
6722 std::lock_guard
lock(client_lock
);
6723 tout(cct
) << __func__
<< std::endl
;
6724 tout(cct
) << relpath
<< std::endl
;
6729 if (std::string(relpath
) == "/")
6732 filepath
path(relpath
);
6733 string name
= path
.last_dentry();
6736 int r
= path_walk(path
, &dir
, perms
);
6739 if (cct
->_conf
->client_permissions
) {
6740 int r
= may_delete(dir
.get(), name
.c_str(), perms
);
6744 return _rmdir(dir
.get(), name
.c_str(), perms
);
6747 int Client::mknod(const char *relpath
, mode_t mode
, const UserPerm
& perms
, dev_t rdev
)
6749 std::lock_guard
lock(client_lock
);
6750 tout(cct
) << __func__
<< std::endl
;
6751 tout(cct
) << relpath
<< std::endl
;
6752 tout(cct
) << mode
<< std::endl
;
6753 tout(cct
) << rdev
<< std::endl
;
6758 if (std::string(relpath
) == "/")
6761 filepath
path(relpath
);
6762 string name
= path
.last_dentry();
6765 int r
= path_walk(path
, &dir
, perms
);
6768 if (cct
->_conf
->client_permissions
) {
6769 int r
= may_create(dir
.get(), perms
);
6773 return _mknod(dir
.get(), name
.c_str(), mode
, rdev
, perms
);
6778 int Client::symlink(const char *target
, const char *relpath
, const UserPerm
& perms
)
6780 std::lock_guard
lock(client_lock
);
6781 tout(cct
) << __func__
<< std::endl
;
6782 tout(cct
) << target
<< std::endl
;
6783 tout(cct
) << relpath
<< std::endl
;
6788 if (std::string(relpath
) == "/")
6791 filepath
path(relpath
);
6792 string name
= path
.last_dentry();
6795 int r
= path_walk(path
, &dir
, perms
);
6798 if (cct
->_conf
->client_permissions
) {
6799 int r
= may_create(dir
.get(), perms
);
6803 return _symlink(dir
.get(), name
.c_str(), target
, perms
);
6806 int Client::readlink(const char *relpath
, char *buf
, loff_t size
, const UserPerm
& perms
)
6808 std::lock_guard
lock(client_lock
);
6809 tout(cct
) << __func__
<< std::endl
;
6810 tout(cct
) << relpath
<< std::endl
;
6815 filepath
path(relpath
);
6817 int r
= path_walk(path
, &in
, perms
, false);
6821 return _readlink(in
.get(), buf
, size
);
6824 int Client::_readlink(Inode
*in
, char *buf
, size_t size
)
6826 if (!in
->is_symlink())
6829 // copy into buf (at most size bytes)
6830 int r
= in
->symlink
.length();
6833 memcpy(buf
, in
->symlink
.c_str(), r
);
6840 int Client::_getattr(Inode
*in
, int mask
, const UserPerm
& perms
, bool force
)
6842 bool yes
= in
->caps_issued_mask(mask
, true);
6844 ldout(cct
, 10) << __func__
<< " mask " << ccap_string(mask
) << " issued=" << yes
<< dendl
;
6848 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
6850 in
->make_nosnap_relative_path(path
);
6851 req
->set_filepath(path
);
6853 req
->head
.args
.getattr
.mask
= mask
;
6855 int res
= make_request(req
, perms
);
6856 ldout(cct
, 10) << __func__
<< " result=" << res
<< dendl
;
6860 int Client::_do_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
6861 const UserPerm
& perms
, InodeRef
*inp
)
6863 int issued
= in
->caps_issued();
6865 ldout(cct
, 10) << __func__
<< " mask " << mask
<< " issued " <<
6866 ccap_string(issued
) << dendl
;
6868 if (in
->snapid
!= CEPH_NOSNAP
) {
6871 if ((mask
& CEPH_SETATTR_SIZE
) &&
6872 (unsigned long)stx
->stx_size
> in
->size
&&
6873 is_quota_bytes_exceeded(in
, (unsigned long)stx
->stx_size
- in
->size
,
6878 // make the change locally?
6879 if ((in
->cap_dirtier_uid
>= 0 && perms
.uid() != in
->cap_dirtier_uid
) ||
6880 (in
->cap_dirtier_gid
>= 0 && perms
.gid() != in
->cap_dirtier_gid
)) {
6881 ldout(cct
, 10) << __func__
<< " caller " << perms
.uid() << ":" << perms
.gid()
6882 << " != cap dirtier " << in
->cap_dirtier_uid
<< ":"
6883 << in
->cap_dirtier_gid
<< ", forcing sync setattr"
6886 * This works because we implicitly flush the caps as part of the
6887 * request, so the cap update check will happen with the writeback
6888 * cap context, and then the setattr check will happen with the
6891 * In reality this pattern is likely pretty rare (different users
6892 * setattr'ing the same file). If that turns out not to be the
6893 * case later, we can build a more complex pipelined cap writeback
6897 mask
|= CEPH_SETATTR_CTIME
;
6902 // caller just needs us to bump the ctime
6903 in
->ctime
= ceph_clock_now();
6904 in
->cap_dirtier_uid
= perms
.uid();
6905 in
->cap_dirtier_gid
= perms
.gid();
6906 if (issued
& CEPH_CAP_AUTH_EXCL
)
6907 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6908 else if (issued
& CEPH_CAP_FILE_EXCL
)
6909 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
6910 else if (issued
& CEPH_CAP_XATTR_EXCL
)
6911 in
->mark_caps_dirty(CEPH_CAP_XATTR_EXCL
);
6913 mask
|= CEPH_SETATTR_CTIME
;
6916 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
6917 bool kill_sguid
= mask
& (CEPH_SETATTR_SIZE
|CEPH_SETATTR_KILL_SGUID
);
6919 mask
&= ~CEPH_SETATTR_KILL_SGUID
;
6921 if (mask
& CEPH_SETATTR_UID
) {
6922 in
->ctime
= ceph_clock_now();
6923 in
->cap_dirtier_uid
= perms
.uid();
6924 in
->cap_dirtier_gid
= perms
.gid();
6925 in
->uid
= stx
->stx_uid
;
6926 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6927 mask
&= ~CEPH_SETATTR_UID
;
6929 ldout(cct
,10) << "changing uid to " << stx
->stx_uid
<< dendl
;
6931 if (mask
& CEPH_SETATTR_GID
) {
6932 in
->ctime
= ceph_clock_now();
6933 in
->cap_dirtier_uid
= perms
.uid();
6934 in
->cap_dirtier_gid
= perms
.gid();
6935 in
->gid
= stx
->stx_gid
;
6936 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6937 mask
&= ~CEPH_SETATTR_GID
;
6939 ldout(cct
,10) << "changing gid to " << stx
->stx_gid
<< dendl
;
6942 if (mask
& CEPH_SETATTR_MODE
) {
6943 in
->ctime
= ceph_clock_now();
6944 in
->cap_dirtier_uid
= perms
.uid();
6945 in
->cap_dirtier_gid
= perms
.gid();
6946 in
->mode
= (in
->mode
& ~07777) | (stx
->stx_mode
& 07777);
6947 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6948 mask
&= ~CEPH_SETATTR_MODE
;
6949 ldout(cct
,10) << "changing mode to " << stx
->stx_mode
<< dendl
;
6950 } else if (kill_sguid
&& S_ISREG(in
->mode
) && (in
->mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
6951 /* Must squash the any setuid/setgid bits with an ownership change */
6952 in
->mode
&= ~(S_ISUID
|S_ISGID
);
6953 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6956 if (mask
& CEPH_SETATTR_BTIME
) {
6957 in
->ctime
= ceph_clock_now();
6958 in
->cap_dirtier_uid
= perms
.uid();
6959 in
->cap_dirtier_gid
= perms
.gid();
6960 in
->btime
= utime_t(stx
->stx_btime
);
6961 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6962 mask
&= ~CEPH_SETATTR_BTIME
;
6963 ldout(cct
,10) << "changing btime to " << in
->btime
<< dendl
;
6965 } else if (mask
& CEPH_SETATTR_SIZE
) {
6966 /* If we don't have Ax, then we must ask the server to clear them on truncate */
6967 mask
|= CEPH_SETATTR_KILL_SGUID
;
6970 if (in
->caps_issued_mask(CEPH_CAP_FILE_EXCL
)) {
6971 if (mask
& (CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
)) {
6972 if (mask
& CEPH_SETATTR_MTIME
)
6973 in
->mtime
= utime_t(stx
->stx_mtime
);
6974 if (mask
& CEPH_SETATTR_ATIME
)
6975 in
->atime
= utime_t(stx
->stx_atime
);
6976 in
->ctime
= ceph_clock_now();
6977 in
->cap_dirtier_uid
= perms
.uid();
6978 in
->cap_dirtier_gid
= perms
.gid();
6979 in
->time_warp_seq
++;
6980 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
6981 mask
&= ~(CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
);
6990 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETATTR
);
6994 in
->make_nosnap_relative_path(path
);
6995 req
->set_filepath(path
);
6998 if (mask
& CEPH_SETATTR_KILL_SGUID
) {
6999 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7001 if (mask
& CEPH_SETATTR_MODE
) {
7002 req
->head
.args
.setattr
.mode
= stx
->stx_mode
;
7003 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7004 ldout(cct
,10) << "changing mode to " << stx
->stx_mode
<< dendl
;
7006 if (mask
& CEPH_SETATTR_UID
) {
7007 req
->head
.args
.setattr
.uid
= stx
->stx_uid
;
7008 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7009 ldout(cct
,10) << "changing uid to " << stx
->stx_uid
<< dendl
;
7011 if (mask
& CEPH_SETATTR_GID
) {
7012 req
->head
.args
.setattr
.gid
= stx
->stx_gid
;
7013 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7014 ldout(cct
,10) << "changing gid to " << stx
->stx_gid
<< dendl
;
7016 if (mask
& CEPH_SETATTR_BTIME
) {
7017 req
->head
.args
.setattr
.btime
= utime_t(stx
->stx_btime
);
7018 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7020 if (mask
& CEPH_SETATTR_MTIME
) {
7021 req
->head
.args
.setattr
.mtime
= utime_t(stx
->stx_mtime
);
7022 req
->inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
7025 if (mask
& CEPH_SETATTR_ATIME
) {
7026 req
->head
.args
.setattr
.atime
= utime_t(stx
->stx_atime
);
7027 req
->inode_drop
|= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_RD
|
7030 if (mask
& CEPH_SETATTR_SIZE
) {
7031 if ((unsigned long)stx
->stx_size
< mdsmap
->get_max_filesize()) {
7032 req
->head
.args
.setattr
.size
= stx
->stx_size
;
7033 ldout(cct
,10) << "changing size to " << stx
->stx_size
<< dendl
;
7036 ldout(cct
,10) << "unable to set size to " << stx
->stx_size
<< ". Too large!" << dendl
;
7039 req
->inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
7042 req
->head
.args
.setattr
.mask
= mask
;
7044 req
->regetattr_mask
= mask
;
7046 int res
= make_request(req
, perms
, inp
);
7047 ldout(cct
, 10) << "_setattr result=" << res
<< dendl
;
7051 /* Note that we only care about attrs that setattr cares about */
7052 void Client::stat_to_statx(struct stat
*st
, struct ceph_statx
*stx
)
7054 stx
->stx_size
= st
->st_size
;
7055 stx
->stx_mode
= st
->st_mode
;
7056 stx
->stx_uid
= st
->st_uid
;
7057 stx
->stx_gid
= st
->st_gid
;
7059 stx
->stx_mtime
= st
->st_mtimespec
;
7060 stx
->stx_atime
= st
->st_atimespec
;
7062 stx
->stx_mtime
= st
->st_mtim
;
7063 stx
->stx_atime
= st
->st_atim
;
7067 int Client::__setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
7068 const UserPerm
& perms
, InodeRef
*inp
)
7070 int ret
= _do_setattr(in
, stx
, mask
, perms
, inp
);
7073 if (mask
& CEPH_SETATTR_MODE
)
7074 ret
= _posix_acl_chmod(in
, stx
->stx_mode
, perms
);
7078 int Client::_setattrx(InodeRef
&in
, struct ceph_statx
*stx
, int mask
,
7079 const UserPerm
& perms
)
7081 mask
&= (CEPH_SETATTR_MODE
| CEPH_SETATTR_UID
|
7082 CEPH_SETATTR_GID
| CEPH_SETATTR_MTIME
|
7083 CEPH_SETATTR_ATIME
| CEPH_SETATTR_SIZE
|
7084 CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
);
7085 if (cct
->_conf
->client_permissions
) {
7086 int r
= may_setattr(in
.get(), stx
, mask
, perms
);
7090 return __setattrx(in
.get(), stx
, mask
, perms
);
7093 int Client::_setattr(InodeRef
&in
, struct stat
*attr
, int mask
,
7094 const UserPerm
& perms
)
7096 struct ceph_statx stx
;
7098 stat_to_statx(attr
, &stx
);
7099 mask
&= ~CEPH_SETATTR_BTIME
;
7101 if ((mask
& CEPH_SETATTR_UID
) && attr
->st_uid
== static_cast<uid_t
>(-1)) {
7102 mask
&= ~CEPH_SETATTR_UID
;
7104 if ((mask
& CEPH_SETATTR_GID
) && attr
->st_gid
== static_cast<uid_t
>(-1)) {
7105 mask
&= ~CEPH_SETATTR_GID
;
7108 return _setattrx(in
, &stx
, mask
, perms
);
7111 int Client::setattr(const char *relpath
, struct stat
*attr
, int mask
,
7112 const UserPerm
& perms
)
7114 std::lock_guard
lock(client_lock
);
7115 tout(cct
) << __func__
<< std::endl
;
7116 tout(cct
) << relpath
<< std::endl
;
7117 tout(cct
) << mask
<< std::endl
;
7122 filepath
path(relpath
);
7124 int r
= path_walk(path
, &in
, perms
);
7127 return _setattr(in
, attr
, mask
, perms
);
7130 int Client::setattrx(const char *relpath
, struct ceph_statx
*stx
, int mask
,
7131 const UserPerm
& perms
, int flags
)
7133 std::lock_guard
lock(client_lock
);
7134 tout(cct
) << __func__
<< std::endl
;
7135 tout(cct
) << relpath
<< std::endl
;
7136 tout(cct
) << mask
<< std::endl
;
7141 filepath
path(relpath
);
7143 int r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
));
7146 return _setattrx(in
, stx
, mask
, perms
);
7149 int Client::fsetattr(int fd
, struct stat
*attr
, int mask
, const UserPerm
& perms
)
7151 std::lock_guard
lock(client_lock
);
7152 tout(cct
) << __func__
<< std::endl
;
7153 tout(cct
) << fd
<< std::endl
;
7154 tout(cct
) << mask
<< std::endl
;
7159 Fh
*f
= get_filehandle(fd
);
7162 #if defined(__linux__) && defined(O_PATH)
7163 if (f
->flags
& O_PATH
)
7166 return _setattr(f
->inode
, attr
, mask
, perms
);
7169 int Client::fsetattrx(int fd
, struct ceph_statx
*stx
, int mask
, const UserPerm
& perms
)
7171 std::lock_guard
lock(client_lock
);
7172 tout(cct
) << __func__
<< std::endl
;
7173 tout(cct
) << fd
<< std::endl
;
7174 tout(cct
) << mask
<< std::endl
;
7179 Fh
*f
= get_filehandle(fd
);
7182 #if defined(__linux__) && defined(O_PATH)
7183 if (f
->flags
& O_PATH
)
7186 return _setattrx(f
->inode
, stx
, mask
, perms
);
7189 int Client::stat(const char *relpath
, struct stat
*stbuf
, const UserPerm
& perms
,
7190 frag_info_t
*dirstat
, int mask
)
7192 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7193 std::lock_guard
lock(client_lock
);
7194 tout(cct
) << "stat" << std::endl
;
7195 tout(cct
) << relpath
<< std::endl
;
7200 filepath
path(relpath
);
7202 int r
= path_walk(path
, &in
, perms
, true, mask
);
7205 r
= _getattr(in
, mask
, perms
);
7207 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7210 fill_stat(in
, stbuf
, dirstat
);
7211 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7215 unsigned Client::statx_to_mask(unsigned int flags
, unsigned int want
)
7219 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7220 if (flags
& AT_NO_ATTR_SYNC
)
7223 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7224 mask
|= CEPH_CAP_PIN
;
7225 if (want
& (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
7226 mask
|= CEPH_CAP_AUTH_SHARED
;
7227 if (want
& (CEPH_STATX_NLINK
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
7228 mask
|= CEPH_CAP_LINK_SHARED
;
7229 if (want
& (CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|CEPH_STATX_CTIME
|CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
|CEPH_STATX_VERSION
))
7230 mask
|= CEPH_CAP_FILE_SHARED
;
7231 if (want
& (CEPH_STATX_VERSION
|CEPH_STATX_CTIME
))
7232 mask
|= CEPH_CAP_XATTR_SHARED
;
7237 int Client::statx(const char *relpath
, struct ceph_statx
*stx
,
7238 const UserPerm
& perms
,
7239 unsigned int want
, unsigned int flags
)
7241 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " want " << want
<< ")" << dendl
;
7242 std::lock_guard
lock(client_lock
);
7243 tout(cct
) << "statx" << std::endl
;
7244 tout(cct
) << relpath
<< std::endl
;
7249 filepath
path(relpath
);
7252 unsigned mask
= statx_to_mask(flags
, want
);
7254 int r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
);
7258 r
= _getattr(in
, mask
, perms
);
7260 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7264 fill_statx(in
, mask
, stx
);
7265 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << stx
->stx_mask
<< ")" << dendl
;
7269 int Client::lstat(const char *relpath
, struct stat
*stbuf
,
7270 const UserPerm
& perms
, frag_info_t
*dirstat
, int mask
)
7272 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7273 std::lock_guard
lock(client_lock
);
7274 tout(cct
) << __func__
<< std::endl
;
7275 tout(cct
) << relpath
<< std::endl
;
7280 filepath
path(relpath
);
7282 // don't follow symlinks
7283 int r
= path_walk(path
, &in
, perms
, false, mask
);
7286 r
= _getattr(in
, mask
, perms
);
7288 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7291 fill_stat(in
, stbuf
, dirstat
);
7292 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7296 int Client::fill_stat(Inode
*in
, struct stat
*st
, frag_info_t
*dirstat
, nest_info_t
*rstat
)
7298 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
7299 << " mode 0" << oct
<< in
->mode
<< dec
7300 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
7301 memset(st
, 0, sizeof(struct stat
));
7302 if (use_faked_inos())
7303 st
->st_ino
= in
->faked_ino
;
7305 st
->st_ino
= in
->ino
;
7306 st
->st_dev
= in
->snapid
;
7307 st
->st_mode
= in
->mode
;
7308 st
->st_rdev
= in
->rdev
;
7310 switch (in
->nlink
) {
7312 st
->st_nlink
= 0; /* dir is unlinked */
7315 st
->st_nlink
= 1 /* parent dentry */
7317 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
7323 st
->st_nlink
= in
->nlink
;
7325 st
->st_uid
= in
->uid
;
7326 st
->st_gid
= in
->gid
;
7327 if (in
->ctime
> in
->mtime
) {
7328 stat_set_ctime_sec(st
, in
->ctime
.sec());
7329 stat_set_ctime_nsec(st
, in
->ctime
.nsec());
7331 stat_set_ctime_sec(st
, in
->mtime
.sec());
7332 stat_set_ctime_nsec(st
, in
->mtime
.nsec());
7334 stat_set_atime_sec(st
, in
->atime
.sec());
7335 stat_set_atime_nsec(st
, in
->atime
.nsec());
7336 stat_set_mtime_sec(st
, in
->mtime
.sec());
7337 stat_set_mtime_nsec(st
, in
->mtime
.nsec());
7339 if (cct
->_conf
->client_dirsize_rbytes
)
7340 st
->st_size
= in
->rstat
.rbytes
;
7342 st
->st_size
= in
->dirstat
.size();
7345 st
->st_size
= in
->size
;
7346 st
->st_blocks
= (in
->size
+ 511) >> 9;
7348 st
->st_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
7351 *dirstat
= in
->dirstat
;
7355 return in
->caps_issued();
7358 void Client::fill_statx(Inode
*in
, unsigned int mask
, struct ceph_statx
*stx
)
7360 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
7361 << " mode 0" << oct
<< in
->mode
<< dec
7362 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
7363 memset(stx
, 0, sizeof(struct ceph_statx
));
7366 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7367 * so that all bits are set.
7372 /* These are always considered to be available */
7373 stx
->stx_dev
= in
->snapid
;
7374 stx
->stx_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
7376 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7377 stx
->stx_mode
= S_IFMT
& in
->mode
;
7378 stx
->stx_ino
= use_faked_inos() ? in
->faked_ino
: (ino_t
)in
->ino
;
7379 stx
->stx_rdev
= in
->rdev
;
7380 stx
->stx_mask
|= (CEPH_STATX_INO
|CEPH_STATX_RDEV
);
7382 if (mask
& CEPH_CAP_AUTH_SHARED
) {
7383 stx
->stx_uid
= in
->uid
;
7384 stx
->stx_gid
= in
->gid
;
7385 stx
->stx_mode
= in
->mode
;
7386 in
->btime
.to_timespec(&stx
->stx_btime
);
7387 stx
->stx_mask
|= (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
);
7390 if (mask
& CEPH_CAP_LINK_SHARED
) {
7392 switch (in
->nlink
) {
7394 stx
->stx_nlink
= 0; /* dir is unlinked */
7397 stx
->stx_nlink
= 1 /* parent dentry */
7399 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
7405 stx
->stx_nlink
= in
->nlink
;
7407 stx
->stx_mask
|= CEPH_STATX_NLINK
;
7410 if (mask
& CEPH_CAP_FILE_SHARED
) {
7412 in
->atime
.to_timespec(&stx
->stx_atime
);
7413 in
->mtime
.to_timespec(&stx
->stx_mtime
);
7416 if (cct
->_conf
->client_dirsize_rbytes
)
7417 stx
->stx_size
= in
->rstat
.rbytes
;
7419 stx
->stx_size
= in
->dirstat
.size();
7420 stx
->stx_blocks
= 1;
7422 stx
->stx_size
= in
->size
;
7423 stx
->stx_blocks
= (in
->size
+ 511) >> 9;
7425 stx
->stx_mask
|= (CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|
7426 CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
);
7429 /* Change time and change_attr both require all shared caps to view */
7430 if ((mask
& CEPH_STAT_CAP_INODE_ALL
) == CEPH_STAT_CAP_INODE_ALL
) {
7431 stx
->stx_version
= in
->change_attr
;
7432 if (in
->ctime
> in
->mtime
)
7433 in
->ctime
.to_timespec(&stx
->stx_ctime
);
7435 in
->mtime
.to_timespec(&stx
->stx_ctime
);
7436 stx
->stx_mask
|= (CEPH_STATX_CTIME
|CEPH_STATX_VERSION
);
7441 void Client::touch_dn(Dentry
*dn
)
7446 int Client::chmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
7448 std::lock_guard
lock(client_lock
);
7449 tout(cct
) << __func__
<< std::endl
;
7450 tout(cct
) << relpath
<< std::endl
;
7451 tout(cct
) << mode
<< std::endl
;
7456 filepath
path(relpath
);
7458 int r
= path_walk(path
, &in
, perms
);
7462 attr
.st_mode
= mode
;
7463 return _setattr(in
, &attr
, CEPH_SETATTR_MODE
, perms
);
7466 int Client::fchmod(int fd
, mode_t mode
, const UserPerm
& perms
)
7468 std::lock_guard
lock(client_lock
);
7469 tout(cct
) << __func__
<< std::endl
;
7470 tout(cct
) << fd
<< std::endl
;
7471 tout(cct
) << mode
<< std::endl
;
7476 Fh
*f
= get_filehandle(fd
);
7479 #if defined(__linux__) && defined(O_PATH)
7480 if (f
->flags
& O_PATH
)
7484 attr
.st_mode
= mode
;
7485 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MODE
, perms
);
7488 int Client::lchmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
7490 std::lock_guard
lock(client_lock
);
7491 tout(cct
) << __func__
<< std::endl
;
7492 tout(cct
) << relpath
<< std::endl
;
7493 tout(cct
) << mode
<< std::endl
;
7498 filepath
path(relpath
);
7500 // don't follow symlinks
7501 int r
= path_walk(path
, &in
, perms
, false);
7505 attr
.st_mode
= mode
;
7506 return _setattr(in
, &attr
, CEPH_SETATTR_MODE
, perms
);
7509 int Client::chown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
7510 const UserPerm
& perms
)
7512 std::lock_guard
lock(client_lock
);
7513 tout(cct
) << __func__
<< std::endl
;
7514 tout(cct
) << relpath
<< std::endl
;
7515 tout(cct
) << new_uid
<< std::endl
;
7516 tout(cct
) << new_gid
<< std::endl
;
7521 filepath
path(relpath
);
7523 int r
= path_walk(path
, &in
, perms
);
7527 attr
.st_uid
= new_uid
;
7528 attr
.st_gid
= new_gid
;
7529 return _setattr(in
, &attr
, CEPH_SETATTR_UID
|CEPH_SETATTR_GID
, perms
);
7532 int Client::fchown(int fd
, uid_t new_uid
, gid_t new_gid
, const UserPerm
& perms
)
7534 std::lock_guard
lock(client_lock
);
7535 tout(cct
) << __func__
<< std::endl
;
7536 tout(cct
) << fd
<< std::endl
;
7537 tout(cct
) << new_uid
<< std::endl
;
7538 tout(cct
) << new_gid
<< std::endl
;
7543 Fh
*f
= get_filehandle(fd
);
7546 #if defined(__linux__) && defined(O_PATH)
7547 if (f
->flags
& O_PATH
)
7551 attr
.st_uid
= new_uid
;
7552 attr
.st_gid
= new_gid
;
7554 if (new_uid
!= static_cast<uid_t
>(-1)) mask
|= CEPH_SETATTR_UID
;
7555 if (new_gid
!= static_cast<gid_t
>(-1)) mask
|= CEPH_SETATTR_GID
;
7556 return _setattr(f
->inode
, &attr
, mask
, perms
);
7559 int Client::lchown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
7560 const UserPerm
& perms
)
7562 std::lock_guard
lock(client_lock
);
7563 tout(cct
) << __func__
<< std::endl
;
7564 tout(cct
) << relpath
<< std::endl
;
7565 tout(cct
) << new_uid
<< std::endl
;
7566 tout(cct
) << new_gid
<< std::endl
;
7571 filepath
path(relpath
);
7573 // don't follow symlinks
7574 int r
= path_walk(path
, &in
, perms
, false);
7578 attr
.st_uid
= new_uid
;
7579 attr
.st_gid
= new_gid
;
7581 if (new_uid
!= static_cast<uid_t
>(-1)) mask
|= CEPH_SETATTR_UID
;
7582 if (new_gid
!= static_cast<gid_t
>(-1)) mask
|= CEPH_SETATTR_GID
;
7583 return _setattr(in
, &attr
, mask
, perms
);
7586 static void attr_set_atime_and_mtime(struct stat
*attr
,
7587 const utime_t
&atime
,
7588 const utime_t
&mtime
)
7590 stat_set_atime_sec(attr
, atime
.tv
.tv_sec
);
7591 stat_set_atime_nsec(attr
, atime
.tv
.tv_nsec
);
7592 stat_set_mtime_sec(attr
, mtime
.tv
.tv_sec
);
7593 stat_set_mtime_nsec(attr
, mtime
.tv
.tv_nsec
);
7596 // for [l]utime() invoke the timeval variant as the timespec
7597 // variant are not yet implemented. for futime[s](), invoke
7598 // the timespec variant.
7599 int Client::utime(const char *relpath
, struct utimbuf
*buf
,
7600 const UserPerm
& perms
)
7602 struct timeval tv
[2];
7603 tv
[0].tv_sec
= buf
->actime
;
7605 tv
[1].tv_sec
= buf
->modtime
;
7608 return utimes(relpath
, tv
, perms
);
7611 int Client::lutime(const char *relpath
, struct utimbuf
*buf
,
7612 const UserPerm
& perms
)
7614 struct timeval tv
[2];
7615 tv
[0].tv_sec
= buf
->actime
;
7617 tv
[1].tv_sec
= buf
->modtime
;
7620 return lutimes(relpath
, tv
, perms
);
7623 int Client::futime(int fd
, struct utimbuf
*buf
, const UserPerm
& perms
)
7625 struct timespec ts
[2];
7626 ts
[0].tv_sec
= buf
->actime
;
7628 ts
[1].tv_sec
= buf
->modtime
;
7631 return futimens(fd
, ts
, perms
);
7634 int Client::utimes(const char *relpath
, struct timeval times
[2],
7635 const UserPerm
& perms
)
7637 std::lock_guard
lock(client_lock
);
7638 tout(cct
) << __func__
<< std::endl
;
7639 tout(cct
) << relpath
<< std::endl
;
7640 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
7642 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
7648 filepath
path(relpath
);
7650 int r
= path_walk(path
, &in
, perms
);
7654 utime_t
atime(times
[0]);
7655 utime_t
mtime(times
[1]);
7657 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
7658 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
7661 int Client::lutimes(const char *relpath
, struct timeval times
[2],
7662 const UserPerm
& perms
)
7664 std::lock_guard
lock(client_lock
);
7665 tout(cct
) << __func__
<< std::endl
;
7666 tout(cct
) << relpath
<< std::endl
;
7667 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
7669 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
7675 filepath
path(relpath
);
7677 int r
= path_walk(path
, &in
, perms
, false);
7681 utime_t
atime(times
[0]);
7682 utime_t
mtime(times
[1]);
7684 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
7685 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
7688 int Client::futimes(int fd
, struct timeval times
[2], const UserPerm
& perms
)
7690 struct timespec ts
[2];
7691 ts
[0].tv_sec
= times
[0].tv_sec
;
7692 ts
[0].tv_nsec
= times
[0].tv_usec
* 1000;
7693 ts
[1].tv_sec
= times
[1].tv_sec
;
7694 ts
[1].tv_nsec
= times
[1].tv_usec
* 1000;
7696 return futimens(fd
, ts
, perms
);
7699 int Client::futimens(int fd
, struct timespec times
[2], const UserPerm
& perms
)
7701 std::lock_guard
lock(client_lock
);
7702 tout(cct
) << __func__
<< std::endl
;
7703 tout(cct
) << fd
<< std::endl
;
7704 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_nsec
7706 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_nsec
7712 Fh
*f
= get_filehandle(fd
);
7715 #if defined(__linux__) && defined(O_PATH)
7716 if (f
->flags
& O_PATH
)
7720 utime_t
atime(times
[0]);
7721 utime_t
mtime(times
[1]);
7723 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
7724 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
7727 int Client::flock(int fd
, int operation
, uint64_t owner
)
7729 std::lock_guard
lock(client_lock
);
7730 tout(cct
) << __func__
<< std::endl
;
7731 tout(cct
) << fd
<< std::endl
;
7732 tout(cct
) << operation
<< std::endl
;
7733 tout(cct
) << owner
<< std::endl
;
7738 Fh
*f
= get_filehandle(fd
);
7742 return _flock(f
, operation
, owner
);
7745 int Client::opendir(const char *relpath
, dir_result_t
**dirpp
, const UserPerm
& perms
)
7747 std::lock_guard
lock(client_lock
);
7748 tout(cct
) << __func__
<< std::endl
;
7749 tout(cct
) << relpath
<< std::endl
;
7754 filepath
path(relpath
);
7756 int r
= path_walk(path
, &in
, perms
, true);
7759 if (cct
->_conf
->client_permissions
) {
7760 int r
= may_open(in
.get(), O_RDONLY
, perms
);
7764 r
= _opendir(in
.get(), dirpp
, perms
);
7765 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7767 tout(cct
) << (unsigned long)*dirpp
<< std::endl
;
7771 int Client::_opendir(Inode
*in
, dir_result_t
**dirpp
, const UserPerm
& perms
)
7775 *dirpp
= new dir_result_t(in
, perms
);
7776 opened_dirs
.insert(*dirpp
);
7777 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ") = " << 0 << " (" << *dirpp
<< ")" << dendl
;
7782 int Client::closedir(dir_result_t
*dir
)
7784 std::lock_guard
lock(client_lock
);
7785 tout(cct
) << __func__
<< std::endl
;
7786 tout(cct
) << (unsigned long)dir
<< std::endl
;
7788 ldout(cct
, 3) << __func__
<< "(" << dir
<< ") = 0" << dendl
;
7793 void Client::_closedir(dir_result_t
*dirp
)
7795 ldout(cct
, 10) << __func__
<< "(" << dirp
<< ")" << dendl
;
7797 ldout(cct
, 10) << __func__
<< " detaching inode " << dirp
->inode
<< dendl
;
7798 dirp
->inode
.reset();
7800 _readdir_drop_dirp_buffer(dirp
);
7801 opened_dirs
.erase(dirp
);
7805 void Client::rewinddir(dir_result_t
*dirp
)
7807 std::lock_guard
lock(client_lock
);
7808 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ")" << dendl
;
7813 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
7814 _readdir_drop_dirp_buffer(d
);
7818 loff_t
Client::telldir(dir_result_t
*dirp
)
7820 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
7821 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ") = " << d
->offset
<< dendl
;
7825 void Client::seekdir(dir_result_t
*dirp
, loff_t offset
)
7827 std::lock_guard
lock(client_lock
);
7829 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ", " << offset
<< ")" << dendl
;
7834 if (offset
== dirp
->offset
)
7837 if (offset
> dirp
->offset
)
7838 dirp
->release_count
= 0; // bump if we do a forward seek
7840 dirp
->ordered_count
= 0; // disable filling readdir cache
7842 if (dirp
->hash_order()) {
7843 if (dirp
->offset
> offset
) {
7844 _readdir_drop_dirp_buffer(dirp
);
7849 dirp
->buffer_frag
!= frag_t(dir_result_t::fpos_high(offset
)) ||
7850 dirp
->offset_low() > dir_result_t::fpos_low(offset
)) {
7851 _readdir_drop_dirp_buffer(dirp
);
7856 dirp
->offset
= offset
;
7861 // ino_t d_ino; /* inode number */
7862 // off_t d_off; /* offset to the next dirent */
7863 // unsigned short d_reclen; /* length of this record */
7864 // unsigned char d_type; /* type of file */
7865 // char d_name[256]; /* filename */
7867 void Client::fill_dirent(struct dirent
*de
, const char *name
, int type
, uint64_t ino
, loff_t next_off
)
7869 strncpy(de
->d_name
, name
, 255);
7870 de
->d_name
[255] = '\0';
7873 #if !defined(__APPLE__) && !defined(__FreeBSD__)
7874 de
->d_off
= next_off
;
7877 de
->d_type
= IFTODT(type
);
7878 ldout(cct
, 10) << __func__
<< " '" << de
->d_name
<< "' -> " << inodeno_t(de
->d_ino
)
7879 << " type " << (int)de
->d_type
<< " w/ next_off " << hex
<< next_off
<< dec
<< dendl
;
7883 void Client::_readdir_next_frag(dir_result_t
*dirp
)
7885 frag_t fg
= dirp
->buffer_frag
;
7887 if (fg
.is_rightmost()) {
7888 ldout(cct
, 10) << __func__
<< " advance from " << fg
<< " to END" << dendl
;
7895 ldout(cct
, 10) << __func__
<< " advance from " << dirp
->buffer_frag
<< " to " << fg
<< dendl
;
7897 if (dirp
->hash_order()) {
7899 int64_t new_offset
= dir_result_t::make_fpos(fg
.value(), 2, true);
7900 if (dirp
->offset
< new_offset
) // don't decrease offset
7901 dirp
->offset
= new_offset
;
7903 dirp
->last_name
.clear();
7904 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
7905 _readdir_rechoose_frag(dirp
);
7909 void Client::_readdir_rechoose_frag(dir_result_t
*dirp
)
7911 ceph_assert(dirp
->inode
);
7913 if (dirp
->hash_order())
7916 frag_t cur
= frag_t(dirp
->offset_high());
7917 frag_t fg
= dirp
->inode
->dirfragtree
[cur
.value()];
7919 ldout(cct
, 10) << __func__
<< " frag " << cur
<< " maps to " << fg
<< dendl
;
7920 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
7921 dirp
->last_name
.clear();
7922 dirp
->next_offset
= 2;
7926 void Client::_readdir_drop_dirp_buffer(dir_result_t
*dirp
)
7928 ldout(cct
, 10) << __func__
<< " " << dirp
<< dendl
;
7929 dirp
->buffer
.clear();
7932 int Client::_readdir_get_frag(dir_result_t
*dirp
)
7935 ceph_assert(dirp
->inode
);
7937 // get the current frag.
7939 if (dirp
->hash_order())
7940 fg
= dirp
->inode
->dirfragtree
[dirp
->offset_high()];
7942 fg
= frag_t(dirp
->offset_high());
7944 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " fg " << fg
7945 << " offset " << hex
<< dirp
->offset
<< dec
<< dendl
;
7947 int op
= CEPH_MDS_OP_READDIR
;
7948 if (dirp
->inode
&& dirp
->inode
->snapid
== CEPH_SNAPDIR
)
7949 op
= CEPH_MDS_OP_LSSNAP
;
7951 InodeRef
& diri
= dirp
->inode
;
7953 MetaRequest
*req
= new MetaRequest(op
);
7955 diri
->make_nosnap_relative_path(path
);
7956 req
->set_filepath(path
);
7957 req
->set_inode(diri
.get());
7958 req
->head
.args
.readdir
.frag
= fg
;
7959 req
->head
.args
.readdir
.flags
= CEPH_READDIR_REPLY_BITFLAGS
;
7960 if (dirp
->last_name
.length()) {
7961 req
->path2
.set_path(dirp
->last_name
);
7962 } else if (dirp
->hash_order()) {
7963 req
->head
.args
.readdir
.offset_hash
= dirp
->offset_high();
7968 int res
= make_request(req
, dirp
->perms
, NULL
, NULL
, -1, &dirbl
);
7970 if (res
== -EAGAIN
) {
7971 ldout(cct
, 10) << __func__
<< " got EAGAIN, retrying" << dendl
;
7972 _readdir_rechoose_frag(dirp
);
7973 return _readdir_get_frag(dirp
);
7977 ldout(cct
, 10) << __func__
<< " " << dirp
<< " got frag " << dirp
->buffer_frag
7978 << " size " << dirp
->buffer
.size() << dendl
;
7980 ldout(cct
, 10) << __func__
<< " got error " << res
<< ", setting end flag" << dendl
;
7987 struct dentry_off_lt
{
7988 bool operator()(const Dentry
* dn
, int64_t off
) const {
7989 return dir_result_t::fpos_cmp(dn
->offset
, off
) < 0;
7993 int Client::_readdir_cache_cb(dir_result_t
*dirp
, add_dirent_cb_t cb
, void *p
,
7994 int caps
, bool getref
)
7996 ceph_assert(ceph_mutex_is_locked(client_lock
));
7997 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
7998 << " last_name " << dirp
->last_name
<< " offset " << hex
<< dirp
->offset
<< dec
8000 Dir
*dir
= dirp
->inode
->dir
;
8003 ldout(cct
, 10) << " dir is empty" << dendl
;
8008 vector
<Dentry
*>::iterator pd
= std::lower_bound(dir
->readdir_cache
.begin(),
8009 dir
->readdir_cache
.end(),
8010 dirp
->offset
, dentry_off_lt());
8014 if (!dirp
->inode
->is_complete_and_ordered())
8016 if (pd
== dir
->readdir_cache
.end())
8019 if (dn
->inode
== NULL
) {
8020 ldout(cct
, 15) << " skipping null '" << dn
->name
<< "'" << dendl
;
8024 if (dn
->cap_shared_gen
!= dir
->parent_inode
->shared_gen
) {
8025 ldout(cct
, 15) << " skipping mismatch shared gen '" << dn
->name
<< "'" << dendl
;
8030 int idx
= pd
- dir
->readdir_cache
.begin();
8031 int r
= _getattr(dn
->inode
, caps
, dirp
->perms
);
8035 // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
8036 pd
= dir
->readdir_cache
.begin() + idx
;
8037 if (pd
>= dir
->readdir_cache
.end() || *pd
!= dn
)
8040 struct ceph_statx stx
;
8042 fill_statx(dn
->inode
, caps
, &stx
);
8044 uint64_t next_off
= dn
->offset
+ 1;
8045 fill_dirent(&de
, dn
->name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
8047 if (pd
== dir
->readdir_cache
.end())
8048 next_off
= dir_result_t::END
;
8052 in
= dn
->inode
.get();
8056 dn_name
= dn
->name
; // fill in name while we have lock
8058 client_lock
.unlock();
8059 r
= cb(p
, &de
, &stx
, next_off
, in
); // _next_ offset
8061 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< dn
->offset
<< dec
8062 << " = " << r
<< dendl
;
8067 dirp
->offset
= next_off
;
8069 dirp
->next_offset
= 2;
8071 dirp
->next_offset
= dirp
->offset_low();
8072 dirp
->last_name
= dn_name
; // we successfully returned this one; update!
8073 dirp
->release_count
= 0; // last_name no longer match cache index
8078 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " at end" << dendl
;
8083 int Client::readdir_r_cb(dir_result_t
*d
, add_dirent_cb_t cb
, void *p
,
8084 unsigned want
, unsigned flags
, bool getref
)
8086 int caps
= statx_to_mask(flags
, want
);
8088 std::lock_guard
lock(client_lock
);
8093 dir_result_t
*dirp
= static_cast<dir_result_t
*>(d
);
8095 ldout(cct
, 10) << __func__
<< " " << *dirp
->inode
<< " offset " << hex
<< dirp
->offset
8096 << dec
<< " at_end=" << dirp
->at_end()
8097 << " hash_order=" << dirp
->hash_order() << dendl
;
8100 struct ceph_statx stx
;
8101 memset(&de
, 0, sizeof(de
));
8102 memset(&stx
, 0, sizeof(stx
));
8104 InodeRef
& diri
= dirp
->inode
;
8109 if (dirp
->offset
== 0) {
8110 ldout(cct
, 15) << " including ." << dendl
;
8111 ceph_assert(diri
->dentries
.size() < 2); // can't have multiple hard-links to a dir
8112 uint64_t next_off
= 1;
8115 r
= _getattr(diri
, caps
, dirp
->perms
);
8119 fill_statx(diri
, caps
, &stx
);
8120 fill_dirent(&de
, ".", S_IFDIR
, stx
.stx_ino
, next_off
);
8122 Inode
*inode
= NULL
;
8128 client_lock
.unlock();
8129 r
= cb(p
, &de
, &stx
, next_off
, inode
);
8134 dirp
->offset
= next_off
;
8138 if (dirp
->offset
== 1) {
8139 ldout(cct
, 15) << " including .." << dendl
;
8140 uint64_t next_off
= 2;
8142 if (diri
->dentries
.empty())
8145 in
= diri
->get_first_parent()->dir
->parent_inode
;
8148 r
= _getattr(in
, caps
, dirp
->perms
);
8152 fill_statx(in
, caps
, &stx
);
8153 fill_dirent(&de
, "..", S_IFDIR
, stx
.stx_ino
, next_off
);
8155 Inode
*inode
= NULL
;
8161 client_lock
.unlock();
8162 r
= cb(p
, &de
, &stx
, next_off
, inode
);
8167 dirp
->offset
= next_off
;
8172 // can we read from our cache?
8173 ldout(cct
, 10) << "offset " << hex
<< dirp
->offset
<< dec
8174 << " snapid " << dirp
->inode
->snapid
<< " (complete && ordered) "
8175 << dirp
->inode
->is_complete_and_ordered()
8176 << " issued " << ccap_string(dirp
->inode
->caps_issued())
8178 if (dirp
->inode
->snapid
!= CEPH_SNAPDIR
&&
8179 dirp
->inode
->is_complete_and_ordered() &&
8180 dirp
->inode
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
8181 int err
= _readdir_cache_cb(dirp
, cb
, p
, caps
, getref
);
8190 bool check_caps
= true;
8191 if (!dirp
->is_cached()) {
8192 int r
= _readdir_get_frag(dirp
);
8195 // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
8196 // different than the requested one. (our dirfragtree was outdated)
8199 frag_t fg
= dirp
->buffer_frag
;
8201 ldout(cct
, 10) << "frag " << fg
<< " buffer size " << dirp
->buffer
.size()
8202 << " offset " << hex
<< dirp
->offset
<< dendl
;
8204 for (auto it
= std::lower_bound(dirp
->buffer
.begin(), dirp
->buffer
.end(),
8205 dirp
->offset
, dir_result_t::dentry_off_lt());
8206 it
!= dirp
->buffer
.end();
8208 dir_result_t::dentry
&entry
= *it
;
8210 uint64_t next_off
= entry
.offset
+ 1;
8214 r
= _getattr(entry
.inode
, caps
, dirp
->perms
);
8219 fill_statx(entry
.inode
, caps
, &stx
);
8220 fill_dirent(&de
, entry
.name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
8222 Inode
*inode
= NULL
;
8224 inode
= entry
.inode
.get();
8228 client_lock
.unlock();
8229 r
= cb(p
, &de
, &stx
, next_off
, inode
); // _next_ offset
8232 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< next_off
- 1 << dec
8233 << " = " << r
<< dendl
;
8237 dirp
->offset
= next_off
;
8242 if (dirp
->next_offset
> 2) {
8243 ldout(cct
, 10) << " fetching next chunk of this frag" << dendl
;
8244 _readdir_drop_dirp_buffer(dirp
);
8248 if (!fg
.is_rightmost()) {
8250 _readdir_next_frag(dirp
);
8254 if (diri
->shared_gen
== dirp
->start_shared_gen
&&
8255 diri
->dir_release_count
== dirp
->release_count
) {
8256 if (diri
->dir_ordered_count
== dirp
->ordered_count
) {
8257 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
8259 ceph_assert(diri
->dir
->readdir_cache
.size() >= dirp
->cache_index
);
8260 diri
->dir
->readdir_cache
.resize(dirp
->cache_index
);
8262 diri
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
8264 ldout(cct
, 10) << " marking I_COMPLETE on " << *diri
<< dendl
;
8265 diri
->flags
|= I_COMPLETE
;
8277 int Client::readdir_r(dir_result_t
*d
, struct dirent
*de
)
8279 return readdirplus_r(d
, de
, 0, 0, 0, NULL
);
8286 * 1 if we got a dirent
8287 * 0 for end of directory
8291 struct single_readdir
{
8293 struct ceph_statx
*stx
;
8298 static int _readdir_single_dirent_cb(void *p
, struct dirent
*de
,
8299 struct ceph_statx
*stx
, off_t off
,
8302 single_readdir
*c
= static_cast<single_readdir
*>(p
);
8305 return -1; // already filled this dirent
8315 struct dirent
*Client::readdir(dir_result_t
*d
)
8318 static struct dirent de
;
8325 // our callback fills the dirent and sets sr.full=true on first
8326 // call, and returns -1 the second time around.
8327 ret
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
);
8329 errno
= -ret
; // this sucks.
8330 return (dirent
*) NULL
;
8335 return (dirent
*) NULL
;
8338 int Client::readdirplus_r(dir_result_t
*d
, struct dirent
*de
,
8339 struct ceph_statx
*stx
, unsigned want
,
8340 unsigned flags
, Inode
**out
)
8348 // our callback fills the dirent and sets sr.full=true on first
8349 // call, and returns -1 the second time around.
8350 int r
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
, want
, flags
, out
);
8362 struct getdents_result
{
8369 static int _readdir_getdent_cb(void *p
, struct dirent
*de
,
8370 struct ceph_statx
*stx
, off_t off
, Inode
*in
)
8372 struct getdents_result
*c
= static_cast<getdents_result
*>(p
);
8378 dlen
= strlen(de
->d_name
) + 1;
8380 if (c
->pos
+ dlen
> c
->buflen
)
8381 return -1; // doesn't fit
8384 memcpy(c
->buf
+ c
->pos
, de
, sizeof(*de
));
8386 memcpy(c
->buf
+ c
->pos
, de
->d_name
, dlen
);
8392 int Client::_getdents(dir_result_t
*dir
, char *buf
, int buflen
, bool fullent
)
8397 gr
.fullent
= fullent
;
8400 int r
= readdir_r_cb(dir
, _readdir_getdent_cb
, (void *)&gr
);
8402 if (r
< 0) { // some error
8403 if (r
== -1) { // buffer ran out of space
8404 if (gr
.pos
) { // but we got some entries already!
8406 } // or we need a larger buffer
8408 } else { // actual error, return it
8417 struct getdir_result
{
8418 list
<string
> *contents
;
8422 static int _getdir_cb(void *p
, struct dirent
*de
, struct ceph_statx
*stx
, off_t off
, Inode
*in
)
8424 getdir_result
*r
= static_cast<getdir_result
*>(p
);
8426 r
->contents
->push_back(de
->d_name
);
8431 int Client::getdir(const char *relpath
, list
<string
>& contents
,
8432 const UserPerm
& perms
)
8434 ldout(cct
, 3) << "getdir(" << relpath
<< ")" << dendl
;
8436 std::lock_guard
lock(client_lock
);
8437 tout(cct
) << "getdir" << std::endl
;
8438 tout(cct
) << relpath
<< std::endl
;
8442 int r
= opendir(relpath
, &d
, perms
);
8447 gr
.contents
= &contents
;
8449 r
= readdir_r_cb(d
, _getdir_cb
, (void *)&gr
);
8459 /****** file i/o **********/
8460 int Client::open(const char *relpath
, int flags
, const UserPerm
& perms
,
8461 mode_t mode
, int stripe_unit
, int stripe_count
,
8462 int object_size
, const char *data_pool
)
8464 ldout(cct
, 3) << "open enter(" << relpath
<< ", " << ceph_flags_sys2wire(flags
) << "," << mode
<< ")" << dendl
;
8465 std::lock_guard
lock(client_lock
);
8466 tout(cct
) << "open" << std::endl
;
8467 tout(cct
) << relpath
<< std::endl
;
8468 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
8475 #if defined(__linux__) && defined(O_PATH)
8476 /* When the O_PATH is being specified, others flags than O_DIRECTORY
8477 * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
8478 * in kernel (fs/open.c). */
8480 flags
&= O_DIRECTORY
| O_NOFOLLOW
| O_PATH
;
8483 filepath
path(relpath
);
8485 bool created
= false;
8486 /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
8487 bool followsym
= !((flags
& O_NOFOLLOW
) || ((flags
& O_CREAT
) && (flags
& O_EXCL
)));
8488 int r
= path_walk(path
, &in
, perms
, followsym
, ceph_caps_for_mode(mode
));
8490 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
8493 #if defined(__linux__) && defined(O_PATH)
8494 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
) && !(flags
& O_PATH
))
8496 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
))
8500 if (r
== -ENOENT
&& (flags
& O_CREAT
)) {
8501 filepath dirpath
= path
;
8502 string dname
= dirpath
.last_dentry();
8503 dirpath
.pop_dentry();
8505 r
= path_walk(dirpath
, &dir
, perms
, true,
8506 cct
->_conf
->client_permissions
? CEPH_CAP_AUTH_SHARED
: 0);
8509 if (cct
->_conf
->client_permissions
) {
8510 r
= may_create(dir
.get(), perms
);
8514 r
= _create(dir
.get(), dname
.c_str(), flags
, mode
, &in
, &fh
, stripe_unit
,
8515 stripe_count
, object_size
, data_pool
, &created
, perms
);
8521 // posix says we can only check permissions of existing files
8522 if (cct
->_conf
->client_permissions
) {
8523 r
= may_open(in
.get(), flags
, perms
);
8530 r
= _open(in
.get(), flags
, mode
, &fh
, perms
);
8532 // allocate a integer file descriptor
8535 ceph_assert(fd_map
.count(r
) == 0);
8540 tout(cct
) << r
<< std::endl
;
8541 ldout(cct
, 3) << "open exit(" << path
<< ", " << ceph_flags_sys2wire(flags
) << ") = " << r
<< dendl
;
// Convenience overload of Client::open(): forwards to the long-form
// open() with stripe_unit = 0, stripe_count = 0, object_size = 0 and
// no data pool, i.e. the default file striping parameters.
// NOTE(review): this chunk is a fragmented extraction; the enclosing
// braces of the function body are elided from view.
8545 int Client::open(const char *relpath
, int flags
, const UserPerm
& perms
, mode_t mode
)
8547 /* Use default file striping parameters */
8548 return open(relpath
, flags
, perms
, mode
, 0, 0, 0, NULL
);
8551 int Client::lookup_hash(inodeno_t ino
, inodeno_t dirino
, const char *name
,
8552 const UserPerm
& perms
)
8554 std::lock_guard
lock(client_lock
);
8555 ldout(cct
, 3) << __func__
<< " enter(" << ino
<< ", #" << dirino
<< "/" << name
<< ")" << dendl
;
8560 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPHASH
);
8562 req
->set_filepath(path
);
8564 uint32_t h
= ceph_str_hash(CEPH_STR_HASH_RJENKINS
, name
, strlen(name
));
8566 sprintf(f
, "%u", h
);
8567 filepath
path2(dirino
);
8568 path2
.push_dentry(string(f
));
8569 req
->set_filepath2(path2
);
8571 int r
= make_request(req
, perms
, NULL
, NULL
,
8572 rand() % mdsmap
->get_num_in_mds());
8573 ldout(cct
, 3) << __func__
<< " exit(" << ino
<< ", #" << dirino
<< "/" << name
<< ") = " << r
<< dendl
;
8579 * Load inode into local cache.
8581 * If inode pointer is non-NULL, and take a reference on
8582 * the resulting Inode object in one operation, so that caller
8583 * can safely assume inode will still be there after return.
8585 int Client::_lookup_ino(inodeno_t ino
, const UserPerm
& perms
, Inode
**inode
)
8587 ldout(cct
, 8) << __func__
<< " enter(" << ino
<< ")" << dendl
;
8592 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPINO
);
8594 req
->set_filepath(path
);
8596 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
8597 if (r
== 0 && inode
!= NULL
) {
8598 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
8599 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
8600 ceph_assert(p
!= inode_map
.end());
8604 ldout(cct
, 8) << __func__
<< " exit(" << ino
<< ") = " << r
<< dendl
;
// Thread-safe public wrapper: acquires client_lock for the duration of
// the call, then delegates to the unlocked _lookup_ino() helper.
// On success (r == 0) and a non-NULL `inode` pointer, _lookup_ino()
// hands back a referenced Inode* (per its header comment above).
// NOTE(review): fragmented extraction — the function braces are elided.
8608 int Client::lookup_ino(inodeno_t ino
, const UserPerm
& perms
, Inode
**inode
)
8610 std::lock_guard
lock(client_lock
);
8611 return _lookup_ino(ino
, perms
, inode
);
8615 * Find the parent inode of `ino` and insert it into
8616 * our cache. Conditionally also set `parent` to a referenced
8617 * Inode* if caller provides non-NULL value.
8619 int Client::_lookup_parent(Inode
*ino
, const UserPerm
& perms
, Inode
**parent
)
8621 ldout(cct
, 8) << __func__
<< " enter(" << ino
->ino
<< ")" << dendl
;
8623 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
8624 filepath
path(ino
->ino
);
8625 req
->set_filepath(path
);
8628 int r
= make_request(req
, perms
, &target
, NULL
, rand() % mdsmap
->get_num_in_mds());
8629 // Give caller a reference to the parent ino if they provided a pointer.
8630 if (parent
!= NULL
) {
8632 *parent
= target
.get();
8634 ldout(cct
, 8) << __func__
<< " found parent " << (*parent
)->ino
<< dendl
;
8639 ldout(cct
, 8) << __func__
<< " exit(" << ino
->ino
<< ") = " << r
<< dendl
;
8644 * Populate the parent dentry for `ino`, provided it is
8645 * a child of `parent`.
8647 int Client::_lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
8649 ceph_assert(parent
->is_dir());
8650 ldout(cct
, 3) << __func__
<< " enter(" << ino
->ino
<< ")" << dendl
;
8655 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
8656 req
->set_filepath2(filepath(parent
->ino
));
8657 req
->set_filepath(filepath(ino
->ino
));
8658 req
->set_inode(ino
);
8660 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
8661 ldout(cct
, 3) << __func__
<< " exit(" << ino
->ino
<< ") = " << r
<< dendl
;
// Thread-safe public wrapper: takes client_lock, then delegates to the
// unlocked _lookup_name() helper, which populates the parent dentry of
// `ino` provided it is a child of `parent`.
// NOTE(review): fragmented extraction — the function braces are elided.
8665 int Client::lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
8667 std::lock_guard
lock(client_lock
);
8668 return _lookup_name(ino
, parent
, perms
);
8671 Fh
*Client::_create_fh(Inode
*in
, int flags
, int cmode
, const UserPerm
& perms
)
8674 Fh
*f
= new Fh(in
, flags
, cmode
, perms
);
8676 ldout(cct
, 10) << __func__
<< " " << in
->ino
<< " mode " << cmode
<< dendl
;
8678 if (in
->snapid
!= CEPH_NOSNAP
) {
8679 in
->snap_cap_refs
++;
8680 ldout(cct
, 5) << "open success, fh is " << f
<< " combined IMMUTABLE SNAP caps "
8681 << ccap_string(in
->caps_issued()) << dendl
;
8684 const auto& conf
= cct
->_conf
;
8685 f
->readahead
.set_trigger_requests(1);
8686 f
->readahead
.set_min_readahead_size(conf
->client_readahead_min
);
8687 uint64_t max_readahead
= Readahead::NO_LIMIT
;
8688 if (conf
->client_readahead_max_bytes
) {
8689 max_readahead
= std::min(max_readahead
, (uint64_t)conf
->client_readahead_max_bytes
);
8691 if (conf
->client_readahead_max_periods
) {
8692 max_readahead
= std::min(max_readahead
, in
->layout
.get_period()*(uint64_t)conf
->client_readahead_max_periods
);
8694 f
->readahead
.set_max_readahead_size(max_readahead
);
8695 vector
<uint64_t> alignments
;
8696 alignments
.push_back(in
->layout
.get_period());
8697 alignments
.push_back(in
->layout
.stripe_unit
);
8698 f
->readahead
.set_alignments(alignments
);
8703 int Client::_release_fh(Fh
*f
)
8705 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
8706 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
8707 Inode
*in
= f
->inode
.get();
8708 ldout(cct
, 8) << __func__
<< " " << f
<< " mode " << f
->mode
<< " on " << *in
<< dendl
;
8712 if (in
->snapid
== CEPH_NOSNAP
) {
8713 if (in
->put_open_ref(f
->mode
)) {
8714 _flush(in
, new C_Client_FlushComplete(this, in
));
8718 ceph_assert(in
->snap_cap_refs
> 0);
8719 in
->snap_cap_refs
--;
8722 _release_filelocks(f
);
8724 // Finally, read any async err (i.e. from flushes)
8725 int err
= f
->take_async_err();
8727 ldout(cct
, 1) << __func__
<< " " << f
<< " on inode " << *in
<< " caught async_err = "
8728 << cpp_strerror(err
) << dendl
;
8730 ldout(cct
, 10) << __func__
<< " " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
// Drop one reference on the file handle. Only the f->put() call is
// visible in this fragment; the use of the resulting count (`left`) —
// presumably deleting/releasing the Fh when it reaches zero — is
// elided from view. TODO(review): confirm against the full source.
8738 void Client::_put_fh(Fh
*f
)
8740 int left
= f
->put();
8746 int Client::_open(Inode
*in
, int flags
, mode_t mode
, Fh
**fhp
,
8747 const UserPerm
& perms
)
8749 if (in
->snapid
!= CEPH_NOSNAP
&&
8750 (flags
& (O_WRONLY
| O_RDWR
| O_CREAT
| O_TRUNC
| O_APPEND
))) {
8754 // use normalized flags to generate cmode
8755 int cflags
= ceph_flags_sys2wire(flags
);
8756 if (cct
->_conf
.get_val
<bool>("client_force_lazyio"))
8757 cflags
|= CEPH_O_LAZY
;
8759 int cmode
= ceph_flags_to_mode(cflags
);
8760 int want
= ceph_caps_for_mode(cmode
);
8763 in
->get_open_ref(cmode
); // make note of pending open, since it effects _wanted_ caps.
8765 if ((flags
& O_TRUNC
) == 0 && in
->caps_issued_mask(want
)) {
8767 check_caps(in
, CHECK_CAPS_NODELAY
);
8770 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
8772 in
->make_nosnap_relative_path(path
);
8773 req
->set_filepath(path
);
8774 req
->head
.args
.open
.flags
= cflags
& ~CEPH_O_CREAT
;
8775 req
->head
.args
.open
.mode
= mode
;
8776 req
->head
.args
.open
.pool
= -1;
8777 if (cct
->_conf
->client_debug_getattr_caps
)
8778 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
8780 req
->head
.args
.open
.mask
= 0;
8781 req
->head
.args
.open
.old_size
= in
->size
; // for O_TRUNC
8783 result
= make_request(req
, perms
);
8786 * NFS expects that delegations will be broken on a conflicting open,
8787 * not just when there is actual conflicting access to the file. SMB leases
8788 * and oplocks also have similar semantics.
8790 * Ensure that clients that have delegations enabled will wait on minimal
8791 * caps during open, just to ensure that other clients holding delegations
8792 * return theirs first.
8794 if (deleg_timeout
&& result
== 0) {
8797 if (cmode
& CEPH_FILE_MODE_WR
)
8798 need
|= CEPH_CAP_FILE_WR
;
8799 if (cmode
& CEPH_FILE_MODE_RD
)
8800 need
|= CEPH_CAP_FILE_RD
;
8802 result
= get_caps(in
, need
, want
, &have
, -1);
8804 ldout(cct
, 8) << "Unable to get caps after open of inode " << *in
<<
8805 " . Denying open: " <<
8806 cpp_strerror(result
) << dendl
;
8807 in
->put_open_ref(cmode
);
8809 put_cap_ref(in
, need
);
8817 *fhp
= _create_fh(in
, flags
, cmode
, perms
);
8819 in
->put_open_ref(cmode
);
8827 int Client::_renew_caps(Inode
*in
)
8829 int wanted
= in
->caps_file_wanted();
8830 if (in
->is_any_caps() &&
8831 ((wanted
& CEPH_CAP_ANY_WR
) == 0 || in
->auth_cap
)) {
8832 check_caps(in
, CHECK_CAPS_NODELAY
);
8837 if ((wanted
& CEPH_CAP_FILE_RD
) && (wanted
& CEPH_CAP_FILE_WR
))
8839 else if (wanted
& CEPH_CAP_FILE_RD
)
8841 else if (wanted
& CEPH_CAP_FILE_WR
)
8844 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
8846 in
->make_nosnap_relative_path(path
);
8847 req
->set_filepath(path
);
8848 req
->head
.args
.open
.flags
= flags
;
8849 req
->head
.args
.open
.pool
= -1;
8850 if (cct
->_conf
->client_debug_getattr_caps
)
8851 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
8853 req
->head
.args
.open
.mask
= 0;
8856 // duplicate in case Cap goes away; not sure if that race is a concern?
8857 const UserPerm
*pperm
= in
->get_best_perms();
8861 int ret
= make_request(req
, perms
);
8865 int Client::close(int fd
)
8867 ldout(cct
, 3) << "close enter(" << fd
<< ")" << dendl
;
8868 std::lock_guard
lock(client_lock
);
8869 tout(cct
) << "close" << std::endl
;
8870 tout(cct
) << fd
<< std::endl
;
8875 Fh
*fh
= get_filehandle(fd
);
8878 int err
= _release_fh(fh
);
8881 ldout(cct
, 3) << "close exit(" << fd
<< ")" << dendl
;
8889 loff_t
Client::lseek(int fd
, loff_t offset
, int whence
)
8891 std::lock_guard
lock(client_lock
);
8892 tout(cct
) << "lseek" << std::endl
;
8893 tout(cct
) << fd
<< std::endl
;
8894 tout(cct
) << offset
<< std::endl
;
8895 tout(cct
) << whence
<< std::endl
;
8900 Fh
*f
= get_filehandle(fd
);
8903 #if defined(__linux__) && defined(O_PATH)
8904 if (f
->flags
& O_PATH
)
8907 return _lseek(f
, offset
, whence
);
8910 loff_t
Client::_lseek(Fh
*f
, loff_t offset
, int whence
)
8912 Inode
*in
= f
->inode
.get();
8913 bool whence_check
= false;
8918 whence_check
= true;
8923 whence_check
= true;
8929 whence_check
= true;
8935 int r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
8946 pos
= f
->pos
+ offset
;
8950 pos
= in
->size
+ offset
;
8955 if (offset
< 0 || static_cast<uint64_t>(offset
) >= in
->size
)
8963 if (offset
< 0 || static_cast<uint64_t>(offset
) >= in
->size
)
8970 ldout(cct
, 1) << __func__
<< ": invalid whence value " << whence
<< dendl
;
8980 ldout(cct
, 8) << "_lseek(" << f
<< ", " << offset
<< ", " << whence
<< ") = " << f
->pos
<< dendl
;
8985 void Client::lock_fh_pos(Fh
*f
)
8987 ldout(cct
, 10) << __func__
<< " " << f
<< dendl
;
8989 if (f
->pos_locked
|| !f
->pos_waiters
.empty()) {
8990 ceph::condition_variable cond
;
8991 f
->pos_waiters
.push_back(&cond
);
8992 ldout(cct
, 10) << __func__
<< " BLOCKING on " << f
<< dendl
;
8993 std::unique_lock l
{client_lock
, std::adopt_lock
};
8994 cond
.wait(l
, [f
, me
=&cond
] {
8995 return !f
->pos_locked
&& f
->pos_waiters
.front() == me
;
8998 ldout(cct
, 10) << __func__
<< " UNBLOCKING on " << f
<< dendl
;
8999 ceph_assert(f
->pos_waiters
.front() == &cond
);
9000 f
->pos_waiters
.pop_front();
9003 f
->pos_locked
= true;
9006 void Client::unlock_fh_pos(Fh
*f
)
9008 ldout(cct
, 10) << __func__
<< " " << f
<< dendl
;
9009 f
->pos_locked
= false;
9012 int Client::uninline_data(Inode
*in
, Context
*onfinish
)
9014 if (!in
->inline_data
.length()) {
9015 onfinish
->complete(0);
9020 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (long long unsigned)in
->ino
);
9021 object_t oid
= oid_buf
;
9023 ObjectOperation create_ops
;
9024 create_ops
.create(false);
9026 objecter
->mutate(oid
,
9027 OSDMap::file_to_object_locator(in
->layout
),
9029 in
->snaprealm
->get_snap_context(),
9030 ceph::real_clock::now(),
9034 bufferlist inline_version_bl
;
9035 encode(in
->inline_version
, inline_version_bl
);
9037 ObjectOperation uninline_ops
;
9038 uninline_ops
.cmpxattr("inline_version",
9039 CEPH_OSD_CMPXATTR_OP_GT
,
9040 CEPH_OSD_CMPXATTR_MODE_U64
,
9042 bufferlist inline_data
= in
->inline_data
;
9043 uninline_ops
.write(0, inline_data
, in
->truncate_size
, in
->truncate_seq
);
9044 uninline_ops
.setxattr("inline_version", stringify(in
->inline_version
));
9046 objecter
->mutate(oid
,
9047 OSDMap::file_to_object_locator(in
->layout
),
9049 in
->snaprealm
->get_snap_context(),
9050 ceph::real_clock::now(),
9059 // blocking osd interface
9061 int Client::read(int fd
, char *buf
, loff_t size
, loff_t offset
)
9063 std::lock_guard
lock(client_lock
);
9064 tout(cct
) << "read" << std::endl
;
9065 tout(cct
) << fd
<< std::endl
;
9066 tout(cct
) << size
<< std::endl
;
9067 tout(cct
) << offset
<< std::endl
;
9072 Fh
*f
= get_filehandle(fd
);
9075 #if defined(__linux__) && defined(O_PATH)
9076 if (f
->flags
& O_PATH
)
9080 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9081 size
= std::min(size
, (loff_t
)INT_MAX
);
9082 int r
= _read(f
, offset
, size
, &bl
);
9083 ldout(cct
, 3) << "read(" << fd
<< ", " << (void*)buf
<< ", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
9085 bl
.begin().copy(bl
.length(), buf
);
// Scatter read at an explicit offset: forwards to the shared
// _preadv_pwritev() helper with write=false. Locking/validation
// happens inside the helper (visible further down in this file).
// NOTE(review): fragmented extraction — function braces are elided.
9091 int Client::preadv(int fd
, const struct iovec
*iov
, int iovcnt
, loff_t offset
)
9095 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, false);
9098 int64_t Client::_read(Fh
*f
, int64_t offset
, uint64_t size
, bufferlist
*bl
)
9101 bool movepos
= false;
9102 std::unique_ptr
<C_SaferCond
> onuninline
;
9104 const auto& conf
= cct
->_conf
;
9105 Inode
*in
= f
->inode
.get();
9107 utime_t start
= ceph_clock_now();
9109 if ((f
->mode
& CEPH_FILE_MODE_RD
) == 0)
9111 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9118 loff_t start_pos
= offset
;
9120 if (in
->inline_version
== 0) {
9121 r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
9125 ceph_assert(in
->inline_version
> 0);
9129 if (f
->mode
& CEPH_FILE_MODE_LAZY
)
9130 want
= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
;
9132 want
= CEPH_CAP_FILE_CACHE
;
9133 r
= get_caps(in
, CEPH_CAP_FILE_RD
, want
, &have
, -1);
9137 if (f
->flags
& O_DIRECT
)
9138 have
&= ~(CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
);
9140 if (in
->inline_version
< CEPH_INLINE_NONE
) {
9141 if (!(have
& CEPH_CAP_FILE_CACHE
)) {
9142 onuninline
.reset(new C_SaferCond("Client::_read_uninline_data flock"));
9143 uninline_data(in
, onuninline
.get());
9145 uint32_t len
= in
->inline_data
.length();
9146 uint64_t endoff
= offset
+ size
;
9147 if (endoff
> in
->size
)
9151 if (endoff
<= len
) {
9152 bl
->substr_of(in
->inline_data
, offset
, endoff
- offset
);
9154 bl
->substr_of(in
->inline_data
, offset
, len
- offset
);
9155 bl
->append_zero(endoff
- len
);
9157 r
= endoff
- offset
;
9158 } else if ((uint64_t)offset
< endoff
) {
9159 bl
->append_zero(endoff
- offset
);
9160 r
= endoff
- offset
;
9168 if (!conf
->client_debug_force_sync_read
&&
9170 (have
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
))) {
9172 if (f
->flags
& O_RSYNC
) {
9173 _flush_range(in
, offset
, size
);
9175 r
= _read_async(f
, offset
, size
, bl
);
9179 if (f
->flags
& O_DIRECT
)
9180 _flush_range(in
, offset
, size
);
9182 bool checkeof
= false;
9183 r
= _read_sync(f
, offset
, size
, bl
, &checkeof
);
9190 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
9193 r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
9198 if ((uint64_t)offset
< in
->size
)
9204 ceph_assert(r
>= 0);
9207 f
->pos
= start_pos
+ r
;
9210 lat
= ceph_clock_now();
9212 logger
->tinc(l_c_read
, lat
);
9218 client_lock
.unlock();
9219 int ret
= onuninline
->wait();
9221 if (ret
>= 0 || ret
== -ECANCELED
) {
9222 in
->inline_data
.clear();
9223 in
->inline_version
= CEPH_INLINE_NONE
;
9224 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9230 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
// Completion context for a background readahead issued by _read_async().
// Constructor: registers the in-flight readahead on the Fh's readahead
// state machine.
9238 Client::C_Readahead::C_Readahead(Client
*c
, Fh
*f
) :
9241 f
->readahead
.inc_pending();
// Destructor: balances inc_pending() regardless of how finish() went.
9244 Client::C_Readahead::~C_Readahead() {
9245 f
->readahead
.dec_pending();
// finish(): logs, then releases the CEPH_CAP_FILE_RD|CEPH_CAP_FILE_CACHE
// references on the file's inode that were taken when the readahead was
// queued.
9249 void Client::C_Readahead::finish(int r
) {
9250 lgeneric_subdout(client
->cct
, client
, 20) << "client." << client
->get_nodeid() << " " << "C_Readahead on " << f
->inode
<< dendl
;
9251 client
->put_cap_ref(f
->inode
.get(), CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
9254 int Client::_read_async(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
)
9256 const auto& conf
= cct
->_conf
;
9257 Inode
*in
= f
->inode
.get();
9259 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
9261 // trim read based on file size?
9262 if (off
>= in
->size
)
9266 if (off
+ len
> in
->size
) {
9267 len
= in
->size
- off
;
9270 ldout(cct
, 10) << " min_bytes=" << f
->readahead
.get_min_readahead_size()
9271 << " max_bytes=" << f
->readahead
.get_max_readahead_size()
9272 << " max_periods=" << conf
->client_readahead_max_periods
<< dendl
;
9274 // read (and possibly block)
9276 C_SaferCond
onfinish("Client::_read_async flock");
9277 r
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
9278 off
, len
, bl
, 0, &onfinish
);
9280 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
9281 client_lock
.unlock();
9282 r
= onfinish
.wait();
9284 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
9287 if(f
->readahead
.get_min_readahead_size() > 0) {
9288 pair
<uint64_t, uint64_t> readahead_extent
= f
->readahead
.update(off
, len
, in
->size
);
9289 if (readahead_extent
.second
> 0) {
9290 ldout(cct
, 20) << "readahead " << readahead_extent
.first
<< "~" << readahead_extent
.second
9291 << " (caller wants " << off
<< "~" << len
<< ")" << dendl
;
9292 Context
*onfinish2
= new C_Readahead(this, f
);
9293 int r2
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
9294 readahead_extent
.first
, readahead_extent
.second
,
9295 NULL
, 0, onfinish2
);
9297 ldout(cct
, 20) << "readahead initiated, c " << onfinish2
<< dendl
;
9298 get_cap_ref(in
, CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
9300 ldout(cct
, 20) << "readahead was no-op, already cached" << dendl
;
9309 int Client::_read_sync(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
,
9312 Inode
*in
= f
->inode
.get();
9317 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
9320 C_SaferCond
onfinish("Client::_read_sync flock");
9324 filer
->read_trunc(in
->ino
, &in
->layout
, in
->snapid
,
9326 in
->truncate_size
, in
->truncate_seq
,
9328 client_lock
.unlock();
9329 int r
= onfinish
.wait();
9332 // if we get ENOENT from OSD, assume 0 bytes returned
9343 bl
->claim_append(tbl
);
9346 if (r
>= 0 && r
< wanted
) {
9347 if (pos
< in
->size
) {
9348 // zero up to known EOF
9349 int64_t some
= in
->size
- pos
;
9352 auto z
= buffer::ptr_node::create(some
);
9354 bl
->push_back(std::move(z
));
9371 * we keep count of uncommitted sync writes on the inode, so that
// Called when an uncommitted sync write on `in` has been committed:
// decrements the global unsafe_sync_write counter (asserting it was
// positive), drops the CEPH_CAP_FILE_BUFFER reference taken for the
// in-flight write, and — if this was the last unsafe write while an
// unmount is pending — wakes waiters on mount_cond so unmount can
// proceed.
// NOTE(review): fragmented extraction — closing braces are elided.
9374 void Client::_sync_write_commit(Inode
*in
)
9376 ceph_assert(unsafe_sync_write
> 0);
9377 unsafe_sync_write
--;
9379 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9381 ldout(cct
, 15) << __func__
<< " unsafe_sync_write = " << unsafe_sync_write
<< dendl
;
9382 if (unsafe_sync_write
== 0 && unmounting
) {
9383 ldout(cct
, 10) << __func__
<< " -- no more unsafe writes, unmount can proceed" << dendl
;
9384 mount_cond
.notify_all();
9388 int Client::write(int fd
, const char *buf
, loff_t size
, loff_t offset
)
9390 std::lock_guard
lock(client_lock
);
9391 tout(cct
) << "write" << std::endl
;
9392 tout(cct
) << fd
<< std::endl
;
9393 tout(cct
) << size
<< std::endl
;
9394 tout(cct
) << offset
<< std::endl
;
9399 Fh
*fh
= get_filehandle(fd
);
9402 #if defined(__linux__) && defined(O_PATH)
9403 if (fh
->flags
& O_PATH
)
9406 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9407 size
= std::min(size
, (loff_t
)INT_MAX
);
9408 int r
= _write(fh
, offset
, size
, buf
, NULL
, false);
9409 ldout(cct
, 3) << "write(" << fd
<< ", \"...\", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
// Gather write at an explicit offset: forwards to the shared
// _preadv_pwritev() helper with write=true. Locking/validation
// happens inside the helper (visible further down in this file).
// NOTE(review): fragmented extraction — function braces are elided.
9413 int Client::pwritev(int fd
, const struct iovec
*iov
, int iovcnt
, int64_t offset
)
9417 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, true);
9420 int64_t Client::_preadv_pwritev_locked(Fh
*fh
, const struct iovec
*iov
,
9421 unsigned iovcnt
, int64_t offset
, bool write
,
9424 #if defined(__linux__) && defined(O_PATH)
9425 if (fh
->flags
& O_PATH
)
9428 loff_t totallen
= 0;
9429 for (unsigned i
= 0; i
< iovcnt
; i
++) {
9430 totallen
+= iov
[i
].iov_len
;
9434 * Some of the API functions take 64-bit size values, but only return
9435 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
9436 * we don't do I/Os larger than the values we can return.
9439 totallen
= std::min(totallen
, (loff_t
)INT_MAX
);
9442 int64_t w
= _write(fh
, offset
, totallen
, NULL
, iov
, iovcnt
);
9443 ldout(cct
, 3) << "pwritev(" << fh
<< ", \"...\", " << totallen
<< ", " << offset
<< ") = " << w
<< dendl
;
9447 int64_t r
= _read(fh
, offset
, totallen
, &bl
);
9448 ldout(cct
, 3) << "preadv(" << fh
<< ", " << offset
<< ") = " << r
<< dendl
;
9452 auto iter
= bl
.cbegin();
9453 for (unsigned j
= 0, resid
= r
; j
< iovcnt
&& resid
> 0; j
++) {
9455 * This piece of code aims to handle the case that bufferlist does not have enough data
9456 * to fill in the iov
9458 const auto round_size
= std::min
<unsigned>(resid
, iov
[j
].iov_len
);
9459 iter
.copy(round_size
, reinterpret_cast<char*>(iov
[j
].iov_base
));
9460 resid
-= round_size
;
9461 /* iter is self-updating */
9467 int Client::_preadv_pwritev(int fd
, const struct iovec
*iov
, unsigned iovcnt
, int64_t offset
, bool write
)
9469 std::lock_guard
lock(client_lock
);
9470 tout(cct
) << fd
<< std::endl
;
9471 tout(cct
) << offset
<< std::endl
;
9476 Fh
*fh
= get_filehandle(fd
);
9479 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, offset
, write
, true);
9482 int64_t Client::_write(Fh
*f
, int64_t offset
, uint64_t size
, const char *buf
,
9483 const struct iovec
*iov
, int iovcnt
)
9487 if ((uint64_t)(offset
+size
) > mdsmap
->get_max_filesize()) //too large!
9490 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
9491 Inode
*in
= f
->inode
.get();
9493 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
9497 ceph_assert(in
->snapid
== CEPH_NOSNAP
);
9499 // was Fh opened as writeable?
9500 if ((f
->mode
& CEPH_FILE_MODE_WR
) == 0)
9503 // use/adjust fd pos?
9507 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
9508 * change out from under us.
9510 if (f
->flags
& O_APPEND
) {
9511 auto r
= _lseek(f
, 0, SEEK_END
);
9523 uint64_t endoff
= offset
+ size
;
9524 if (endoff
> in
->size
&& is_quota_bytes_exceeded(in
, endoff
- in
->size
,
9529 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9531 ldout(cct
, 10) << "cur file size is " << in
->size
<< dendl
;
9534 utime_t start
= ceph_clock_now();
9536 if (in
->inline_version
== 0) {
9537 int r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
9540 ceph_assert(in
->inline_version
> 0);
9543 // copy into fresh buffer (since our write may be resub, async)
9547 bl
.append(buf
, size
);
9549 for (int i
= 0; i
< iovcnt
; i
++) {
9550 if (iov
[i
].iov_len
> 0) {
9551 bl
.append((const char *)iov
[i
].iov_base
, iov
[i
].iov_len
);
9557 uint64_t totalwritten
;
9559 if (f
->mode
& CEPH_FILE_MODE_LAZY
)
9560 want
= CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
;
9562 want
= CEPH_CAP_FILE_BUFFER
;
9563 int r
= get_caps(in
, CEPH_CAP_FILE_WR
|CEPH_CAP_AUTH_SHARED
, want
, &have
, endoff
);
9567 /* clear the setuid/setgid bits, if any */
9568 if (unlikely(in
->mode
& (S_ISUID
|S_ISGID
)) && size
> 0) {
9569 struct ceph_statx stx
= { 0 };
9571 put_cap_ref(in
, CEPH_CAP_AUTH_SHARED
);
9572 r
= __setattrx(in
, &stx
, CEPH_SETATTR_KILL_SGUID
, f
->actor_perms
);
9576 put_cap_ref(in
, CEPH_CAP_AUTH_SHARED
);
9579 if (f
->flags
& O_DIRECT
)
9580 have
&= ~(CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
);
9582 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
9584 std::unique_ptr
<C_SaferCond
> onuninline
= nullptr;
9586 if (in
->inline_version
< CEPH_INLINE_NONE
) {
9587 if (endoff
> cct
->_conf
->client_max_inline_size
||
9588 endoff
> CEPH_INLINE_MAX_SIZE
||
9589 !(have
& CEPH_CAP_FILE_BUFFER
)) {
9590 onuninline
.reset(new C_SaferCond("Client::_write_uninline_data flock"));
9591 uninline_data(in
, onuninline
.get());
9593 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9595 uint32_t len
= in
->inline_data
.length();
9598 in
->inline_data
.begin(endoff
).copy(len
- endoff
, bl
); // XXX
9601 in
->inline_data
.splice(offset
, len
- offset
);
9602 else if (offset
> len
)
9603 in
->inline_data
.append_zero(offset
- len
);
9605 in
->inline_data
.append(bl
);
9606 in
->inline_version
++;
9608 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9614 if (cct
->_conf
->client_oc
&&
9615 (have
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
))) {
9616 // do buffered write
9617 if (!in
->oset
.dirty_or_tx
)
9618 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
9620 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9622 // async, caching, non-blocking.
9623 r
= objectcacher
->file_write(&in
->oset
, &in
->layout
,
9624 in
->snaprealm
->get_snap_context(),
9625 offset
, size
, bl
, ceph::real_clock::now(),
9627 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9632 // flush cached write if O_SYNC is set on file fh
9633 // O_DSYNC == O_SYNC on linux < 2.6.33
9634 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
9635 if ((f
->flags
& O_SYNC
) || (f
->flags
& O_DSYNC
)) {
9636 _flush_range(in
, offset
, size
);
9639 if (f
->flags
& O_DIRECT
)
9640 _flush_range(in
, offset
, size
);
9642 // simple, non-atomic sync write
9643 C_SaferCond
onfinish("Client::_write flock");
9644 unsafe_sync_write
++;
9645 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
); // released by onsafe callback
9647 filer
->write_trunc(in
->ino
, &in
->layout
, in
->snaprealm
->get_snap_context(),
9648 offset
, size
, bl
, ceph::real_clock::now(), 0,
9649 in
->truncate_size
, in
->truncate_seq
,
9651 client_lock
.unlock();
9654 _sync_write_commit(in
);
9657 // if we get here, write was successful, update client metadata
9660 lat
= ceph_clock_now();
9662 logger
->tinc(l_c_wrlat
, lat
);
9669 totalwritten
= size
;
9670 r
= (int64_t)totalwritten
;
9673 if (totalwritten
+ offset
> in
->size
) {
9674 in
->size
= totalwritten
+ offset
;
9675 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9677 if (is_quota_bytes_approaching(in
, f
->actor_perms
)) {
9678 check_caps(in
, CHECK_CAPS_NODELAY
);
9679 } else if (is_max_size_approaching(in
)) {
9683 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", extending file size" << dendl
;
9685 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", leaving file size at " << in
->size
<< dendl
;
9689 in
->mtime
= in
->ctime
= ceph_clock_now();
9691 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9695 if (nullptr != onuninline
) {
9696 client_lock
.unlock();
9697 int uninline_ret
= onuninline
->wait();
9700 if (uninline_ret
>= 0 || uninline_ret
== -ECANCELED
) {
9701 in
->inline_data
.clear();
9702 in
->inline_version
= CEPH_INLINE_NONE
;
9703 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9709 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
9713 int Client::_flush(Fh
*f
)
9715 Inode
*in
= f
->inode
.get();
9716 int err
= f
->take_async_err();
9718 ldout(cct
, 1) << __func__
<< ": " << f
<< " on inode " << *in
<< " caught async_err = "
9719 << cpp_strerror(err
) << dendl
;
9721 ldout(cct
, 10) << __func__
<< ": " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
// Truncate-by-path: expressed as a setattrx() call carrying only
// CEPH_SETATTR_SIZE, with stx_size set to the requested length. Other
// ceph_statx fields are left untouched (the mask makes them ignored).
// NOTE(review): fragmented extraction — function braces are elided.
9727 int Client::truncate(const char *relpath
, loff_t length
, const UserPerm
& perms
)
9729 struct ceph_statx stx
;
9730 stx
.stx_size
= length
;
9731 return setattrx(relpath
, &stx
, CEPH_SETATTR_SIZE
, perms
);
9734 int Client::ftruncate(int fd
, loff_t length
, const UserPerm
& perms
)
9736 std::lock_guard
lock(client_lock
);
9737 tout(cct
) << __func__
<< std::endl
;
9738 tout(cct
) << fd
<< std::endl
;
9739 tout(cct
) << length
<< std::endl
;
9744 Fh
*f
= get_filehandle(fd
);
9747 #if defined(__linux__) && defined(O_PATH)
9748 if (f
->flags
& O_PATH
)
9752 attr
.st_size
= length
;
9753 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_SIZE
, perms
);
9756 int Client::fsync(int fd
, bool syncdataonly
)
9758 std::lock_guard
lock(client_lock
);
9759 tout(cct
) << "fsync" << std::endl
;
9760 tout(cct
) << fd
<< std::endl
;
9761 tout(cct
) << syncdataonly
<< std::endl
;
9766 Fh
*f
= get_filehandle(fd
);
9769 #if defined(__linux__) && defined(O_PATH)
9770 if (f
->flags
& O_PATH
)
9773 int r
= _fsync(f
, syncdataonly
);
9775 // The IOs in this fsync were okay, but maybe something happened
9776 // in the background that we shoudl be reporting?
9777 r
= f
->take_async_err();
9778 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
9779 << ") = 0, async_err = " << r
<< dendl
;
9781 // Assume that an error we encountered during fsync, even reported
9782 // synchronously, would also have applied the error to the Fh, and we
9783 // should clear it here to avoid returning the same error again on next
9785 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
<< ") = "
9787 f
->take_async_err();
9792 int Client::_fsync(Inode
*in
, bool syncdataonly
)
9795 std::unique_ptr
<C_SaferCond
> object_cacher_completion
= nullptr;
9796 ceph_tid_t flush_tid
= 0;
9799 utime_t start
= ceph_clock_now();
9801 ldout(cct
, 8) << "_fsync on " << *in
<< " " << (syncdataonly
? "(dataonly)":"(data+metadata)") << dendl
;
9803 if (cct
->_conf
->client_oc
) {
9804 object_cacher_completion
.reset(new C_SaferCond("Client::_fsync::lock"));
9805 tmp_ref
= in
; // take a reference; C_SaferCond doesn't and _flush won't either
9806 _flush(in
, object_cacher_completion
.get());
9807 ldout(cct
, 15) << "using return-valued form of _fsync" << dendl
;
9810 if (!syncdataonly
&& in
->dirty_caps
) {
9811 check_caps(in
, CHECK_CAPS_NODELAY
|CHECK_CAPS_SYNCHRONOUS
);
9812 if (in
->flushing_caps
)
9813 flush_tid
= last_flush_tid
;
9814 } else ldout(cct
, 10) << "no metadata needs to commit" << dendl
;
9816 if (!syncdataonly
&& !in
->unsafe_ops
.empty()) {
9819 MetaRequest
*req
= in
->unsafe_ops
.back();
9820 ldout(cct
, 15) << "waiting on unsafe requests, last tid " << req
->get_tid() << dendl
;
9823 wait_on_list(req
->waitfor_safe
);
9827 if (nullptr != object_cacher_completion
) { // wait on a real reply instead of guessing
9828 client_lock
.unlock();
9829 ldout(cct
, 15) << "waiting on data to flush" << dendl
;
9830 r
= object_cacher_completion
->wait();
9832 ldout(cct
, 15) << "got " << r
<< " from flush writeback" << dendl
;
9834 // FIXME: this can starve
9835 while (in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] > 0) {
9836 ldout(cct
, 10) << "ino " << in
->ino
<< " has " << in
->cap_refs
[CEPH_CAP_FILE_BUFFER
]
9837 << " uncommitted, waiting" << dendl
;
9838 wait_on_list(in
->waitfor_commit
);
9844 wait_sync_caps(in
, flush_tid
);
9846 ldout(cct
, 10) << "ino " << in
->ino
<< " has no uncommitted writes" << dendl
;
9848 ldout(cct
, 8) << "ino " << in
->ino
<< " failed to commit to disk! "
9849 << cpp_strerror(-r
) << dendl
;
9852 lat
= ceph_clock_now();
9854 logger
->tinc(l_c_fsync
, lat
);
// Fh-level fsync: logs whether this is a data-only or data+metadata
// sync, then forwards to the Inode* overload of _fsync() using the
// handle's inode.
// NOTE(review): fragmented extraction — function braces are elided.
9859 int Client::_fsync(Fh
*f
, bool syncdataonly
)
9861 ldout(cct
, 8) << "_fsync(" << f
<< ", " << (syncdataonly
? "dataonly)":"data+metadata)") << dendl
;
9862 return _fsync(f
->inode
.get(), syncdataonly
);
9865 int Client::fstat(int fd
, struct stat
*stbuf
, const UserPerm
& perms
, int mask
)
9867 std::lock_guard
lock(client_lock
);
9868 tout(cct
) << "fstat mask " << hex
<< mask
<< dec
<< std::endl
;
9869 tout(cct
) << fd
<< std::endl
;
9874 Fh
*f
= get_filehandle(fd
);
9877 int r
= _getattr(f
->inode
, mask
, perms
);
9880 fill_stat(f
->inode
, stbuf
, NULL
);
9881 ldout(cct
, 5) << "fstat(" << fd
<< ", " << stbuf
<< ") = " << r
<< dendl
;
9885 int Client::fstatx(int fd
, struct ceph_statx
*stx
, const UserPerm
& perms
,
9886 unsigned int want
, unsigned int flags
)
9888 std::lock_guard
lock(client_lock
);
9889 tout(cct
) << "fstatx flags " << hex
<< flags
<< " want " << want
<< dec
<< std::endl
;
9890 tout(cct
) << fd
<< std::endl
;
9895 Fh
*f
= get_filehandle(fd
);
9899 unsigned mask
= statx_to_mask(flags
, want
);
9902 if (mask
&& !f
->inode
->caps_issued_mask(mask
, true)) {
9903 r
= _getattr(f
->inode
, mask
, perms
);
9905 ldout(cct
, 3) << "fstatx exit on error!" << dendl
;
9910 fill_statx(f
->inode
, mask
, stx
);
9911 ldout(cct
, 3) << "fstatx(" << fd
<< ", " << stx
<< ") = " << r
<< dendl
;
9915 // not written yet, but i want to link!
9917 int Client::chdir(const char *relpath
, std::string
&new_cwd
,
9918 const UserPerm
& perms
)
9920 std::lock_guard
lock(client_lock
);
9921 tout(cct
) << "chdir" << std::endl
;
9922 tout(cct
) << relpath
<< std::endl
;
9927 filepath
path(relpath
);
9929 int r
= path_walk(path
, &in
, perms
);
9933 if (!(in
.get()->is_dir()))
9938 ldout(cct
, 3) << "chdir(" << relpath
<< ") cwd now " << cwd
->ino
<< dendl
;
9940 _getcwd(new_cwd
, perms
);
9944 void Client::_getcwd(string
& dir
, const UserPerm
& perms
)
9947 ldout(cct
, 10) << __func__
<< " " << *cwd
<< dendl
;
9949 Inode
*in
= cwd
.get();
9950 while (in
!= root
) {
9951 ceph_assert(in
->dentries
.size() < 2); // dirs can't be hard-linked
9953 // A cwd or ancester is unlinked
9954 if (in
->dentries
.empty()) {
9958 Dentry
*dn
= in
->get_first_parent();
9963 ldout(cct
, 10) << __func__
<< " looking up parent for " << *in
<< dendl
;
9964 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
9965 filepath
path(in
->ino
);
9966 req
->set_filepath(path
);
9968 int res
= make_request(req
, perms
);
9977 path
.push_front_dentry(dn
->name
);
9978 in
= dn
->dir
->parent_inode
;
9981 dir
+= path
.get_path();
9984 void Client::getcwd(string
& dir
, const UserPerm
& perms
)
9986 std::lock_guard
l(client_lock
);
9988 _getcwd(dir
, perms
);
9991 int Client::statfs(const char *path
, struct statvfs
*stbuf
,
9992 const UserPerm
& perms
)
9994 std::lock_guard
l(client_lock
);
9995 tout(cct
) << __func__
<< std::endl
;
9996 unsigned long int total_files_on_fs
;
10004 const vector
<int64_t> &data_pools
= mdsmap
->get_data_pools();
10005 if (data_pools
.size() == 1) {
10006 objecter
->get_fs_stats(stats
, data_pools
[0], &cond
);
10008 objecter
->get_fs_stats(stats
, boost::optional
<int64_t>(), &cond
);
10011 client_lock
.unlock();
10012 int rval
= cond
.wait();
10014 total_files_on_fs
= root
->rstat
.rfiles
+ root
->rstat
.rsubdirs
;
10015 client_lock
.lock();
10018 ldout(cct
, 1) << "underlying call to statfs returned error: "
10019 << cpp_strerror(rval
)
10024 memset(stbuf
, 0, sizeof(*stbuf
));
10027 * we're going to set a block size of 4MB so we can represent larger
10028 * FSes without overflowing. Additionally convert the space
10029 * measurements from KB to bytes while making them in terms of
10030 * blocks. We use 4MB only because it is big enough, and because it
10031 * actually *is* the (ceph) default block size.
10033 const int CEPH_BLOCK_SHIFT
= 22;
10034 stbuf
->f_frsize
= 1 << CEPH_BLOCK_SHIFT
;
10035 stbuf
->f_bsize
= 1 << CEPH_BLOCK_SHIFT
;
10036 stbuf
->f_files
= total_files_on_fs
;
10037 stbuf
->f_ffree
= 0;
10038 stbuf
->f_favail
= -1;
10039 stbuf
->f_fsid
= -1; // ??
10040 stbuf
->f_flag
= 0; // ??
10041 stbuf
->f_namemax
= NAME_MAX
;
10043 // Usually quota_root will == root_ancestor, but if the mount root has no
10044 // quota but we can see a parent of it that does have a quota, we'll
10045 // respect that one instead.
10046 ceph_assert(root
!= nullptr);
10047 Inode
*quota_root
= root
->quota
.is_enable() ? root
: get_quota_root(root
, perms
);
10049 // get_quota_root should always give us something
10050 // because client quotas are always enabled
10051 ceph_assert(quota_root
!= nullptr);
10053 if (quota_root
&& cct
->_conf
->client_quota_df
&& quota_root
->quota
.max_bytes
) {
10055 // Skip the getattr if any sessions are stale, as we don't want to
10056 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10058 if (!_any_stale_sessions()) {
10059 int r
= _getattr(quota_root
, 0, perms
, true);
10061 // Ignore return value: error getting latest inode metadata is not a good
10062 // reason to break "df".
10063 lderr(cct
) << "Error in getattr on quota root 0x"
10064 << std::hex
<< quota_root
->ino
<< std::dec
10065 << " statfs result may be outdated" << dendl
;
10069 // Special case: if there is a size quota set on the Inode acting
10070 // as the root for this client mount, then report the quota status
10071 // as the filesystem statistics.
10072 const fsblkcnt_t total
= quota_root
->quota
.max_bytes
>> CEPH_BLOCK_SHIFT
;
10073 const fsblkcnt_t used
= quota_root
->rstat
.rbytes
>> CEPH_BLOCK_SHIFT
;
10074 // It is possible for a quota to be exceeded: arithmetic here must
10075 // handle case where used > total.
10076 const fsblkcnt_t free
= total
> used
? total
- used
: 0;
10078 stbuf
->f_blocks
= total
;
10079 stbuf
->f_bfree
= free
;
10080 stbuf
->f_bavail
= free
;
10082 // General case: report the cluster statistics returned from RADOS. Because
10083 // multiple pools may be used without one filesystem namespace via
10084 // layouts, this is the most correct thing we can do.
10085 stbuf
->f_blocks
= stats
.kb
>> (CEPH_BLOCK_SHIFT
- 10);
10086 stbuf
->f_bfree
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
10087 stbuf
->f_bavail
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
10093 int Client::_do_filelock(Inode
*in
, Fh
*fh
, int lock_type
, int op
, int sleep
,
10094 struct flock
*fl
, uint64_t owner
, bool removing
)
10096 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
10097 << (lock_type
== CEPH_LOCK_FCNTL
? " fcntl" : " flock")
10098 << " type " << fl
->l_type
<< " owner " << owner
10099 << " " << fl
->l_start
<< "~" << fl
->l_len
<< dendl
;
10102 if (F_RDLCK
== fl
->l_type
)
10103 lock_cmd
= CEPH_LOCK_SHARED
;
10104 else if (F_WRLCK
== fl
->l_type
)
10105 lock_cmd
= CEPH_LOCK_EXCL
;
10106 else if (F_UNLCK
== fl
->l_type
)
10107 lock_cmd
= CEPH_LOCK_UNLOCK
;
10111 if (op
!= CEPH_MDS_OP_SETFILELOCK
|| lock_cmd
== CEPH_LOCK_UNLOCK
)
10115 * Set the most significant bit, so that MDS knows the 'owner'
10116 * is sufficient to identify the owner of lock. (old code uses
10117 * both 'owner' and 'pid')
10119 owner
|= (1ULL << 63);
10121 MetaRequest
*req
= new MetaRequest(op
);
10123 in
->make_nosnap_relative_path(path
);
10124 req
->set_filepath(path
);
10125 req
->set_inode(in
);
10127 req
->head
.args
.filelock_change
.rule
= lock_type
;
10128 req
->head
.args
.filelock_change
.type
= lock_cmd
;
10129 req
->head
.args
.filelock_change
.owner
= owner
;
10130 req
->head
.args
.filelock_change
.pid
= fl
->l_pid
;
10131 req
->head
.args
.filelock_change
.start
= fl
->l_start
;
10132 req
->head
.args
.filelock_change
.length
= fl
->l_len
;
10133 req
->head
.args
.filelock_change
.wait
= sleep
;
10138 if (sleep
&& switch_interrupt_cb
) {
10139 // enable interrupt
10140 switch_interrupt_cb(callback_handle
, req
->get());
10141 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
10142 // disable interrupt
10143 switch_interrupt_cb(callback_handle
, NULL
);
10144 if (ret
== 0 && req
->aborted()) {
10145 // effect of this lock request has been revoked by the 'lock intr' request
10146 ret
= req
->get_abort_code();
10150 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
10154 if (op
== CEPH_MDS_OP_GETFILELOCK
) {
10155 ceph_filelock filelock
;
10156 auto p
= bl
.cbegin();
10157 decode(filelock
, p
);
10159 if (CEPH_LOCK_SHARED
== filelock
.type
)
10160 fl
->l_type
= F_RDLCK
;
10161 else if (CEPH_LOCK_EXCL
== filelock
.type
)
10162 fl
->l_type
= F_WRLCK
;
10164 fl
->l_type
= F_UNLCK
;
10166 fl
->l_whence
= SEEK_SET
;
10167 fl
->l_start
= filelock
.start
;
10168 fl
->l_len
= filelock
.length
;
10169 fl
->l_pid
= filelock
.pid
;
10170 } else if (op
== CEPH_MDS_OP_SETFILELOCK
) {
10171 ceph_lock_state_t
*lock_state
;
10172 if (lock_type
== CEPH_LOCK_FCNTL
) {
10173 if (!in
->fcntl_locks
)
10174 in
->fcntl_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
));
10175 lock_state
= in
->fcntl_locks
.get();
10176 } else if (lock_type
== CEPH_LOCK_FLOCK
) {
10177 if (!in
->flock_locks
)
10178 in
->flock_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
));
10179 lock_state
= in
->flock_locks
.get();
10184 _update_lock_state(fl
, owner
, lock_state
);
10187 if (lock_type
== CEPH_LOCK_FCNTL
) {
10188 if (!fh
->fcntl_locks
)
10189 fh
->fcntl_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
));
10190 lock_state
= fh
->fcntl_locks
.get();
10192 if (!fh
->flock_locks
)
10193 fh
->flock_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
));
10194 lock_state
= fh
->flock_locks
.get();
10196 _update_lock_state(fl
, owner
, lock_state
);
10204 int Client::_interrupt_filelock(MetaRequest
*req
)
10206 // Set abort code, but do not kick. The abort code prevents the request
10207 // from being re-sent.
10208 req
->abort(-EINTR
);
10210 return 0; // haven't sent the request
10212 Inode
*in
= req
->inode();
10215 if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FLOCK
)
10216 lock_type
= CEPH_LOCK_FLOCK_INTR
;
10217 else if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FCNTL
)
10218 lock_type
= CEPH_LOCK_FCNTL_INTR
;
10224 MetaRequest
*intr_req
= new MetaRequest(CEPH_MDS_OP_SETFILELOCK
);
10226 in
->make_nosnap_relative_path(path
);
10227 intr_req
->set_filepath(path
);
10228 intr_req
->set_inode(in
);
10229 intr_req
->head
.args
.filelock_change
= req
->head
.args
.filelock_change
;
10230 intr_req
->head
.args
.filelock_change
.rule
= lock_type
;
10231 intr_req
->head
.args
.filelock_change
.type
= CEPH_LOCK_UNLOCK
;
10233 UserPerm
perms(req
->get_uid(), req
->get_gid());
10234 return make_request(intr_req
, perms
, NULL
, NULL
, -1);
10237 void Client::_encode_filelocks(Inode
*in
, bufferlist
& bl
)
10239 if (!in
->fcntl_locks
&& !in
->flock_locks
)
10242 unsigned nr_fcntl_locks
= in
->fcntl_locks
? in
->fcntl_locks
->held_locks
.size() : 0;
10243 encode(nr_fcntl_locks
, bl
);
10244 if (nr_fcntl_locks
) {
10245 auto &lock_state
= in
->fcntl_locks
;
10246 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
10247 p
!= lock_state
->held_locks
.end();
10249 encode(p
->second
, bl
);
10252 unsigned nr_flock_locks
= in
->flock_locks
? in
->flock_locks
->held_locks
.size() : 0;
10253 encode(nr_flock_locks
, bl
);
10254 if (nr_flock_locks
) {
10255 auto &lock_state
= in
->flock_locks
;
10256 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
10257 p
!= lock_state
->held_locks
.end();
10259 encode(p
->second
, bl
);
10262 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< ", " << nr_fcntl_locks
10263 << " fcntl locks, " << nr_flock_locks
<< " flock locks" << dendl
;
10266 void Client::_release_filelocks(Fh
*fh
)
10268 if (!fh
->fcntl_locks
&& !fh
->flock_locks
)
10271 Inode
*in
= fh
->inode
.get();
10272 ldout(cct
, 10) << __func__
<< " " << fh
<< " ino " << in
->ino
<< dendl
;
10274 list
<pair
<int, ceph_filelock
> > to_release
;
10276 if (fh
->fcntl_locks
) {
10277 auto &lock_state
= fh
->fcntl_locks
;
10278 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
10279 p
!= lock_state
->held_locks
.end();
10281 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FCNTL
, p
->second
));
10282 lock_state
.reset();
10284 if (fh
->flock_locks
) {
10285 auto &lock_state
= fh
->flock_locks
;
10286 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
10287 p
!= lock_state
->held_locks
.end();
10289 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FLOCK
, p
->second
));
10290 lock_state
.reset();
10293 if (to_release
.empty())
10296 // mds has already released filelocks if session was closed.
10297 if (in
->caps
.empty())
10301 memset(&fl
, 0, sizeof(fl
));
10302 fl
.l_whence
= SEEK_SET
;
10303 fl
.l_type
= F_UNLCK
;
10305 for (list
<pair
<int, ceph_filelock
> >::iterator p
= to_release
.begin();
10306 p
!= to_release
.end();
10308 fl
.l_start
= p
->second
.start
;
10309 fl
.l_len
= p
->second
.length
;
10310 fl
.l_pid
= p
->second
.pid
;
10311 _do_filelock(in
, fh
, p
->first
, CEPH_MDS_OP_SETFILELOCK
, 0, &fl
,
10312 p
->second
.owner
, true);
10316 void Client::_update_lock_state(struct flock
*fl
, uint64_t owner
,
10317 ceph_lock_state_t
*lock_state
)
10320 if (F_RDLCK
== fl
->l_type
)
10321 lock_cmd
= CEPH_LOCK_SHARED
;
10322 else if (F_WRLCK
== fl
->l_type
)
10323 lock_cmd
= CEPH_LOCK_EXCL
;
10325 lock_cmd
= CEPH_LOCK_UNLOCK
;;
10327 ceph_filelock filelock
;
10328 filelock
.start
= fl
->l_start
;
10329 filelock
.length
= fl
->l_len
;
10330 filelock
.client
= 0;
10331 // see comment in _do_filelock()
10332 filelock
.owner
= owner
| (1ULL << 63);
10333 filelock
.pid
= fl
->l_pid
;
10334 filelock
.type
= lock_cmd
;
10336 if (filelock
.type
== CEPH_LOCK_UNLOCK
) {
10337 list
<ceph_filelock
> activated_locks
;
10338 lock_state
->remove_lock(filelock
, activated_locks
);
10340 bool r
= lock_state
->add_lock(filelock
, false, false, NULL
);
10345 int Client::_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
10347 Inode
*in
= fh
->inode
.get();
10348 ldout(cct
, 10) << "_getlk " << fh
<< " ino " << in
->ino
<< dendl
;
10349 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_GETFILELOCK
, 0, fl
, owner
);
10353 int Client::_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
10355 Inode
*in
= fh
->inode
.get();
10356 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< dendl
;
10357 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_SETFILELOCK
, sleep
, fl
, owner
);
10358 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
10362 int Client::_flock(Fh
*fh
, int cmd
, uint64_t owner
)
10364 Inode
*in
= fh
->inode
.get();
10365 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< dendl
;
10367 int sleep
= !(cmd
& LOCK_NB
);
10386 memset(&fl
, 0, sizeof(fl
));
10388 fl
.l_whence
= SEEK_SET
;
10390 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FLOCK
, CEPH_MDS_OP_SETFILELOCK
, sleep
, &fl
, owner
);
10391 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
10395 int Client::ll_statfs(Inode
*in
, struct statvfs
*stbuf
, const UserPerm
& perms
)
10397 /* Since the only thing this does is wrap a call to statfs, and
10398 statfs takes a lock, it doesn't seem we have a need to split it
10400 return statfs(0, stbuf
, perms
);
10403 void Client::ll_register_callbacks(struct client_callback_args
*args
)
10407 std::lock_guard
l(client_lock
);
10408 ldout(cct
, 10) << __func__
<< " cb " << args
->handle
10409 << " invalidate_ino_cb " << args
->ino_cb
10410 << " invalidate_dentry_cb " << args
->dentry_cb
10411 << " switch_interrupt_cb " << args
->switch_intr_cb
10412 << " remount_cb " << args
->remount_cb
10414 callback_handle
= args
->handle
;
10415 if (args
->ino_cb
) {
10416 ino_invalidate_cb
= args
->ino_cb
;
10417 async_ino_invalidator
.start();
10419 if (args
->dentry_cb
) {
10420 dentry_invalidate_cb
= args
->dentry_cb
;
10421 async_dentry_invalidator
.start();
10423 if (args
->switch_intr_cb
) {
10424 switch_interrupt_cb
= args
->switch_intr_cb
;
10425 interrupt_finisher
.start();
10427 if (args
->remount_cb
) {
10428 remount_cb
= args
->remount_cb
;
10429 remount_finisher
.start();
10431 umask_cb
= args
->umask_cb
;
10434 int Client::test_dentry_handling(bool can_invalidate
)
10438 can_invalidate_dentries
= can_invalidate
;
10440 if (can_invalidate_dentries
) {
10441 ceph_assert(dentry_invalidate_cb
);
10442 ldout(cct
, 1) << "using dentry_invalidate_cb" << dendl
;
10445 ceph_assert(remount_cb
);
10446 ldout(cct
, 1) << "using remount_cb" << dendl
;
10447 r
= _do_remount(false);
10453 int Client::_sync_fs()
10455 ldout(cct
, 10) << __func__
<< dendl
;
10458 std::unique_ptr
<C_SaferCond
> cond
= nullptr;
10459 if (cct
->_conf
->client_oc
) {
10460 cond
.reset(new C_SaferCond("Client::_sync_fs:lock"));
10461 objectcacher
->flush_all(cond
.get());
10466 ceph_tid_t flush_tid
= last_flush_tid
;
10468 // wait for unsafe mds requests
10469 wait_unsafe_requests();
10471 wait_sync_caps(flush_tid
);
10473 if (nullptr != cond
) {
10474 client_lock
.unlock();
10475 ldout(cct
, 15) << __func__
<< " waiting on data to flush" << dendl
;
10477 ldout(cct
, 15) << __func__
<< " flush finished" << dendl
;
10478 client_lock
.lock();
10484 int Client::sync_fs()
10486 std::lock_guard
l(client_lock
);
10494 int64_t Client::drop_caches()
10496 std::lock_guard
l(client_lock
);
10497 return objectcacher
->release_all();
10500 int Client::_lazyio(Fh
*fh
, int enable
)
10502 Inode
*in
= fh
->inode
.get();
10503 ldout(cct
, 20) << __func__
<< " " << *in
<< " " << !!enable
<< dendl
;
10505 if (!!(fh
->mode
& CEPH_FILE_MODE_LAZY
) == !!enable
)
10508 int orig_mode
= fh
->mode
;
10510 fh
->mode
|= CEPH_FILE_MODE_LAZY
;
10511 in
->get_open_ref(fh
->mode
);
10512 in
->put_open_ref(orig_mode
);
10513 check_caps(in
, CHECK_CAPS_NODELAY
);
10515 fh
->mode
&= ~CEPH_FILE_MODE_LAZY
;
10516 in
->get_open_ref(fh
->mode
);
10517 in
->put_open_ref(orig_mode
);
10524 int Client::lazyio(int fd
, int enable
)
10526 std::lock_guard
l(client_lock
);
10527 Fh
*f
= get_filehandle(fd
);
10531 return _lazyio(f
, enable
);
10534 int Client::ll_lazyio(Fh
*fh
, int enable
)
10536 std::lock_guard
lock(client_lock
);
10537 ldout(cct
, 3) << __func__
<< " " << fh
<< " " << fh
->inode
->ino
<< " " << !!enable
<< dendl
;
10538 tout(cct
) << __func__
<< std::endl
;
10540 return _lazyio(fh
, enable
);
10543 int Client::lazyio_propagate(int fd
, loff_t offset
, size_t count
)
10545 std::lock_guard
l(client_lock
);
10546 ldout(cct
, 3) << "op: client->lazyio_propagate(" << fd
10547 << ", " << offset
<< ", " << count
<< ")" << dendl
;
10549 Fh
*f
= get_filehandle(fd
);
10559 int Client::lazyio_synchronize(int fd
, loff_t offset
, size_t count
)
10561 std::lock_guard
l(client_lock
);
10562 ldout(cct
, 3) << "op: client->lazyio_synchronize(" << fd
10563 << ", " << offset
<< ", " << count
<< ")" << dendl
;
10565 Fh
*f
= get_filehandle(fd
);
10568 Inode
*in
= f
->inode
.get();
10571 if (_release(in
)) {
10572 int r
=_getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
10580 // =============================
10583 int Client::mksnap(const char *relpath
, const char *name
, const UserPerm
& perm
)
10585 std::lock_guard
l(client_lock
);
10590 filepath
path(relpath
);
10592 int r
= path_walk(path
, &in
, perm
);
10595 if (cct
->_conf
->client_permissions
) {
10596 r
= may_create(in
.get(), perm
);
10600 Inode
*snapdir
= open_snapdir(in
.get());
10601 return _mkdir(snapdir
, name
, 0, perm
);
10604 int Client::rmsnap(const char *relpath
, const char *name
, const UserPerm
& perms
)
10606 std::lock_guard
l(client_lock
);
10611 filepath
path(relpath
);
10613 int r
= path_walk(path
, &in
, perms
);
10616 if (cct
->_conf
->client_permissions
) {
10617 r
= may_delete(in
.get(), NULL
, perms
);
10621 Inode
*snapdir
= open_snapdir(in
.get());
10622 return _rmdir(snapdir
, name
, perms
);
10625 // =============================
10628 int Client::get_caps_issued(int fd
) {
10630 std::lock_guard
lock(client_lock
);
10635 Fh
*f
= get_filehandle(fd
);
10639 return f
->inode
->caps_issued();
10642 int Client::get_caps_issued(const char *path
, const UserPerm
& perms
)
10644 std::lock_guard
lock(client_lock
);
10651 int r
= path_walk(p
, &in
, perms
, true);
10654 return in
->caps_issued();
10657 // =========================================
10660 Inode
*Client::open_snapdir(Inode
*diri
)
10663 vinodeno_t
vino(diri
->ino
, CEPH_SNAPDIR
);
10664 if (!inode_map
.count(vino
)) {
10665 in
= new Inode(this, vino
, &diri
->layout
);
10667 in
->ino
= diri
->ino
;
10668 in
->snapid
= CEPH_SNAPDIR
;
10669 in
->mode
= diri
->mode
;
10670 in
->uid
= diri
->uid
;
10671 in
->gid
= diri
->gid
;
10673 in
->mtime
= diri
->mtime
;
10674 in
->ctime
= diri
->ctime
;
10675 in
->btime
= diri
->btime
;
10676 in
->size
= diri
->size
;
10677 in
->change_attr
= diri
->change_attr
;
10679 in
->dirfragtree
.clear();
10680 in
->snapdir_parent
= diri
;
10681 diri
->flags
|= I_SNAPDIR_OPEN
;
10682 inode_map
[vino
] = in
;
10683 if (use_faked_inos())
10684 _assign_faked_ino(in
);
10685 ldout(cct
, 10) << "open_snapdir created snapshot inode " << *in
<< dendl
;
10687 in
= inode_map
[vino
];
10688 ldout(cct
, 10) << "open_snapdir had snapshot inode " << *in
<< dendl
;
10693 int Client::ll_lookup(Inode
*parent
, const char *name
, struct stat
*attr
,
10694 Inode
**out
, const UserPerm
& perms
)
10696 std::lock_guard
lock(client_lock
);
10697 vinodeno_t vparent
= _get_vino(parent
);
10698 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
<< dendl
;
10699 tout(cct
) << __func__
<< std::endl
;
10700 tout(cct
) << name
<< std::endl
;
10706 if (!fuse_default_permissions
) {
10707 if (strcmp(name
, ".") && strcmp(name
, "..")) {
10708 r
= may_lookup(parent
, perms
);
10714 string
dname(name
);
10717 r
= _lookup(parent
, dname
, CEPH_STAT_CAP_INODE_ALL
, &in
, perms
);
10724 fill_stat(in
, attr
);
10728 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
10729 << " -> " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
10730 tout(cct
) << attr
->st_ino
<< std::endl
;
10735 int Client::ll_lookup_inode(
10736 struct inodeno_t ino
,
10737 const UserPerm
& perms
,
10740 ceph_assert(inode
!= NULL
);
10741 std::lock_guard
lock(client_lock
);
10742 ldout(cct
, 3) << "ll_lookup_inode " << ino
<< dendl
;
10747 // Num1: get inode and *inode
10748 int r
= _lookup_ino(ino
, perms
, inode
);
10752 ceph_assert(*inode
!= NULL
);
10754 if (!(*inode
)->dentries
.empty()) {
10755 ldout(cct
, 8) << __func__
<< " dentry already present" << dendl
;
10759 if ((*inode
)->is_root()) {
10760 ldout(cct
, 8) << "ino is root, no parent" << dendl
;
10764 // Num2: Request the parent inode, so that we can look up the name
10766 r
= _lookup_parent(*inode
, perms
, &parent
);
10768 _ll_forget(*inode
, 1);
10772 ceph_assert(parent
!= NULL
);
10774 // Num3: Finally, get the name (dentry) of the requested inode
10775 r
= _lookup_name(*inode
, parent
, perms
);
10777 // Unexpected error
10778 _ll_forget(parent
, 1);
10779 _ll_forget(*inode
, 1);
10783 _ll_forget(parent
, 1);
10787 int Client::ll_lookupx(Inode
*parent
, const char *name
, Inode
**out
,
10788 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
10789 const UserPerm
& perms
)
10791 std::lock_guard
lock(client_lock
);
10792 vinodeno_t vparent
= _get_vino(parent
);
10793 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
<< dendl
;
10794 tout(cct
) << "ll_lookupx" << std::endl
;
10795 tout(cct
) << name
<< std::endl
;
10801 if (!fuse_default_permissions
) {
10802 r
= may_lookup(parent
, perms
);
10807 string
dname(name
);
10810 unsigned mask
= statx_to_mask(flags
, want
);
10811 r
= _lookup(parent
, dname
, mask
, &in
, perms
);
10817 fill_statx(in
, mask
, stx
);
10821 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
10822 << " -> " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
10823 tout(cct
) << stx
->stx_ino
<< std::endl
;
10828 int Client::ll_walk(const char* name
, Inode
**out
, struct ceph_statx
*stx
,
10829 unsigned int want
, unsigned int flags
, const UserPerm
& perms
)
10831 std::lock_guard
lock(client_lock
);
10836 filepath
fp(name
, 0);
10839 unsigned mask
= statx_to_mask(flags
, want
);
10841 ldout(cct
, 3) << __func__
<< " " << name
<< dendl
;
10842 tout(cct
) << __func__
<< std::endl
;
10843 tout(cct
) << name
<< std::endl
;
10845 rc
= path_walk(fp
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
);
10847 /* zero out mask, just in case... */
10854 fill_statx(in
, mask
, stx
);
10861 void Client::_ll_get(Inode
*in
)
10863 if (in
->ll_ref
== 0) {
10865 if (in
->is_dir() && !in
->dentries
.empty()) {
10866 ceph_assert(in
->dentries
.size() == 1); // dirs can't be hard-linked
10867 in
->get_first_parent()->get(); // pin dentry
10869 if (in
->snapid
!= CEPH_NOSNAP
)
10870 ll_snap_ref
[in
->snapid
]++;
10873 ldout(cct
, 20) << __func__
<< " " << in
<< " " << in
->ino
<< " -> " << in
->ll_ref
<< dendl
;
10876 int Client::_ll_put(Inode
*in
, uint64_t num
)
10879 ldout(cct
, 20) << __func__
<< " " << in
<< " " << in
->ino
<< " " << num
<< " -> " << in
->ll_ref
<< dendl
;
10880 if (in
->ll_ref
== 0) {
10881 if (in
->is_dir() && !in
->dentries
.empty()) {
10882 ceph_assert(in
->dentries
.size() == 1); // dirs can't be hard-linked
10883 in
->get_first_parent()->put(); // unpin dentry
10885 if (in
->snapid
!= CEPH_NOSNAP
) {
10886 auto p
= ll_snap_ref
.find(in
->snapid
);
10887 ceph_assert(p
!= ll_snap_ref
.end());
10888 ceph_assert(p
->second
> 0);
10889 if (--p
->second
== 0)
10890 ll_snap_ref
.erase(p
);
10899 void Client::_ll_drop_pins()
10901 ldout(cct
, 10) << __func__
<< dendl
;
10902 std::set
<InodeRef
> to_be_put
; //this set will be deconstructed item by item when exit
10903 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator next
;
10904 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
10905 it
!= inode_map
.end();
10907 Inode
*in
= it
->second
;
10911 to_be_put
.insert(in
);
10912 _ll_put(in
, in
->ll_ref
);
10917 bool Client::_ll_forget(Inode
*in
, uint64_t count
)
10919 inodeno_t ino
= in
->ino
;
10921 ldout(cct
, 8) << __func__
<< " " << ino
<< " " << count
<< dendl
;
10922 tout(cct
) << __func__
<< std::endl
;
10923 tout(cct
) << ino
.val
<< std::endl
;
10924 tout(cct
) << count
<< std::endl
;
10926 // Ignore forget if we're no longer mounted
10930 if (ino
== 1) return true; // ignore forget on root.
10933 if (in
->ll_ref
< count
) {
10934 ldout(cct
, 1) << "WARNING: ll_forget on " << ino
<< " " << count
10935 << ", which only has ll_ref=" << in
->ll_ref
<< dendl
;
10936 _ll_put(in
, in
->ll_ref
);
10939 if (_ll_put(in
, count
) == 0)
10946 bool Client::ll_forget(Inode
*in
, uint64_t count
)
10948 std::lock_guard
lock(client_lock
);
10949 return _ll_forget(in
, count
);
10952 bool Client::ll_put(Inode
*in
)
10954 /* ll_forget already takes the lock */
10955 return ll_forget(in
, 1);
10958 int Client::ll_get_snap_ref(snapid_t snap
)
10960 std::lock_guard
lock(client_lock
);
10961 auto p
= ll_snap_ref
.find(snap
);
10962 if (p
!= ll_snap_ref
.end())
10967 snapid_t
Client::ll_get_snapid(Inode
*in
)
10969 std::lock_guard
lock(client_lock
);
10973 Inode
*Client::ll_get_inode(ino_t ino
)
10975 std::lock_guard
lock(client_lock
);
10980 vinodeno_t vino
= _map_faked_ino(ino
);
10981 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
10982 if (p
== inode_map
.end())
10984 Inode
*in
= p
->second
;
10989 Inode
*Client::ll_get_inode(vinodeno_t vino
)
10991 std::lock_guard
lock(client_lock
);
10996 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
10997 if (p
== inode_map
.end())
10999 Inode
*in
= p
->second
;
11004 int Client::_ll_getattr(Inode
*in
, int caps
, const UserPerm
& perms
)
11006 vinodeno_t vino
= _get_vino(in
);
11008 ldout(cct
, 8) << __func__
<< " " << vino
<< dendl
;
11009 tout(cct
) << __func__
<< std::endl
;
11010 tout(cct
) << vino
.ino
.val
<< std::endl
;
11012 if (vino
.snapid
< CEPH_NOSNAP
)
11015 return _getattr(in
, caps
, perms
);
11018 int Client::ll_getattr(Inode
*in
, struct stat
*attr
, const UserPerm
& perms
)
11020 std::lock_guard
lock(client_lock
);
11025 int res
= _ll_getattr(in
, CEPH_STAT_CAP_INODE_ALL
, perms
);
11028 fill_stat(in
, attr
);
11029 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
11033 int Client::ll_getattrx(Inode
*in
, struct ceph_statx
*stx
, unsigned int want
,
11034 unsigned int flags
, const UserPerm
& perms
)
11036 std::lock_guard
lock(client_lock
);
11042 unsigned mask
= statx_to_mask(flags
, want
);
11044 if (mask
&& !in
->caps_issued_mask(mask
, true))
11045 res
= _ll_getattr(in
, mask
, perms
);
11048 fill_statx(in
, mask
, stx
);
11049 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
11053 int Client::_ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
11054 const UserPerm
& perms
, InodeRef
*inp
)
11056 vinodeno_t vino
= _get_vino(in
);
11058 ldout(cct
, 8) << __func__
<< " " << vino
<< " mask " << hex
<< mask
<< dec
11060 tout(cct
) << __func__
<< std::endl
;
11061 tout(cct
) << vino
.ino
.val
<< std::endl
;
11062 tout(cct
) << stx
->stx_mode
<< std::endl
;
11063 tout(cct
) << stx
->stx_uid
<< std::endl
;
11064 tout(cct
) << stx
->stx_gid
<< std::endl
;
11065 tout(cct
) << stx
->stx_size
<< std::endl
;
11066 tout(cct
) << stx
->stx_mtime
<< std::endl
;
11067 tout(cct
) << stx
->stx_atime
<< std::endl
;
11068 tout(cct
) << stx
->stx_btime
<< std::endl
;
11069 tout(cct
) << mask
<< std::endl
;
11071 if (!fuse_default_permissions
) {
11072 int res
= may_setattr(in
, stx
, mask
, perms
);
11077 mask
&= ~(CEPH_SETATTR_MTIME_NOW
| CEPH_SETATTR_ATIME_NOW
);
11079 return __setattrx(in
, stx
, mask
, perms
, inp
);
11082 int Client::ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
11083 const UserPerm
& perms
)
11085 std::lock_guard
lock(client_lock
);
11090 InodeRef
target(in
);
11091 int res
= _ll_setattrx(in
, stx
, mask
, perms
, &target
);
11093 ceph_assert(in
== target
.get());
11094 fill_statx(in
, in
->caps_issued(), stx
);
11097 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
11101 int Client::ll_setattr(Inode
*in
, struct stat
*attr
, int mask
,
11102 const UserPerm
& perms
)
11104 struct ceph_statx stx
;
11105 stat_to_statx(attr
, &stx
);
11107 std::lock_guard
lock(client_lock
);
11112 InodeRef
target(in
);
11113 int res
= _ll_setattrx(in
, &stx
, mask
, perms
, &target
);
11115 ceph_assert(in
== target
.get());
11116 fill_stat(in
, attr
);
11119 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
11127 int Client::getxattr(const char *path
, const char *name
, void *value
, size_t size
,
11128 const UserPerm
& perms
)
11130 std::lock_guard
lock(client_lock
);
11136 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
11139 return _getxattr(in
, name
, value
, size
, perms
);
11142 int Client::lgetxattr(const char *path
, const char *name
, void *value
, size_t size
,
11143 const UserPerm
& perms
)
11145 std::lock_guard
lock(client_lock
);
11151 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
11154 return _getxattr(in
, name
, value
, size
, perms
);
11157 int Client::fgetxattr(int fd
, const char *name
, void *value
, size_t size
,
11158 const UserPerm
& perms
)
11160 std::lock_guard
lock(client_lock
);
11165 Fh
*f
= get_filehandle(fd
);
11168 return _getxattr(f
->inode
, name
, value
, size
, perms
);
11171 int Client::listxattr(const char *path
, char *list
, size_t size
,
11172 const UserPerm
& perms
)
11174 std::lock_guard
lock(client_lock
);
11180 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
11183 return Client::_listxattr(in
.get(), list
, size
, perms
);
11186 int Client::llistxattr(const char *path
, char *list
, size_t size
,
11187 const UserPerm
& perms
)
11189 std::lock_guard
lock(client_lock
);
11195 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
11198 return Client::_listxattr(in
.get(), list
, size
, perms
);
11201 int Client::flistxattr(int fd
, char *list
, size_t size
, const UserPerm
& perms
)
11203 std::lock_guard
lock(client_lock
);
11208 Fh
*f
= get_filehandle(fd
);
11211 return Client::_listxattr(f
->inode
.get(), list
, size
, perms
);
11214 int Client::removexattr(const char *path
, const char *name
,
11215 const UserPerm
& perms
)
11217 std::lock_guard
lock(client_lock
);
11223 int r
= Client::path_walk(path
, &in
, perms
, true);
11226 return _removexattr(in
, name
, perms
);
11229 int Client::lremovexattr(const char *path
, const char *name
,
11230 const UserPerm
& perms
)
11232 std::lock_guard
lock(client_lock
);
11238 int r
= Client::path_walk(path
, &in
, perms
, false);
11241 return _removexattr(in
, name
, perms
);
11244 int Client::fremovexattr(int fd
, const char *name
, const UserPerm
& perms
)
11246 std::lock_guard
lock(client_lock
);
11251 Fh
*f
= get_filehandle(fd
);
11254 return _removexattr(f
->inode
, name
, perms
);
11257 int Client::setxattr(const char *path
, const char *name
, const void *value
,
11258 size_t size
, int flags
, const UserPerm
& perms
)
11260 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11262 std::lock_guard
lock(client_lock
);
11268 int r
= Client::path_walk(path
, &in
, perms
, true);
11271 return _setxattr(in
, name
, value
, size
, flags
, perms
);
11274 int Client::lsetxattr(const char *path
, const char *name
, const void *value
,
11275 size_t size
, int flags
, const UserPerm
& perms
)
11277 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11279 std::lock_guard
lock(client_lock
);
11285 int r
= Client::path_walk(path
, &in
, perms
, false);
11288 return _setxattr(in
, name
, value
, size
, flags
, perms
);
11291 int Client::fsetxattr(int fd
, const char *name
, const void *value
, size_t size
,
11292 int flags
, const UserPerm
& perms
)
11294 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11296 std::lock_guard
lock(client_lock
);
11301 Fh
*f
= get_filehandle(fd
);
11304 return _setxattr(f
->inode
, name
, value
, size
, flags
, perms
);
11307 int Client::_getxattr(Inode
*in
, const char *name
, void *value
, size_t size
,
11308 const UserPerm
& perms
)
11312 const VXattr
*vxattr
= _match_vxattr(in
, name
);
11316 // Do a force getattr to get the latest quota before returning
11317 // a value to userspace.
11319 if (vxattr
->flags
& VXATTR_RSTAT
) {
11320 flags
|= CEPH_STAT_RSTAT
;
11322 r
= _getattr(in
, flags
, perms
, true);
11324 // Error from getattr!
11328 // call pointer-to-member function
11330 if (!(vxattr
->exists_cb
&& !(this->*(vxattr
->exists_cb
))(in
))) {
11331 r
= (this->*(vxattr
->getxattr_cb
))(in
, buf
, sizeof(buf
));
11337 if (r
> (int)size
) {
11339 } else if (r
> 0) {
11340 memcpy(value
, buf
, r
);
11346 if (acl_type
== NO_ACL
&& !strncmp(name
, "system.", 7)) {
11351 r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
11355 if (in
->xattrs
.count(n
)) {
11356 r
= in
->xattrs
[n
].length();
11357 if (r
> 0 && size
!= 0) {
11358 if (size
>= (unsigned)r
)
11359 memcpy(value
, in
->xattrs
[n
].c_str(), r
);
11366 ldout(cct
, 8) << "_getxattr(" << in
->ino
<< ", \"" << name
<< "\", " << size
<< ") = " << r
<< dendl
;
11370 int Client::_getxattr(InodeRef
&in
, const char *name
, void *value
, size_t size
,
11371 const UserPerm
& perms
)
11373 if (cct
->_conf
->client_permissions
) {
11374 int r
= xattr_permission(in
.get(), name
, MAY_READ
, perms
);
11378 return _getxattr(in
.get(), name
, value
, size
, perms
);
11381 int Client::ll_getxattr(Inode
*in
, const char *name
, void *value
,
11382 size_t size
, const UserPerm
& perms
)
11384 std::lock_guard
lock(client_lock
);
11389 vinodeno_t vino
= _get_vino(in
);
11391 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
11392 tout(cct
) << __func__
<< std::endl
;
11393 tout(cct
) << vino
.ino
.val
<< std::endl
;
11394 tout(cct
) << name
<< std::endl
;
11396 if (!fuse_default_permissions
) {
11397 int r
= xattr_permission(in
, name
, MAY_READ
, perms
);
11402 return _getxattr(in
, name
, value
, size
, perms
);
11405 int Client::_listxattr(Inode
*in
, char *name
, size_t size
,
11406 const UserPerm
& perms
)
11408 bool len_only
= (size
== 0);
11409 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
11415 for (const auto& p
: in
->xattrs
) {
11416 size_t this_len
= p
.first
.length() + 1;
11421 if (this_len
> size
) {
11426 memcpy(name
, p
.first
.c_str(), this_len
);
11431 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ", " << size
<< ") = " << r
<< dendl
;
11435 int Client::ll_listxattr(Inode
*in
, char *names
, size_t size
,
11436 const UserPerm
& perms
)
11438 std::lock_guard
lock(client_lock
);
11443 vinodeno_t vino
= _get_vino(in
);
11445 ldout(cct
, 3) << __func__
<< " " << vino
<< " size " << size
<< dendl
;
11446 tout(cct
) << __func__
<< std::endl
;
11447 tout(cct
) << vino
.ino
.val
<< std::endl
;
11448 tout(cct
) << size
<< std::endl
;
11450 return _listxattr(in
, names
, size
, perms
);
11453 int Client::_do_setxattr(Inode
*in
, const char *name
, const void *value
,
11454 size_t size
, int flags
, const UserPerm
& perms
)
11457 int xattr_flags
= 0;
11459 xattr_flags
|= CEPH_XATTR_REMOVE
;
11460 if (flags
& XATTR_CREATE
)
11461 xattr_flags
|= CEPH_XATTR_CREATE
;
11462 if (flags
& XATTR_REPLACE
)
11463 xattr_flags
|= CEPH_XATTR_REPLACE
;
11465 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETXATTR
);
11467 in
->make_nosnap_relative_path(path
);
11468 req
->set_filepath(path
);
11469 req
->set_string2(name
);
11470 req
->set_inode(in
);
11471 req
->head
.args
.setxattr
.flags
= xattr_flags
;
11474 assert (value
|| size
== 0);
11475 bl
.append((const char*)value
, size
);
11478 int res
= make_request(req
, perms
);
11481 ldout(cct
, 3) << __func__
<< "(" << in
->ino
<< ", \"" << name
<< "\") = " <<
11486 int Client::_setxattr(Inode
*in
, const char *name
, const void *value
,
11487 size_t size
, int flags
, const UserPerm
& perms
)
11489 if (in
->snapid
!= CEPH_NOSNAP
) {
11493 bool posix_acl_xattr
= false;
11494 if (acl_type
== POSIX_ACL
)
11495 posix_acl_xattr
= !strncmp(name
, "system.", 7);
11497 if (strncmp(name
, "user.", 5) &&
11498 strncmp(name
, "security.", 9) &&
11499 strncmp(name
, "trusted.", 8) &&
11500 strncmp(name
, "ceph.", 5) &&
11502 return -EOPNOTSUPP
;
11504 bool check_realm
= false;
11506 if (posix_acl_xattr
) {
11507 if (!strcmp(name
, ACL_EA_ACCESS
)) {
11508 mode_t new_mode
= in
->mode
;
11510 int ret
= posix_acl_equiv_mode(value
, size
, &new_mode
);
11517 if (new_mode
!= in
->mode
) {
11518 struct ceph_statx stx
;
11519 stx
.stx_mode
= new_mode
;
11520 ret
= _do_setattr(in
, &stx
, CEPH_SETATTR_MODE
, perms
, NULL
);
11525 } else if (!strcmp(name
, ACL_EA_DEFAULT
)) {
11527 if (!S_ISDIR(in
->mode
))
11529 int ret
= posix_acl_check(value
, size
);
11538 return -EOPNOTSUPP
;
11541 const VXattr
*vxattr
= _match_vxattr(in
, name
);
11543 if (vxattr
->readonly
)
11544 return -EOPNOTSUPP
;
11545 if (vxattr
->name
.compare(0, 10, "ceph.quota") == 0 && value
)
11546 check_realm
= true;
11550 int ret
= _do_setxattr(in
, name
, value
, size
, flags
, perms
);
11551 if (ret
>= 0 && check_realm
) {
11552 // check if snaprealm was created for quota inode
11553 if (in
->quota
.is_enable() &&
11554 !(in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
))
11561 int Client::_setxattr(InodeRef
&in
, const char *name
, const void *value
,
11562 size_t size
, int flags
, const UserPerm
& perms
)
11564 if (cct
->_conf
->client_permissions
) {
11565 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
11569 return _setxattr(in
.get(), name
, value
, size
, flags
, perms
);
11572 int Client::_setxattr_check_data_pool(string
& name
, string
& value
, const OSDMap
*osdmap
)
11575 if (name
== "layout") {
11576 string::iterator begin
= value
.begin();
11577 string::iterator end
= value
.end();
11578 keys_and_values
<string::iterator
> p
; // create instance of parser
11579 std::map
<string
, string
> m
; // map to receive results
11580 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
11585 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
11586 if (q
->first
== "pool") {
11591 } else if (name
== "layout.pool") {
11595 if (tmp
.length()) {
11598 pool
= boost::lexical_cast
<unsigned>(tmp
);
11599 if (!osdmap
->have_pg_pool(pool
))
11601 } catch (boost::bad_lexical_cast
const&) {
11602 pool
= osdmap
->lookup_pg_pool_name(tmp
);
11612 void Client::_setxattr_maybe_wait_for_osdmap(const char *name
, const void *value
, size_t size
)
11614 // For setting pool of layout, MetaRequest need osdmap epoch.
11615 // There is a race which create a new data pool but client and mds both don't have.
11616 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11617 if (strcmp(name
, "ceph.file.layout.pool") == 0 || strcmp(name
, "ceph.dir.layout.pool") == 0 ||
11618 strcmp(name
, "ceph.file.layout") == 0 || strcmp(name
, "ceph.dir.layout") == 0) {
11619 string
rest(strstr(name
, "layout"));
11620 string
v((const char*)value
, size
);
11621 int r
= objecter
->with_osdmap([&](const OSDMap
& o
) {
11622 return _setxattr_check_data_pool(rest
, v
, &o
);
11625 if (r
== -ENOENT
) {
11627 objecter
->wait_for_latest_osdmap(&ctx
);
11633 int Client::ll_setxattr(Inode
*in
, const char *name
, const void *value
,
11634 size_t size
, int flags
, const UserPerm
& perms
)
11636 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11638 std::lock_guard
lock(client_lock
);
11643 vinodeno_t vino
= _get_vino(in
);
11645 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
11646 tout(cct
) << __func__
<< std::endl
;
11647 tout(cct
) << vino
.ino
.val
<< std::endl
;
11648 tout(cct
) << name
<< std::endl
;
11650 if (!fuse_default_permissions
) {
11651 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
11655 return _setxattr(in
, name
, value
, size
, flags
, perms
);
11658 int Client::_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
11660 if (in
->snapid
!= CEPH_NOSNAP
) {
11664 // same xattrs supported by kernel client
11665 if (strncmp(name
, "user.", 5) &&
11666 strncmp(name
, "system.", 7) &&
11667 strncmp(name
, "security.", 9) &&
11668 strncmp(name
, "trusted.", 8) &&
11669 strncmp(name
, "ceph.", 5))
11670 return -EOPNOTSUPP
;
11672 const VXattr
*vxattr
= _match_vxattr(in
, name
);
11673 if (vxattr
&& vxattr
->readonly
)
11674 return -EOPNOTSUPP
;
11676 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_RMXATTR
);
11678 in
->make_nosnap_relative_path(path
);
11679 req
->set_filepath(path
);
11680 req
->set_filepath2(name
);
11681 req
->set_inode(in
);
11683 int res
= make_request(req
, perms
);
11686 ldout(cct
, 8) << "_removexattr(" << in
->ino
<< ", \"" << name
<< "\") = " << res
<< dendl
;
11690 int Client::_removexattr(InodeRef
&in
, const char *name
, const UserPerm
& perms
)
11692 if (cct
->_conf
->client_permissions
) {
11693 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
11697 return _removexattr(in
.get(), name
, perms
);
11700 int Client::ll_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
11702 std::lock_guard
lock(client_lock
);
11707 vinodeno_t vino
= _get_vino(in
);
11709 ldout(cct
, 3) << "ll_removexattr " << vino
<< " " << name
<< dendl
;
11710 tout(cct
) << "ll_removexattr" << std::endl
;
11711 tout(cct
) << vino
.ino
.val
<< std::endl
;
11712 tout(cct
) << name
<< std::endl
;
11714 if (!fuse_default_permissions
) {
11715 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
11720 return _removexattr(in
, name
, perms
);
11723 bool Client::_vxattrcb_quota_exists(Inode
*in
)
11725 return in
->quota
.is_enable() &&
11726 in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
;
11728 size_t Client::_vxattrcb_quota(Inode
*in
, char *val
, size_t size
)
11730 return snprintf(val
, size
,
11731 "max_bytes=%lld max_files=%lld",
11732 (long long int)in
->quota
.max_bytes
,
11733 (long long int)in
->quota
.max_files
);
11735 size_t Client::_vxattrcb_quota_max_bytes(Inode
*in
, char *val
, size_t size
)
11737 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_bytes
);
11739 size_t Client::_vxattrcb_quota_max_files(Inode
*in
, char *val
, size_t size
)
11741 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_files
);
11744 bool Client::_vxattrcb_layout_exists(Inode
*in
)
11746 return in
->layout
!= file_layout_t();
11748 size_t Client::_vxattrcb_layout(Inode
*in
, char *val
, size_t size
)
11750 int r
= snprintf(val
, size
,
11751 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
11752 (unsigned long long)in
->layout
.stripe_unit
,
11753 (unsigned long long)in
->layout
.stripe_count
,
11754 (unsigned long long)in
->layout
.object_size
);
11755 objecter
->with_osdmap([&](const OSDMap
& o
) {
11756 if (o
.have_pg_pool(in
->layout
.pool_id
))
11757 r
+= snprintf(val
+ r
, size
- r
, "%s",
11758 o
.get_pool_name(in
->layout
.pool_id
).c_str());
11760 r
+= snprintf(val
+ r
, size
- r
, "%" PRIu64
,
11761 (uint64_t)in
->layout
.pool_id
);
11763 if (in
->layout
.pool_ns
.length())
11764 r
+= snprintf(val
+ r
, size
- r
, " pool_namespace=%s",
11765 in
->layout
.pool_ns
.c_str());
11768 size_t Client::_vxattrcb_layout_stripe_unit(Inode
*in
, char *val
, size_t size
)
11770 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_unit
);
11772 size_t Client::_vxattrcb_layout_stripe_count(Inode
*in
, char *val
, size_t size
)
11774 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_count
);
11776 size_t Client::_vxattrcb_layout_object_size(Inode
*in
, char *val
, size_t size
)
11778 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.object_size
);
11780 size_t Client::_vxattrcb_layout_pool(Inode
*in
, char *val
, size_t size
)
11783 objecter
->with_osdmap([&](const OSDMap
& o
) {
11784 if (o
.have_pg_pool(in
->layout
.pool_id
))
11785 r
= snprintf(val
, size
, "%s", o
.get_pool_name(
11786 in
->layout
.pool_id
).c_str());
11788 r
= snprintf(val
, size
, "%" PRIu64
, (uint64_t)in
->layout
.pool_id
);
11792 size_t Client::_vxattrcb_layout_pool_namespace(Inode
*in
, char *val
, size_t size
)
11794 return snprintf(val
, size
, "%s", in
->layout
.pool_ns
.c_str());
11796 size_t Client::_vxattrcb_dir_entries(Inode
*in
, char *val
, size_t size
)
11798 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->dirstat
.nfiles
+ in
->dirstat
.nsubdirs
));
11800 size_t Client::_vxattrcb_dir_files(Inode
*in
, char *val
, size_t size
)
11802 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nfiles
);
11804 size_t Client::_vxattrcb_dir_subdirs(Inode
*in
, char *val
, size_t size
)
11806 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nsubdirs
);
11808 size_t Client::_vxattrcb_dir_rentries(Inode
*in
, char *val
, size_t size
)
11810 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->rstat
.rfiles
+ in
->rstat
.rsubdirs
));
11812 size_t Client::_vxattrcb_dir_rfiles(Inode
*in
, char *val
, size_t size
)
11814 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rfiles
);
11816 size_t Client::_vxattrcb_dir_rsubdirs(Inode
*in
, char *val
, size_t size
)
11818 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rsubdirs
);
11820 size_t Client::_vxattrcb_dir_rbytes(Inode
*in
, char *val
, size_t size
)
11822 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rbytes
);
11824 size_t Client::_vxattrcb_dir_rctime(Inode
*in
, char *val
, size_t size
)
11826 return snprintf(val
, size
, "%ld.%09ld", (long)in
->rstat
.rctime
.sec(),
11827 (long)in
->rstat
.rctime
.nsec());
11829 bool Client::_vxattrcb_dir_pin_exists(Inode
*in
)
11831 return in
->dir_pin
!= -ENODATA
;
11833 size_t Client::_vxattrcb_dir_pin(Inode
*in
, char *val
, size_t size
)
11835 return snprintf(val
, size
, "%ld", (long)in
->dir_pin
);
11838 bool Client::_vxattrcb_snap_btime_exists(Inode
*in
)
11840 return !in
->snap_btime
.is_zero();
11843 size_t Client::_vxattrcb_snap_btime(Inode
*in
, char *val
, size_t size
)
11845 return snprintf(val
, size
, "%llu.%09lu",
11846 (long long unsigned)in
->snap_btime
.sec(),
11847 (long unsigned)in
->snap_btime
.nsec());
11850 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
11851 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
11853 #define XATTR_NAME_CEPH(_type, _name) \
11855 name: CEPH_XATTR_NAME(_type, _name), \
11856 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11861 #define XATTR_NAME_CEPH2(_type, _name, _flags) \
11863 name: CEPH_XATTR_NAME(_type, _name), \
11864 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11869 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
11871 name: CEPH_XATTR_NAME2(_type, _name, _field), \
11872 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
11874 exists_cb: &Client::_vxattrcb_layout_exists, \
11877 #define XATTR_QUOTA_FIELD(_type, _name) \
11879 name: CEPH_XATTR_NAME(_type, _name), \
11880 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11882 exists_cb: &Client::_vxattrcb_quota_exists, \
11886 const Client::VXattr
Client::_dir_vxattrs
[] = {
11888 name
: "ceph.dir.layout",
11889 getxattr_cb
: &Client::_vxattrcb_layout
,
11891 exists_cb
: &Client::_vxattrcb_layout_exists
,
11894 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_unit
),
11895 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_count
),
11896 XATTR_LAYOUT_FIELD(dir
, layout
, object_size
),
11897 XATTR_LAYOUT_FIELD(dir
, layout
, pool
),
11898 XATTR_LAYOUT_FIELD(dir
, layout
, pool_namespace
),
11899 XATTR_NAME_CEPH(dir
, entries
),
11900 XATTR_NAME_CEPH(dir
, files
),
11901 XATTR_NAME_CEPH(dir
, subdirs
),
11902 XATTR_NAME_CEPH2(dir
, rentries
, VXATTR_RSTAT
),
11903 XATTR_NAME_CEPH2(dir
, rfiles
, VXATTR_RSTAT
),
11904 XATTR_NAME_CEPH2(dir
, rsubdirs
, VXATTR_RSTAT
),
11905 XATTR_NAME_CEPH2(dir
, rbytes
, VXATTR_RSTAT
),
11906 XATTR_NAME_CEPH2(dir
, rctime
, VXATTR_RSTAT
),
11908 name
: "ceph.quota",
11909 getxattr_cb
: &Client::_vxattrcb_quota
,
11911 exists_cb
: &Client::_vxattrcb_quota_exists
,
11914 XATTR_QUOTA_FIELD(quota
, max_bytes
),
11915 XATTR_QUOTA_FIELD(quota
, max_files
),
11917 name
: "ceph.dir.pin",
11918 getxattr_cb
: &Client::_vxattrcb_dir_pin
,
11920 exists_cb
: &Client::_vxattrcb_dir_pin_exists
,
11924 name
: "ceph.snap.btime",
11925 getxattr_cb
: &Client::_vxattrcb_snap_btime
,
11927 exists_cb
: &Client::_vxattrcb_snap_btime_exists
,
11930 { name
: "" } /* Required table terminator */
11933 const Client::VXattr
Client::_file_vxattrs
[] = {
11935 name
: "ceph.file.layout",
11936 getxattr_cb
: &Client::_vxattrcb_layout
,
11938 exists_cb
: &Client::_vxattrcb_layout_exists
,
11941 XATTR_LAYOUT_FIELD(file
, layout
, stripe_unit
),
11942 XATTR_LAYOUT_FIELD(file
, layout
, stripe_count
),
11943 XATTR_LAYOUT_FIELD(file
, layout
, object_size
),
11944 XATTR_LAYOUT_FIELD(file
, layout
, pool
),
11945 XATTR_LAYOUT_FIELD(file
, layout
, pool_namespace
),
11947 name
: "ceph.snap.btime",
11948 getxattr_cb
: &Client::_vxattrcb_snap_btime
,
11950 exists_cb
: &Client::_vxattrcb_snap_btime_exists
,
11953 { name
: "" } /* Required table terminator */
11956 const Client::VXattr
*Client::_get_vxattrs(Inode
*in
)
11959 return _dir_vxattrs
;
11960 else if (in
->is_file())
11961 return _file_vxattrs
;
11965 const Client::VXattr
*Client::_match_vxattr(Inode
*in
, const char *name
)
11967 if (strncmp(name
, "ceph.", 5) == 0) {
11968 const VXattr
*vxattr
= _get_vxattrs(in
);
11970 while (!vxattr
->name
.empty()) {
11971 if (vxattr
->name
== name
)
11980 int Client::ll_readlink(Inode
*in
, char *buf
, size_t buflen
, const UserPerm
& perms
)
11982 std::lock_guard
lock(client_lock
);
11987 vinodeno_t vino
= _get_vino(in
);
11989 ldout(cct
, 3) << "ll_readlink " << vino
<< dendl
;
11990 tout(cct
) << "ll_readlink" << std::endl
;
11991 tout(cct
) << vino
.ino
.val
<< std::endl
;
11993 for (auto dn
: in
->dentries
) {
11997 int r
= _readlink(in
, buf
, buflen
); // FIXME: no permission checking!
11998 ldout(cct
, 3) << "ll_readlink " << vino
<< " = " << r
<< dendl
;
12002 int Client::_mknod(Inode
*dir
, const char *name
, mode_t mode
, dev_t rdev
,
12003 const UserPerm
& perms
, InodeRef
*inp
)
12005 ldout(cct
, 8) << "_mknod(" << dir
->ino
<< " " << name
<< ", 0" << oct
12006 << mode
<< dec
<< ", " << rdev
<< ", uid " << perms
.uid()
12007 << ", gid " << perms
.gid() << ")" << dendl
;
12009 if (strlen(name
) > NAME_MAX
)
12010 return -ENAMETOOLONG
;
12012 if (dir
->snapid
!= CEPH_NOSNAP
) {
12015 if (is_quota_files_exceeded(dir
, perms
)) {
12019 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_MKNOD
);
12022 dir
->make_nosnap_relative_path(path
);
12023 path
.push_dentry(name
);
12024 req
->set_filepath(path
);
12025 req
->set_inode(dir
);
12026 req
->head
.args
.mknod
.rdev
= rdev
;
12027 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12028 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12030 bufferlist xattrs_bl
;
12031 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
12034 req
->head
.args
.mknod
.mode
= mode
;
12035 if (xattrs_bl
.length() > 0)
12036 req
->set_data(xattrs_bl
);
12039 res
= get_or_create(dir
, name
, &de
);
12042 req
->set_dentry(de
);
12044 res
= make_request(req
, perms
, inp
);
12048 ldout(cct
, 8) << "mknod(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
12056 int Client::ll_mknod(Inode
*parent
, const char *name
, mode_t mode
,
12057 dev_t rdev
, struct stat
*attr
, Inode
**out
,
12058 const UserPerm
& perms
)
12060 std::lock_guard
lock(client_lock
);
12065 vinodeno_t vparent
= _get_vino(parent
);
12067 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
<< dendl
;
12068 tout(cct
) << "ll_mknod" << std::endl
;
12069 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12070 tout(cct
) << name
<< std::endl
;
12071 tout(cct
) << mode
<< std::endl
;
12072 tout(cct
) << rdev
<< std::endl
;
12074 if (!fuse_default_permissions
) {
12075 int r
= may_create(parent
, perms
);
12081 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
12083 fill_stat(in
, attr
);
12086 tout(cct
) << attr
->st_ino
<< std::endl
;
12087 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
12088 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
12093 int Client::ll_mknodx(Inode
*parent
, const char *name
, mode_t mode
,
12094 dev_t rdev
, Inode
**out
,
12095 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
12096 const UserPerm
& perms
)
12098 unsigned caps
= statx_to_mask(flags
, want
);
12099 std::lock_guard
lock(client_lock
);
12104 vinodeno_t vparent
= _get_vino(parent
);
12106 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
<< dendl
;
12107 tout(cct
) << "ll_mknodx" << std::endl
;
12108 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12109 tout(cct
) << name
<< std::endl
;
12110 tout(cct
) << mode
<< std::endl
;
12111 tout(cct
) << rdev
<< std::endl
;
12113 if (!fuse_default_permissions
) {
12114 int r
= may_create(parent
, perms
);
12120 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
12122 fill_statx(in
, caps
, stx
);
12125 tout(cct
) << stx
->stx_ino
<< std::endl
;
12126 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
12127 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12132 int Client::_create(Inode
*dir
, const char *name
, int flags
, mode_t mode
,
12133 InodeRef
*inp
, Fh
**fhp
, int stripe_unit
, int stripe_count
,
12134 int object_size
, const char *data_pool
, bool *created
,
12135 const UserPerm
& perms
)
12137 ldout(cct
, 8) << "_create(" << dir
->ino
<< " " << name
<< ", 0" << oct
<<
12138 mode
<< dec
<< ")" << dendl
;
12140 if (strlen(name
) > NAME_MAX
)
12141 return -ENAMETOOLONG
;
12142 if (dir
->snapid
!= CEPH_NOSNAP
) {
12145 if (is_quota_files_exceeded(dir
, perms
)) {
12149 // use normalized flags to generate cmode
12150 int cflags
= ceph_flags_sys2wire(flags
);
12151 if (cct
->_conf
.get_val
<bool>("client_force_lazyio"))
12152 cflags
|= CEPH_O_LAZY
;
12154 int cmode
= ceph_flags_to_mode(cflags
);
12156 int64_t pool_id
= -1;
12157 if (data_pool
&& *data_pool
) {
12158 pool_id
= objecter
->with_osdmap(
12159 std::mem_fn(&OSDMap::lookup_pg_pool_name
), data_pool
);
12162 if (pool_id
> 0xffffffffll
)
12163 return -ERANGE
; // bummer!
12166 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_CREATE
);
12169 dir
->make_nosnap_relative_path(path
);
12170 path
.push_dentry(name
);
12171 req
->set_filepath(path
);
12172 req
->set_inode(dir
);
12173 req
->head
.args
.open
.flags
= cflags
| CEPH_O_CREAT
;
12175 req
->head
.args
.open
.stripe_unit
= stripe_unit
;
12176 req
->head
.args
.open
.stripe_count
= stripe_count
;
12177 req
->head
.args
.open
.object_size
= object_size
;
12178 if (cct
->_conf
->client_debug_getattr_caps
)
12179 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
12181 req
->head
.args
.open
.mask
= 0;
12182 req
->head
.args
.open
.pool
= pool_id
;
12183 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12184 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12187 bufferlist xattrs_bl
;
12188 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
12191 req
->head
.args
.open
.mode
= mode
;
12192 if (xattrs_bl
.length() > 0)
12193 req
->set_data(xattrs_bl
);
12196 res
= get_or_create(dir
, name
, &de
);
12199 req
->set_dentry(de
);
12201 res
= make_request(req
, perms
, inp
, created
);
12206 /* If the caller passed a value in fhp, do the open */
12208 (*inp
)->get_open_ref(cmode
);
12209 *fhp
= _create_fh(inp
->get(), flags
, cmode
, perms
);
12215 ldout(cct
, 8) << "create(" << path
<< ", 0" << oct
<< mode
<< dec
12216 << " layout " << stripe_unit
12217 << ' ' << stripe_count
12218 << ' ' << object_size
12219 <<") = " << res
<< dendl
;
12228 int Client::_mkdir(Inode
*dir
, const char *name
, mode_t mode
, const UserPerm
& perm
,
12231 ldout(cct
, 8) << "_mkdir(" << dir
->ino
<< " " << name
<< ", 0" << oct
12232 << mode
<< dec
<< ", uid " << perm
.uid()
12233 << ", gid " << perm
.gid() << ")" << dendl
;
12235 if (strlen(name
) > NAME_MAX
)
12236 return -ENAMETOOLONG
;
12238 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
12241 if (is_quota_files_exceeded(dir
, perm
)) {
12244 MetaRequest
*req
= new MetaRequest(dir
->snapid
== CEPH_SNAPDIR
?
12245 CEPH_MDS_OP_MKSNAP
: CEPH_MDS_OP_MKDIR
);
12248 dir
->make_nosnap_relative_path(path
);
12249 path
.push_dentry(name
);
12250 req
->set_filepath(path
);
12251 req
->set_inode(dir
);
12252 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12253 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12256 bufferlist xattrs_bl
;
12257 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perm
);
12260 req
->head
.args
.mkdir
.mode
= mode
;
12261 if (xattrs_bl
.length() > 0)
12262 req
->set_data(xattrs_bl
);
12265 res
= get_or_create(dir
, name
, &de
);
12268 req
->set_dentry(de
);
12270 ldout(cct
, 10) << "_mkdir: making request" << dendl
;
12271 res
= make_request(req
, perm
, inp
);
12272 ldout(cct
, 10) << "_mkdir result is " << res
<< dendl
;
12276 ldout(cct
, 8) << "_mkdir(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
12284 int Client::ll_mkdir(Inode
*parent
, const char *name
, mode_t mode
,
12285 struct stat
*attr
, Inode
**out
, const UserPerm
& perm
)
12287 std::lock_guard
lock(client_lock
);
12292 vinodeno_t vparent
= _get_vino(parent
);
12294 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
<< dendl
;
12295 tout(cct
) << "ll_mkdir" << std::endl
;
12296 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12297 tout(cct
) << name
<< std::endl
;
12298 tout(cct
) << mode
<< std::endl
;
12300 if (!fuse_default_permissions
) {
12301 int r
= may_create(parent
, perm
);
12307 int r
= _mkdir(parent
, name
, mode
, perm
, &in
);
12309 fill_stat(in
, attr
);
12312 tout(cct
) << attr
->st_ino
<< std::endl
;
12313 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
12314 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
12319 int Client::ll_mkdirx(Inode
*parent
, const char *name
, mode_t mode
, Inode
**out
,
12320 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
12321 const UserPerm
& perms
)
12323 std::lock_guard
lock(client_lock
);
12328 vinodeno_t vparent
= _get_vino(parent
);
12330 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
<< dendl
;
12331 tout(cct
) << "ll_mkdirx" << std::endl
;
12332 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12333 tout(cct
) << name
<< std::endl
;
12334 tout(cct
) << mode
<< std::endl
;
12336 if (!fuse_default_permissions
) {
12337 int r
= may_create(parent
, perms
);
12343 int r
= _mkdir(parent
, name
, mode
, perms
, &in
);
12345 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
12351 tout(cct
) << stx
->stx_ino
<< std::endl
;
12352 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
12353 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12358 int Client::_symlink(Inode
*dir
, const char *name
, const char *target
,
12359 const UserPerm
& perms
, InodeRef
*inp
)
12361 ldout(cct
, 8) << "_symlink(" << dir
->ino
<< " " << name
<< ", " << target
12362 << ", uid " << perms
.uid() << ", gid " << perms
.gid() << ")"
12365 if (strlen(name
) > NAME_MAX
)
12366 return -ENAMETOOLONG
;
12368 if (dir
->snapid
!= CEPH_NOSNAP
) {
12371 if (is_quota_files_exceeded(dir
, perms
)) {
12375 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SYMLINK
);
12378 dir
->make_nosnap_relative_path(path
);
12379 path
.push_dentry(name
);
12380 req
->set_filepath(path
);
12381 req
->set_inode(dir
);
12382 req
->set_string2(target
);
12383 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12384 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12387 int res
= get_or_create(dir
, name
, &de
);
12390 req
->set_dentry(de
);
12392 res
= make_request(req
, perms
, inp
);
12395 ldout(cct
, 8) << "_symlink(\"" << path
<< "\", \"" << target
<< "\") = " <<
12404 int Client::ll_symlink(Inode
*parent
, const char *name
, const char *value
,
12405 struct stat
*attr
, Inode
**out
, const UserPerm
& perms
)
12407 std::lock_guard
lock(client_lock
);
12412 vinodeno_t vparent
= _get_vino(parent
);
12414 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
<< " -> " << value
12416 tout(cct
) << "ll_symlink" << std::endl
;
12417 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12418 tout(cct
) << name
<< std::endl
;
12419 tout(cct
) << value
<< std::endl
;
12421 if (!fuse_default_permissions
) {
12422 int r
= may_create(parent
, perms
);
12428 int r
= _symlink(parent
, name
, value
, perms
, &in
);
12430 fill_stat(in
, attr
);
12433 tout(cct
) << attr
->st_ino
<< std::endl
;
12434 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
12435 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
12440 int Client::ll_symlinkx(Inode
*parent
, const char *name
, const char *value
,
12441 Inode
**out
, struct ceph_statx
*stx
, unsigned want
,
12442 unsigned flags
, const UserPerm
& perms
)
12444 std::lock_guard
lock(client_lock
);
12449 vinodeno_t vparent
= _get_vino(parent
);
12451 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
<< " -> " << value
12453 tout(cct
) << "ll_symlinkx" << std::endl
;
12454 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12455 tout(cct
) << name
<< std::endl
;
12456 tout(cct
) << value
<< std::endl
;
12458 if (!fuse_default_permissions
) {
12459 int r
= may_create(parent
, perms
);
12465 int r
= _symlink(parent
, name
, value
, perms
, &in
);
12467 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
12470 tout(cct
) << stx
->stx_ino
<< std::endl
;
12471 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
12472 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12477 int Client::_unlink(Inode
*dir
, const char *name
, const UserPerm
& perm
)
12479 ldout(cct
, 8) << "_unlink(" << dir
->ino
<< " " << name
12480 << " uid " << perm
.uid() << " gid " << perm
.gid()
12483 if (dir
->snapid
!= CEPH_NOSNAP
) {
12487 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_UNLINK
);
12490 dir
->make_nosnap_relative_path(path
);
12491 path
.push_dentry(name
);
12492 req
->set_filepath(path
);
12498 int res
= get_or_create(dir
, name
, &de
);
12501 req
->set_dentry(de
);
12502 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12503 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12505 res
= _lookup(dir
, name
, 0, &otherin
, perm
);
12509 in
= otherin
.get();
12510 req
->set_other_inode(in
);
12511 in
->break_all_delegs();
12512 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
12514 req
->set_inode(dir
);
12516 res
= make_request(req
, perm
);
12519 ldout(cct
, 8) << "unlink(" << path
<< ") = " << res
<< dendl
;
12527 int Client::ll_unlink(Inode
*in
, const char *name
, const UserPerm
& perm
)
12529 std::lock_guard
lock(client_lock
);
12534 vinodeno_t vino
= _get_vino(in
);
12536 ldout(cct
, 3) << "ll_unlink " << vino
<< " " << name
<< dendl
;
12537 tout(cct
) << "ll_unlink" << std::endl
;
12538 tout(cct
) << vino
.ino
.val
<< std::endl
;
12539 tout(cct
) << name
<< std::endl
;
12541 if (!fuse_default_permissions
) {
12542 int r
= may_delete(in
, name
, perm
);
12546 return _unlink(in
, name
, perm
);
12549 int Client::_rmdir(Inode
*dir
, const char *name
, const UserPerm
& perms
)
12551 ldout(cct
, 8) << "_rmdir(" << dir
->ino
<< " " << name
<< " uid "
12552 << perms
.uid() << " gid " << perms
.gid() << ")" << dendl
;
12554 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
12558 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_RMSNAP
: CEPH_MDS_OP_RMDIR
;
12559 MetaRequest
*req
= new MetaRequest(op
);
12561 dir
->make_nosnap_relative_path(path
);
12562 path
.push_dentry(name
);
12563 req
->set_filepath(path
);
12564 req
->set_inode(dir
);
12566 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12567 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12568 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
12573 int res
= get_or_create(dir
, name
, &de
);
12576 if (op
== CEPH_MDS_OP_RMDIR
)
12577 req
->set_dentry(de
);
12581 res
= _lookup(dir
, name
, 0, &in
, perms
);
12585 if (op
== CEPH_MDS_OP_RMSNAP
) {
12586 unlink(de
, true, true);
12589 req
->set_other_inode(in
.get());
12591 res
= make_request(req
, perms
);
12594 ldout(cct
, 8) << "rmdir(" << path
<< ") = " << res
<< dendl
;
12602 int Client::ll_rmdir(Inode
*in
, const char *name
, const UserPerm
& perms
)
12604 std::lock_guard
lock(client_lock
);
12609 vinodeno_t vino
= _get_vino(in
);
12611 ldout(cct
, 3) << "ll_rmdir " << vino
<< " " << name
<< dendl
;
12612 tout(cct
) << "ll_rmdir" << std::endl
;
12613 tout(cct
) << vino
.ino
.val
<< std::endl
;
12614 tout(cct
) << name
<< std::endl
;
12616 if (!fuse_default_permissions
) {
12617 int r
= may_delete(in
, name
, perms
);
12622 return _rmdir(in
, name
, perms
);
12625 int Client::_rename(Inode
*fromdir
, const char *fromname
, Inode
*todir
, const char *toname
, const UserPerm
& perm
)
12627 ldout(cct
, 8) << "_rename(" << fromdir
->ino
<< " " << fromname
<< " to "
12628 << todir
->ino
<< " " << toname
12629 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")"
12632 if (fromdir
->snapid
!= todir
->snapid
)
12635 int op
= CEPH_MDS_OP_RENAME
;
12636 if (fromdir
->snapid
!= CEPH_NOSNAP
) {
12637 if (fromdir
== todir
&& fromdir
->snapid
== CEPH_SNAPDIR
)
12638 op
= CEPH_MDS_OP_RENAMESNAP
;
12644 MetaRequest
*req
= new MetaRequest(op
);
12647 fromdir
->make_nosnap_relative_path(from
);
12648 from
.push_dentry(fromname
);
12650 todir
->make_nosnap_relative_path(to
);
12651 to
.push_dentry(toname
);
12652 req
->set_filepath(to
);
12653 req
->set_filepath2(from
);
12656 int res
= get_or_create(fromdir
, fromname
, &oldde
);
12660 res
= get_or_create(todir
, toname
, &de
);
12664 if (op
== CEPH_MDS_OP_RENAME
) {
12665 req
->set_old_dentry(oldde
);
12666 req
->old_dentry_drop
= CEPH_CAP_FILE_SHARED
;
12667 req
->old_dentry_unless
= CEPH_CAP_FILE_EXCL
;
12669 req
->set_dentry(de
);
12670 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12671 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12673 InodeRef oldin
, otherin
;
12674 Inode
*fromdir_root
= nullptr;
12675 Inode
*todir_root
= nullptr;
12677 bool quota_check
= false;
12678 if (fromdir
!= todir
) {
12680 fromdir
->quota
.is_enable() ? fromdir
: get_quota_root(fromdir
, perm
);
12682 todir
->quota
.is_enable() ? todir
: get_quota_root(todir
, perm
);
12684 if (todir_root
->quota
.is_enable() && fromdir_root
!= todir_root
) {
12685 // use CEPH_STAT_RSTAT mask to force send getattr or lookup request
12686 // to auth MDS to get latest rstat for todir_root and source dir
12687 // even if their dentry caches and inode caps are satisfied.
12688 res
= _getattr(todir_root
, CEPH_STAT_RSTAT
, perm
, true);
12692 quota_check
= true;
12693 if (oldde
->inode
&& oldde
->inode
->is_dir()) {
12694 mask
|= CEPH_STAT_RSTAT
;
12699 res
= _lookup(fromdir
, fromname
, mask
, &oldin
, perm
);
12703 Inode
*oldinode
= oldin
.get();
12704 oldinode
->break_all_delegs();
12705 req
->set_old_inode(oldinode
);
12706 req
->old_inode_drop
= CEPH_CAP_LINK_SHARED
;
12709 int64_t old_bytes
, old_files
;
12710 if (oldinode
->is_dir()) {
12711 old_bytes
= oldinode
->rstat
.rbytes
;
12712 old_files
= oldinode
->rstat
.rsize();
12714 old_bytes
= oldinode
->size
;
12718 bool quota_exceed
= false;
12719 if (todir_root
&& todir_root
->quota
.max_bytes
&&
12720 (old_bytes
+ todir_root
->rstat
.rbytes
) >= todir_root
->quota
.max_bytes
) {
12721 ldout(cct
, 10) << "_rename (" << oldinode
->ino
<< " bytes="
12722 << old_bytes
<< ") to (" << todir
->ino
12723 << ") will exceed quota on " << *todir_root
<< dendl
;
12724 quota_exceed
= true;
12727 if (todir_root
&& todir_root
->quota
.max_files
&&
12728 (old_files
+ todir_root
->rstat
.rsize()) >= todir_root
->quota
.max_files
) {
12729 ldout(cct
, 10) << "_rename (" << oldinode
->ino
<< " files="
12730 << old_files
<< ") to (" << todir
->ino
12731 << ") will exceed quota on " << *todir_root
<< dendl
;
12732 quota_exceed
= true;
12735 if (quota_exceed
) {
12736 res
= (oldinode
->is_dir()) ? -EXDEV
: -EDQUOT
;
12741 res
= _lookup(todir
, toname
, 0, &otherin
, perm
);
12745 Inode
*in
= otherin
.get();
12746 req
->set_other_inode(in
);
12747 in
->break_all_delegs();
12749 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
12757 req
->set_inode(todir
);
12759 // renamesnap reply contains no tracedn, so we need to invalidate
12761 unlink(oldde
, true, true);
12762 unlink(de
, true, true);
12764 req
->set_inode(todir
);
12767 res
= make_request(req
, perm
, &target
);
12768 ldout(cct
, 10) << "rename result is " << res
<< dendl
;
12770 // renamed item from our cache
12773 ldout(cct
, 8) << "_rename(" << from
<< ", " << to
<< ") = " << res
<< dendl
;
12781 int Client::ll_rename(Inode
*parent
, const char *name
, Inode
*newparent
,
12782 const char *newname
, const UserPerm
& perm
)
12784 std::lock_guard
lock(client_lock
);
12789 vinodeno_t vparent
= _get_vino(parent
);
12790 vinodeno_t vnewparent
= _get_vino(newparent
);
12792 ldout(cct
, 3) << "ll_rename " << vparent
<< " " << name
<< " to "
12793 << vnewparent
<< " " << newname
<< dendl
;
12794 tout(cct
) << "ll_rename" << std::endl
;
12795 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12796 tout(cct
) << name
<< std::endl
;
12797 tout(cct
) << vnewparent
.ino
.val
<< std::endl
;
12798 tout(cct
) << newname
<< std::endl
;
12800 if (!fuse_default_permissions
) {
12801 int r
= may_delete(parent
, name
, perm
);
12804 r
= may_delete(newparent
, newname
, perm
);
12805 if (r
< 0 && r
!= -ENOENT
)
12809 return _rename(parent
, name
, newparent
, newname
, perm
);
12812 int Client::_link(Inode
*in
, Inode
*dir
, const char *newname
, const UserPerm
& perm
, InodeRef
*inp
)
12814 ldout(cct
, 8) << "_link(" << in
->ino
<< " to " << dir
->ino
<< " " << newname
12815 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")" << dendl
;
12817 if (strlen(newname
) > NAME_MAX
)
12818 return -ENAMETOOLONG
;
12820 if (in
->snapid
!= CEPH_NOSNAP
|| dir
->snapid
!= CEPH_NOSNAP
) {
12823 if (is_quota_files_exceeded(dir
, perm
)) {
12827 in
->break_all_delegs();
12828 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LINK
);
12830 filepath
path(newname
, dir
->ino
);
12831 req
->set_filepath(path
);
12832 filepath
existing(in
->ino
);
12833 req
->set_filepath2(existing
);
12835 req
->set_inode(dir
);
12836 req
->inode_drop
= CEPH_CAP_FILE_SHARED
;
12837 req
->inode_unless
= CEPH_CAP_FILE_EXCL
;
12840 int res
= get_or_create(dir
, newname
, &de
);
12843 req
->set_dentry(de
);
12845 res
= make_request(req
, perm
, inp
);
12846 ldout(cct
, 10) << "link result is " << res
<< dendl
;
12849 ldout(cct
, 8) << "link(" << existing
<< ", " << path
<< ") = " << res
<< dendl
;
12857 int Client::ll_link(Inode
*in
, Inode
*newparent
, const char *newname
,
12858 const UserPerm
& perm
)
12860 std::lock_guard
lock(client_lock
);
12865 vinodeno_t vino
= _get_vino(in
);
12866 vinodeno_t vnewparent
= _get_vino(newparent
);
12868 ldout(cct
, 3) << "ll_link " << vino
<< " to " << vnewparent
<< " " <<
12870 tout(cct
) << "ll_link" << std::endl
;
12871 tout(cct
) << vino
.ino
.val
<< std::endl
;
12872 tout(cct
) << vnewparent
<< std::endl
;
12873 tout(cct
) << newname
<< std::endl
;
12877 if (!fuse_default_permissions
) {
12878 if (S_ISDIR(in
->mode
))
12881 int r
= may_hardlink(in
, perm
);
12885 r
= may_create(newparent
, perm
);
12890 return _link(in
, newparent
, newname
, perm
, &target
);
12893 int Client::ll_num_osds(void)
12895 std::lock_guard
lock(client_lock
);
12896 return objecter
->with_osdmap(std::mem_fn(&OSDMap::get_num_osds
));
12899 int Client::ll_osdaddr(int osd
, uint32_t *addr
)
12901 std::lock_guard
lock(client_lock
);
12904 bool exists
= objecter
->with_osdmap([&](const OSDMap
& o
) {
12905 if (!o
.exists(osd
))
12907 g
= o
.get_addrs(osd
).front();
12912 uint32_t nb_addr
= (g
.in4_addr()).sin_addr
.s_addr
;
12913 *addr
= ntohl(nb_addr
);
12917 uint32_t Client::ll_stripe_unit(Inode
*in
)
12919 std::lock_guard
lock(client_lock
);
12920 return in
->layout
.stripe_unit
;
12923 uint64_t Client::ll_snap_seq(Inode
*in
)
12925 std::lock_guard
lock(client_lock
);
12926 return in
->snaprealm
->seq
;
12929 int Client::ll_file_layout(Inode
*in
, file_layout_t
*layout
)
12931 std::lock_guard
lock(client_lock
);
12932 *layout
= in
->layout
;
12936 int Client::ll_file_layout(Fh
*fh
, file_layout_t
*layout
)
12938 return ll_file_layout(fh
->inode
.get(), layout
);
12941 /* Currently we cannot take advantage of redundancy in reads, since we
12942 would have to go through all possible placement groups (a
12943 potentially quite large number determined by a hash), and use CRUSH
12944 to calculate the appropriate set of OSDs for each placement group,
12945 then index into that. An array with one entry per OSD is much more
12946 tractable and works for demonstration purposes. */
12948 int Client::ll_get_stripe_osd(Inode
*in
, uint64_t blockno
,
12949 file_layout_t
* layout
)
12951 std::lock_guard
lock(client_lock
);
12953 inodeno_t ino
= in
->ino
;
12954 uint32_t object_size
= layout
->object_size
;
12955 uint32_t su
= layout
->stripe_unit
;
12956 uint32_t stripe_count
= layout
->stripe_count
;
12957 uint64_t stripes_per_object
= object_size
/ su
;
12958 uint64_t stripeno
= 0, stripepos
= 0;
12961 stripeno
= blockno
/ stripe_count
; // which horizontal stripe (Y)
12962 stripepos
= blockno
% stripe_count
; // which object in the object set (X)
12964 uint64_t objectsetno
= stripeno
/ stripes_per_object
; // which object set
12965 uint64_t objectno
= objectsetno
* stripe_count
+ stripepos
; // object id
12967 object_t oid
= file_object_t(ino
, objectno
);
12968 return objecter
->with_osdmap([&](const OSDMap
& o
) {
12969 ceph_object_layout olayout
=
12970 o
.file_to_object_layout(oid
, *layout
);
12971 pg_t pg
= (pg_t
)olayout
.ol_pgid
;
12974 o
.pg_to_acting_osds(pg
, &osds
, &primary
);
12979 /* Return the offset of the block, internal to the object */
12981 uint64_t Client::ll_get_internal_offset(Inode
*in
, uint64_t blockno
)
12983 std::lock_guard
lock(client_lock
);
12984 file_layout_t
*layout
=&(in
->layout
);
12985 uint32_t object_size
= layout
->object_size
;
12986 uint32_t su
= layout
->stripe_unit
;
12987 uint64_t stripes_per_object
= object_size
/ su
;
12989 return (blockno
% stripes_per_object
) * su
;
12992 int Client::ll_opendir(Inode
*in
, int flags
, dir_result_t
** dirpp
,
12993 const UserPerm
& perms
)
12995 std::lock_guard
lock(client_lock
);
13000 vinodeno_t vino
= _get_vino(in
);
13002 ldout(cct
, 3) << "ll_opendir " << vino
<< dendl
;
13003 tout(cct
) << "ll_opendir" << std::endl
;
13004 tout(cct
) << vino
.ino
.val
<< std::endl
;
13006 if (!fuse_default_permissions
) {
13007 int r
= may_open(in
, flags
, perms
);
13012 int r
= _opendir(in
, dirpp
, perms
);
13013 tout(cct
) << (unsigned long)*dirpp
<< std::endl
;
13015 ldout(cct
, 3) << "ll_opendir " << vino
<< " = " << r
<< " (" << *dirpp
<< ")"
13020 int Client::ll_releasedir(dir_result_t
*dirp
)
13022 std::lock_guard
lock(client_lock
);
13023 ldout(cct
, 3) << "ll_releasedir " << dirp
<< dendl
;
13024 tout(cct
) << "ll_releasedir" << std::endl
;
13025 tout(cct
) << (unsigned long)dirp
<< std::endl
;
13034 int Client::ll_fsyncdir(dir_result_t
*dirp
)
13036 std::lock_guard
lock(client_lock
);
13037 ldout(cct
, 3) << "ll_fsyncdir " << dirp
<< dendl
;
13038 tout(cct
) << "ll_fsyncdir" << std::endl
;
13039 tout(cct
) << (unsigned long)dirp
<< std::endl
;
13044 return _fsync(dirp
->inode
.get(), false);
13047 int Client::ll_open(Inode
*in
, int flags
, Fh
**fhp
, const UserPerm
& perms
)
13049 ceph_assert(!(flags
& O_CREAT
));
13051 std::lock_guard
lock(client_lock
);
13056 vinodeno_t vino
= _get_vino(in
);
13058 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) << dendl
;
13059 tout(cct
) << "ll_open" << std::endl
;
13060 tout(cct
) << vino
.ino
.val
<< std::endl
;
13061 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
13064 if (!fuse_default_permissions
) {
13065 r
= may_open(in
, flags
, perms
);
13070 r
= _open(in
, flags
, 0, fhp
/* may be NULL */, perms
);
13073 Fh
*fhptr
= fhp
? *fhp
: NULL
;
13075 ll_unclosed_fh_set
.insert(fhptr
);
13077 tout(cct
) << (unsigned long)fhptr
<< std::endl
;
13078 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) <<
13079 " = " << r
<< " (" << fhptr
<< ")" << dendl
;
13083 int Client::_ll_create(Inode
*parent
, const char *name
, mode_t mode
,
13084 int flags
, InodeRef
*in
, int caps
, Fh
**fhp
,
13085 const UserPerm
& perms
)
13089 vinodeno_t vparent
= _get_vino(parent
);
13091 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
13092 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << ", uid " << perms
.uid()
13093 << ", gid " << perms
.gid() << dendl
;
13094 tout(cct
) << "ll_create" << std::endl
;
13095 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13096 tout(cct
) << name
<< std::endl
;
13097 tout(cct
) << mode
<< std::endl
;
13098 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
13100 bool created
= false;
13101 int r
= _lookup(parent
, name
, caps
, in
, perms
);
13103 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
13106 if (r
== -ENOENT
&& (flags
& O_CREAT
)) {
13107 if (!fuse_default_permissions
) {
13108 r
= may_create(parent
, perms
);
13112 r
= _create(parent
, name
, flags
, mode
, in
, fhp
, 0, 0, 0, NULL
, &created
,
13123 ldout(cct
, 20) << "_ll_create created = " << created
<< dendl
;
13125 if (!fuse_default_permissions
) {
13126 r
= may_open(in
->get(), flags
, perms
);
13129 int release_r
= _release_fh(*fhp
);
13130 ceph_assert(release_r
== 0); // during create, no async data ops should have happened
13135 if (*fhp
== NULL
) {
13136 r
= _open(in
->get(), flags
, mode
, fhp
, perms
);
13144 ll_unclosed_fh_set
.insert(*fhp
);
13149 Inode
*inode
= in
->get();
13150 if (use_faked_inos())
13151 ino
= inode
->faked_ino
;
13156 tout(cct
) << (unsigned long)*fhp
<< std::endl
;
13157 tout(cct
) << ino
<< std::endl
;
13158 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
13159 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << " = " << r
<< " (" <<
13160 *fhp
<< " " << hex
<< ino
<< dec
<< ")" << dendl
;
13165 int Client::ll_create(Inode
*parent
, const char *name
, mode_t mode
,
13166 int flags
, struct stat
*attr
, Inode
**outp
, Fh
**fhp
,
13167 const UserPerm
& perms
)
13169 std::lock_guard
lock(client_lock
);
13175 int r
= _ll_create(parent
, name
, mode
, flags
, &in
, CEPH_STAT_CAP_INODE_ALL
,
13180 // passing an Inode in outp requires an additional ref
13185 fill_stat(in
, attr
);
13193 int Client::ll_createx(Inode
*parent
, const char *name
, mode_t mode
,
13194 int oflags
, Inode
**outp
, Fh
**fhp
,
13195 struct ceph_statx
*stx
, unsigned want
, unsigned lflags
,
13196 const UserPerm
& perms
)
13198 unsigned caps
= statx_to_mask(lflags
, want
);
13199 std::lock_guard
lock(client_lock
);
13205 int r
= _ll_create(parent
, name
, mode
, oflags
, &in
, caps
, fhp
, perms
);
13209 // passing an Inode in outp requires an additional ref
13214 fill_statx(in
, caps
, stx
);
13223 loff_t
Client::ll_lseek(Fh
*fh
, loff_t offset
, int whence
)
13225 std::lock_guard
lock(client_lock
);
13226 tout(cct
) << "ll_lseek" << std::endl
;
13227 tout(cct
) << offset
<< std::endl
;
13228 tout(cct
) << whence
<< std::endl
;
13233 return _lseek(fh
, offset
, whence
);
13236 int Client::ll_read(Fh
*fh
, loff_t off
, loff_t len
, bufferlist
*bl
)
13238 std::lock_guard
lock(client_lock
);
13239 ldout(cct
, 3) << "ll_read " << fh
<< " " << fh
->inode
->ino
<< " " << " " << off
<< "~" << len
<< dendl
;
13240 tout(cct
) << "ll_read" << std::endl
;
13241 tout(cct
) << (unsigned long)fh
<< std::endl
;
13242 tout(cct
) << off
<< std::endl
;
13243 tout(cct
) << len
<< std::endl
;
13248 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13249 len
= std::min(len
, (loff_t
)INT_MAX
);
13250 return _read(fh
, off
, len
, bl
);
13253 int Client::ll_read_block(Inode
*in
, uint64_t blockid
,
13257 file_layout_t
* layout
)
13259 std::lock_guard
lock(client_lock
);
13264 vinodeno_t vino
= _get_vino(in
);
13265 object_t oid
= file_object_t(vino
.ino
, blockid
);
13266 C_SaferCond onfinish
;
13269 objecter
->read(oid
,
13270 object_locator_t(layout
->pool_id
),
13275 CEPH_OSD_FLAG_READ
,
13278 client_lock
.unlock();
13279 int r
= onfinish
.wait();
13280 client_lock
.lock();
13283 bl
.begin().copy(bl
.length(), buf
);
13290 /* It appears that the OSD doesn't return success unless the entire
13291 buffer was written, return the write length on success. */
13293 int Client::ll_write_block(Inode
*in
, uint64_t blockid
,
13294 char* buf
, uint64_t offset
,
13295 uint64_t length
, file_layout_t
* layout
,
13296 uint64_t snapseq
, uint32_t sync
)
13298 vinodeno_t vino
= ll_get_vino(in
);
13300 std::unique_ptr
<C_SaferCond
> onsafe
= nullptr;
13305 if (true || sync
) {
13306 /* if write is stable, the epilogue is waiting on
13308 onsafe
.reset(new C_SaferCond("Client::ll_write_block flock"));
13310 object_t oid
= file_object_t(vino
.ino
, blockid
);
13311 SnapContext fakesnap
;
13312 ceph::bufferlist bl
;
13314 bl
.push_back(buffer::copy(buf
, length
));
13317 ldout(cct
, 1) << "ll_block_write for " << vino
.ino
<< "." << blockid
13320 fakesnap
.seq
= snapseq
;
13322 /* lock just in time */
13323 client_lock
.lock();
13325 client_lock
.unlock();
13329 objecter
->write(oid
,
13330 object_locator_t(layout
->pool_id
),
13335 ceph::real_clock::now(),
13339 client_lock
.unlock();
13340 if (nullptr != onsafe
) {
13341 r
= onsafe
->wait();
13351 int Client::ll_commit_blocks(Inode
*in
,
13355 std::lock_guard
lock(client_lock
);
13357 BarrierContext *bctx;
13358 vinodeno_t vino = _get_vino(in);
13359 uint64_t ino = vino.ino;
13361 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
13362 << offset << " to " << length << dendl;
13368 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
13369 if (p != barriers.end()) {
13370 barrier_interval civ(offset, offset + length);
13371 p->second->commit_barrier(civ);
13377 int Client::ll_write(Fh
*fh
, loff_t off
, loff_t len
, const char *data
)
13379 std::lock_guard
lock(client_lock
);
13380 ldout(cct
, 3) << "ll_write " << fh
<< " " << fh
->inode
->ino
<< " " << off
<<
13381 "~" << len
<< dendl
;
13382 tout(cct
) << "ll_write" << std::endl
;
13383 tout(cct
) << (unsigned long)fh
<< std::endl
;
13384 tout(cct
) << off
<< std::endl
;
13385 tout(cct
) << len
<< std::endl
;
13390 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13391 len
= std::min(len
, (loff_t
)INT_MAX
);
13392 int r
= _write(fh
, off
, len
, data
, NULL
, 0);
13393 ldout(cct
, 3) << "ll_write " << fh
<< " " << off
<< "~" << len
<< " = " << r
13398 int64_t Client::ll_writev(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
13400 std::lock_guard
lock(client_lock
);
13403 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, true, false);
13406 int64_t Client::ll_readv(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
13408 std::lock_guard
lock(client_lock
);
13411 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, false, false);
13414 int Client::ll_flush(Fh
*fh
)
13416 std::lock_guard
lock(client_lock
);
13417 ldout(cct
, 3) << "ll_flush " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
13418 tout(cct
) << "ll_flush" << std::endl
;
13419 tout(cct
) << (unsigned long)fh
<< std::endl
;
13427 int Client::ll_fsync(Fh
*fh
, bool syncdataonly
)
13429 std::lock_guard
lock(client_lock
);
13430 ldout(cct
, 3) << "ll_fsync " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
13431 tout(cct
) << "ll_fsync" << std::endl
;
13432 tout(cct
) << (unsigned long)fh
<< std::endl
;
13437 int r
= _fsync(fh
, syncdataonly
);
13439 // If we're returning an error, clear it from the FH
13440 fh
->take_async_err();
13445 int Client::ll_sync_inode(Inode
*in
, bool syncdataonly
)
13447 std::lock_guard
lock(client_lock
);
13448 ldout(cct
, 3) << "ll_sync_inode " << *in
<< " " << dendl
;
13449 tout(cct
) << "ll_sync_inode" << std::endl
;
13450 tout(cct
) << (unsigned long)in
<< std::endl
;
13455 return _fsync(in
, syncdataonly
);
13458 #ifdef FALLOC_FL_PUNCH_HOLE
13460 int Client::_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
13462 if (offset
< 0 || length
<= 0)
13465 if (mode
& ~(FALLOC_FL_KEEP_SIZE
| FALLOC_FL_PUNCH_HOLE
))
13466 return -EOPNOTSUPP
;
13468 if ((mode
& FALLOC_FL_PUNCH_HOLE
) && !(mode
& FALLOC_FL_KEEP_SIZE
))
13469 return -EOPNOTSUPP
;
13471 Inode
*in
= fh
->inode
.get();
13473 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
) &&
13474 !(mode
& FALLOC_FL_PUNCH_HOLE
)) {
13478 if (in
->snapid
!= CEPH_NOSNAP
)
13481 if ((fh
->mode
& CEPH_FILE_MODE_WR
) == 0)
13484 uint64_t size
= offset
+ length
;
13485 if (!(mode
& (FALLOC_FL_PUNCH_HOLE
| FALLOC_FL_KEEP_SIZE
)) &&
13487 is_quota_bytes_exceeded(in
, size
- in
->size
, fh
->actor_perms
)) {
13492 int r
= get_caps(in
, CEPH_CAP_FILE_WR
, CEPH_CAP_FILE_BUFFER
, &have
, -1);
13496 std::unique_ptr
<C_SaferCond
> onuninline
= nullptr;
13497 if (mode
& FALLOC_FL_PUNCH_HOLE
) {
13498 if (in
->inline_version
< CEPH_INLINE_NONE
&&
13499 (have
& CEPH_CAP_FILE_BUFFER
)) {
13501 auto inline_iter
= in
->inline_data
.cbegin();
13502 int len
= in
->inline_data
.length();
13503 if (offset
< len
) {
13505 inline_iter
.copy(offset
, bl
);
13507 if (offset
+ size
> len
)
13508 size
= len
- offset
;
13510 bl
.append_zero(size
);
13511 if (offset
+ size
< len
) {
13512 inline_iter
+= size
;
13513 inline_iter
.copy(len
- offset
- size
, bl
);
13515 in
->inline_data
= bl
;
13516 in
->inline_version
++;
13518 in
->mtime
= in
->ctime
= ceph_clock_now();
13520 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13522 if (in
->inline_version
< CEPH_INLINE_NONE
) {
13523 onuninline
.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
13524 uninline_data(in
, onuninline
.get());
13527 C_SaferCond
onfinish("Client::_punch_hole flock");
13529 unsafe_sync_write
++;
13530 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
13532 _invalidate_inode_cache(in
, offset
, length
);
13533 filer
->zero(in
->ino
, &in
->layout
,
13534 in
->snaprealm
->get_snap_context(),
13536 ceph::real_clock::now(),
13537 0, true, &onfinish
);
13538 in
->mtime
= in
->ctime
= ceph_clock_now();
13540 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13542 client_lock
.unlock();
13544 client_lock
.lock();
13545 _sync_write_commit(in
);
13547 } else if (!(mode
& FALLOC_FL_KEEP_SIZE
)) {
13548 uint64_t size
= offset
+ length
;
13549 if (size
> in
->size
) {
13551 in
->mtime
= in
->ctime
= ceph_clock_now();
13553 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13555 if (is_quota_bytes_approaching(in
, fh
->actor_perms
)) {
13556 check_caps(in
, CHECK_CAPS_NODELAY
);
13557 } else if (is_max_size_approaching(in
)) {
13563 if (nullptr != onuninline
) {
13564 client_lock
.unlock();
13565 int ret
= onuninline
->wait();
13566 client_lock
.lock();
13568 if (ret
>= 0 || ret
== -ECANCELED
) {
13569 in
->inline_data
.clear();
13570 in
->inline_version
= CEPH_INLINE_NONE
;
13571 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13577 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
13582 int Client::_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
13584 return -EOPNOTSUPP
;
13590 int Client::ll_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
13592 std::lock_guard
lock(client_lock
);
13593 ldout(cct
, 3) << __func__
<< " " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
13594 tout(cct
) << __func__
<< " " << mode
<< " " << offset
<< " " << length
<< std::endl
;
13595 tout(cct
) << (unsigned long)fh
<< std::endl
;
13600 return _fallocate(fh
, mode
, offset
, length
);
13603 int Client::fallocate(int fd
, int mode
, loff_t offset
, loff_t length
)
13605 std::lock_guard
lock(client_lock
);
13606 tout(cct
) << __func__
<< " " << " " << fd
<< mode
<< " " << offset
<< " " << length
<< std::endl
;
13611 Fh
*fh
= get_filehandle(fd
);
13614 #if defined(__linux__) && defined(O_PATH)
13615 if (fh
->flags
& O_PATH
)
13618 return _fallocate(fh
, mode
, offset
, length
);
13621 int Client::ll_release(Fh
*fh
)
13623 std::lock_guard
lock(client_lock
);
13628 ldout(cct
, 3) << __func__
<< " (fh)" << fh
<< " " << fh
->inode
->ino
<< " " <<
13630 tout(cct
) << __func__
<< " (fh)" << std::endl
;
13631 tout(cct
) << (unsigned long)fh
<< std::endl
;
13633 if (ll_unclosed_fh_set
.count(fh
))
13634 ll_unclosed_fh_set
.erase(fh
);
13635 return _release_fh(fh
);
13638 int Client::ll_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
13640 std::lock_guard
lock(client_lock
);
13642 ldout(cct
, 3) << "ll_getlk (fh)" << fh
<< " " << fh
->inode
->ino
<< dendl
;
13643 tout(cct
) << "ll_getk (fh)" << (unsigned long)fh
<< std::endl
;
13648 return _getlk(fh
, fl
, owner
);
13651 int Client::ll_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
13653 std::lock_guard
lock(client_lock
);
13655 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
13656 tout(cct
) << __func__
<< " (fh)" << (unsigned long)fh
<< std::endl
;
13661 return _setlk(fh
, fl
, owner
, sleep
);
13664 int Client::ll_flock(Fh
*fh
, int cmd
, uint64_t owner
)
13666 std::lock_guard
lock(client_lock
);
13668 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
13669 tout(cct
) << __func__
<< " (fh)" << (unsigned long)fh
<< std::endl
;
13674 return _flock(fh
, cmd
, owner
);
13677 int Client::set_deleg_timeout(uint32_t timeout
)
13679 std::lock_guard
lock(client_lock
);
13682 * The whole point is to prevent blacklisting so we must time out the
13683 * delegation before the session autoclose timeout kicks in.
13685 if (timeout
>= mdsmap
->get_session_autoclose())
13688 deleg_timeout
= timeout
;
13692 int Client::ll_delegation(Fh
*fh
, unsigned cmd
, ceph_deleg_cb_t cb
, void *priv
)
13696 std::lock_guard
lock(client_lock
);
13701 Inode
*inode
= fh
->inode
.get();
13704 case CEPH_DELEGATION_NONE
:
13705 inode
->unset_deleg(fh
);
13710 ret
= inode
->set_deleg(fh
, cmd
, cb
, priv
);
13711 } catch (std::bad_alloc
&) {
13719 class C_Client_RequestInterrupt
: public Context
{
13724 C_Client_RequestInterrupt(Client
*c
, MetaRequest
*r
) : client(c
), req(r
) {
13727 void finish(int r
) override
{
13728 std::lock_guard
l(client
->client_lock
);
13729 ceph_assert(req
->head
.op
== CEPH_MDS_OP_SETFILELOCK
);
13730 client
->_interrupt_filelock(req
);
13731 client
->put_request(req
);
13735 void Client::ll_interrupt(void *d
)
13737 MetaRequest
*req
= static_cast<MetaRequest
*>(d
);
13738 ldout(cct
, 3) << __func__
<< " tid " << req
->get_tid() << dendl
;
13739 tout(cct
) << __func__
<< " tid " << req
->get_tid() << std::endl
;
13740 interrupt_finisher
.queue(new C_Client_RequestInterrupt(this, req
));
13743 // =========================================
13746 // expose file layouts
13748 int Client::describe_layout(const char *relpath
, file_layout_t
*lp
,
13749 const UserPerm
& perms
)
13751 std::lock_guard
lock(client_lock
);
13756 filepath
path(relpath
);
13758 int r
= path_walk(path
, &in
, perms
);
13764 ldout(cct
, 3) << __func__
<< "(" << relpath
<< ") = 0" << dendl
;
13768 int Client::fdescribe_layout(int fd
, file_layout_t
*lp
)
13770 std::lock_guard
lock(client_lock
);
13775 Fh
*f
= get_filehandle(fd
);
13778 Inode
*in
= f
->inode
.get();
13782 ldout(cct
, 3) << __func__
<< "(" << fd
<< ") = 0" << dendl
;
13786 int64_t Client::get_default_pool_id()
13788 std::lock_guard
lock(client_lock
);
13793 /* first data pool is the default */
13794 return mdsmap
->get_first_data_pool();
13799 int64_t Client::get_pool_id(const char *pool_name
)
13801 std::lock_guard
lock(client_lock
);
13806 return objecter
->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name
),
13810 string
Client::get_pool_name(int64_t pool
)
13812 std::lock_guard
lock(client_lock
);
13817 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
13818 return o
.have_pg_pool(pool
) ? o
.get_pool_name(pool
) : string();
13822 int Client::get_pool_replication(int64_t pool
)
13824 std::lock_guard
lock(client_lock
);
13829 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
13830 return o
.have_pg_pool(pool
) ? o
.get_pg_pool(pool
)->get_size() : -ENOENT
;
13834 int Client::get_file_extent_osds(int fd
, loff_t off
, loff_t
*len
, vector
<int>& osds
)
13836 std::lock_guard
lock(client_lock
);
13841 Fh
*f
= get_filehandle(fd
);
13844 Inode
*in
= f
->inode
.get();
13846 vector
<ObjectExtent
> extents
;
13847 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, 1, in
->truncate_size
, extents
);
13848 ceph_assert(extents
.size() == 1);
13850 objecter
->with_osdmap([&](const OSDMap
& o
) {
13851 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
13852 o
.pg_to_acting_osds(pg
, osds
);
13859 * Return the remainder of the extent (stripe unit)
13861 * If length = 1 is passed to Striper::file_to_extents we get a single
13862 * extent back, but its length is one so we still need to compute the length
13863 * to the end of the stripe unit.
13865 * If length = su then we may get 1 or 2 objects back in the extents vector
13866 * which would have to be examined. Even then, the offsets are local to the
13867 * object, so matching up to the file offset is extra work.
13869 * It seems simpler to stick with length = 1 and manually compute the
13873 uint64_t su
= in
->layout
.stripe_unit
;
13874 *len
= su
- (off
% su
);
13880 int Client::get_osd_crush_location(int id
, vector
<pair
<string
, string
> >& path
)
13882 std::lock_guard
lock(client_lock
);
13889 return objecter
->with_osdmap([&](const OSDMap
& o
) {
13890 return o
.crush
->get_full_location_ordered(id
, path
);
13894 int Client::get_file_stripe_address(int fd
, loff_t offset
,
13895 vector
<entity_addr_t
>& address
)
13897 std::lock_guard
lock(client_lock
);
13902 Fh
*f
= get_filehandle(fd
);
13905 Inode
*in
= f
->inode
.get();
13908 vector
<ObjectExtent
> extents
;
13909 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, 1,
13910 in
->truncate_size
, extents
);
13911 ceph_assert(extents
.size() == 1);
13913 // now we have the object and its 'layout'
13914 return objecter
->with_osdmap([&](const OSDMap
& o
) {
13915 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
13917 o
.pg_to_acting_osds(pg
, osds
);
13920 for (unsigned i
= 0; i
< osds
.size(); i
++) {
13921 entity_addr_t addr
= o
.get_addrs(osds
[i
]).front();
13922 address
.push_back(addr
);
13928 int Client::get_osd_addr(int osd
, entity_addr_t
& addr
)
13930 std::lock_guard
lock(client_lock
);
13935 return objecter
->with_osdmap([&](const OSDMap
& o
) {
13936 if (!o
.exists(osd
))
13939 addr
= o
.get_addrs(osd
).front();
13944 int Client::enumerate_layout(int fd
, vector
<ObjectExtent
>& result
,
13945 loff_t length
, loff_t offset
)
13947 std::lock_guard
lock(client_lock
);
13952 Fh
*f
= get_filehandle(fd
);
13955 Inode
*in
= f
->inode
.get();
13957 // map to a list of extents
13958 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, length
, in
->truncate_size
, result
);
13960 ldout(cct
, 3) << __func__
<< "(" << fd
<< ", " << length
<< ", " << offset
<< ") = 0" << dendl
;
13965 /* find an osd with the same ip. -ENXIO if none. */
13966 int Client::get_local_osd()
13968 std::lock_guard
lock(client_lock
);
13973 objecter
->with_osdmap([this](const OSDMap
& o
) {
13974 if (o
.get_epoch() != local_osd_epoch
) {
13975 local_osd
= o
.find_osd_on_ip(messenger
->get_myaddrs().front());
13976 local_osd_epoch
= o
.get_epoch();
13987 // ===============================
13989 void Client::ms_handle_connect(Connection
*con
)
13991 ldout(cct
, 10) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
13994 bool Client::ms_handle_reset(Connection
*con
)
13996 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
14000 void Client::ms_handle_remote_reset(Connection
*con
)
14002 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
14003 std::lock_guard
l(client_lock
);
14004 switch (con
->get_peer_type()) {
14005 case CEPH_ENTITY_TYPE_MDS
:
14007 // kludge to figure out which mds this is; fixme with a Connection* state
14008 mds_rank_t mds
= MDS_RANK_NONE
;
14009 MetaSession
*s
= NULL
;
14010 for (auto &p
: mds_sessions
) {
14011 if (mdsmap
->get_addrs(p
.first
) == con
->get_peer_addrs()) {
14017 assert (s
!= NULL
);
14018 switch (s
->state
) {
14019 case MetaSession::STATE_CLOSING
:
14020 ldout(cct
, 1) << "reset from mds we were closing; we'll call that closed" << dendl
;
14021 _closed_mds_session(s
);
14024 case MetaSession::STATE_OPENING
:
14026 ldout(cct
, 1) << "reset from mds we were opening; retrying" << dendl
;
14027 list
<Context
*> waiters
;
14028 waiters
.swap(s
->waiting_for_open
);
14029 _closed_mds_session(s
);
14030 MetaSession
*news
= _get_or_open_mds_session(mds
);
14031 news
->waiting_for_open
.swap(waiters
);
14035 case MetaSession::STATE_OPEN
:
14037 objecter
->maybe_request_map(); /* to check if we are blacklisted */
14038 const auto& conf
= cct
->_conf
;
14039 if (conf
->client_reconnect_stale
) {
14040 ldout(cct
, 1) << "reset from mds we were open; close mds session for reconnect" << dendl
;
14041 _closed_mds_session(s
);
14043 ldout(cct
, 1) << "reset from mds we were open; mark session as stale" << dendl
;
14044 s
->state
= MetaSession::STATE_STALE
;
14049 case MetaSession::STATE_NEW
:
14050 case MetaSession::STATE_CLOSED
:
14060 bool Client::ms_handle_refused(Connection
*con
)
14062 ldout(cct
, 1) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
14066 Inode
*Client::get_quota_root(Inode
*in
, const UserPerm
& perms
)
14068 Inode
*quota_in
= root_ancestor
;
14069 SnapRealm
*realm
= in
->snaprealm
;
14071 ldout(cct
, 10) << __func__
<< " realm " << realm
->ino
<< dendl
;
14072 if (realm
->ino
!= in
->ino
) {
14073 auto p
= inode_map
.find(vinodeno_t(realm
->ino
, CEPH_NOSNAP
));
14074 if (p
== inode_map
.end())
14077 if (p
->second
->quota
.is_enable()) {
14078 quota_in
= p
->second
;
14082 realm
= realm
->pparent
;
14084 ldout(cct
, 10) << __func__
<< " " << in
->vino() << " -> " << quota_in
->vino() << dendl
;
14089 * Traverse quota ancestors of the Inode, return true
14090 * if any of them passes the passed function
14092 bool Client::check_quota_condition(Inode
*in
, const UserPerm
& perms
,
14093 std::function
<bool (const Inode
&in
)> test
)
14096 ceph_assert(in
!= NULL
);
14101 if (in
== root_ancestor
) {
14102 // We're done traversing, drop out
14105 // Continue up the tree
14106 in
= get_quota_root(in
, perms
);
14113 bool Client::is_quota_files_exceeded(Inode
*in
, const UserPerm
& perms
)
14115 return check_quota_condition(in
, perms
,
14116 [](const Inode
&in
) {
14117 return in
.quota
.max_files
&& in
.rstat
.rsize() >= in
.quota
.max_files
;
14121 bool Client::is_quota_bytes_exceeded(Inode
*in
, int64_t new_bytes
,
14122 const UserPerm
& perms
)
14124 return check_quota_condition(in
, perms
,
14125 [&new_bytes
](const Inode
&in
) {
14126 return in
.quota
.max_bytes
&& (in
.rstat
.rbytes
+ new_bytes
)
14127 > in
.quota
.max_bytes
;
14131 bool Client::is_quota_bytes_approaching(Inode
*in
, const UserPerm
& perms
)
14133 ceph_assert(in
->size
>= in
->reported_size
);
14134 const uint64_t size
= in
->size
- in
->reported_size
;
14135 return check_quota_condition(in
, perms
,
14136 [&size
](const Inode
&in
) {
14137 if (in
.quota
.max_bytes
) {
14138 if (in
.rstat
.rbytes
>= in
.quota
.max_bytes
) {
14142 const uint64_t space
= in
.quota
.max_bytes
- in
.rstat
.rbytes
;
14143 return (space
>> 4) < size
;
14157 int Client::check_pool_perm(Inode
*in
, int need
)
14159 if (!cct
->_conf
->client_check_pool_perm
)
14162 int64_t pool_id
= in
->layout
.pool_id
;
14163 std::string pool_ns
= in
->layout
.pool_ns
;
14164 std::pair
<int64_t, std::string
> perm_key(pool_id
, pool_ns
);
14167 auto it
= pool_perms
.find(perm_key
);
14168 if (it
== pool_perms
.end())
14170 if (it
->second
== POOL_CHECKING
) {
14171 // avoid concurrent checkings
14172 wait_on_list(waiting_for_pool_perm
);
14175 ceph_assert(have
& POOL_CHECKED
);
14181 if (in
->snapid
!= CEPH_NOSNAP
) {
14182 // pool permission check needs to write to the first object. But for snapshot,
14183 // head of the first object may have alread been deleted. To avoid creating
14184 // orphan object, skip the check for now.
14188 pool_perms
[perm_key
] = POOL_CHECKING
;
14191 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (unsigned long long)in
->ino
);
14192 object_t oid
= oid_buf
;
14194 SnapContext nullsnapc
;
14196 C_SaferCond rd_cond
;
14197 ObjectOperation rd_op
;
14198 rd_op
.stat(NULL
, (ceph::real_time
*)nullptr, NULL
);
14200 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), rd_op
,
14201 nullsnapc
, ceph::real_clock::now(), 0, &rd_cond
);
14203 C_SaferCond wr_cond
;
14204 ObjectOperation wr_op
;
14205 wr_op
.create(true);
14207 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), wr_op
,
14208 nullsnapc
, ceph::real_clock::now(), 0, &wr_cond
);
14210 client_lock
.unlock();
14211 int rd_ret
= rd_cond
.wait();
14212 int wr_ret
= wr_cond
.wait();
14213 client_lock
.lock();
14215 bool errored
= false;
14217 if (rd_ret
== 0 || rd_ret
== -ENOENT
)
14219 else if (rd_ret
!= -EPERM
) {
14220 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14221 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
14225 if (wr_ret
== 0 || wr_ret
== -EEXIST
)
14226 have
|= POOL_WRITE
;
14227 else if (wr_ret
!= -EPERM
) {
14228 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14229 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
14234 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
14235 // Raise EIO because actual error code might be misleading for
14236 // userspace filesystem user.
14237 pool_perms
.erase(perm_key
);
14238 signal_cond_list(waiting_for_pool_perm
);
14242 pool_perms
[perm_key
] = have
| POOL_CHECKED
;
14243 signal_cond_list(waiting_for_pool_perm
);
14246 if ((need
& CEPH_CAP_FILE_RD
) && !(have
& POOL_READ
)) {
14247 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14248 << " need " << ccap_string(need
) << ", but no read perm" << dendl
;
14251 if ((need
& CEPH_CAP_FILE_WR
) && !(have
& POOL_WRITE
)) {
14252 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14253 << " need " << ccap_string(need
) << ", but no write perm" << dendl
;
14260 int Client::_posix_acl_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
14262 if (acl_type
== POSIX_ACL
) {
14263 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
14264 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
14266 return posix_acl_permits(access_acl
, in
->uid
, in
->gid
, perms
, want
);
14272 int Client::_posix_acl_chmod(Inode
*in
, mode_t mode
, const UserPerm
& perms
)
14274 if (acl_type
== NO_ACL
)
14277 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
14281 if (acl_type
== POSIX_ACL
) {
14282 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
14283 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
14284 bufferptr
acl(access_acl
.c_str(), access_acl
.length());
14285 r
= posix_acl_access_chmod(acl
, mode
);
14288 r
= _do_setxattr(in
, ACL_EA_ACCESS
, acl
.c_str(), acl
.length(), 0, perms
);
14294 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< " result=" << r
<< dendl
;
14298 int Client::_posix_acl_create(Inode
*dir
, mode_t
*mode
, bufferlist
& xattrs_bl
,
14299 const UserPerm
& perms
)
14301 if (acl_type
== NO_ACL
)
14304 if (S_ISLNK(*mode
))
14307 int r
= _getattr(dir
, CEPH_STAT_CAP_XATTR
, perms
, dir
->xattr_version
== 0);
14311 if (acl_type
== POSIX_ACL
) {
14312 if (dir
->xattrs
.count(ACL_EA_DEFAULT
)) {
14313 map
<string
, bufferptr
> xattrs
;
14315 const bufferptr
& default_acl
= dir
->xattrs
[ACL_EA_DEFAULT
];
14316 bufferptr
acl(default_acl
.c_str(), default_acl
.length());
14317 r
= posix_acl_inherit_mode(acl
, mode
);
14322 r
= posix_acl_equiv_mode(acl
.c_str(), acl
.length(), mode
);
14326 xattrs
[ACL_EA_ACCESS
] = acl
;
14329 if (S_ISDIR(*mode
))
14330 xattrs
[ACL_EA_DEFAULT
] = dir
->xattrs
[ACL_EA_DEFAULT
];
14334 encode(xattrs
, xattrs_bl
);
14337 *mode
&= ~umask_cb(callback_handle
);
14342 ldout(cct
, 10) << __func__
<< " dir ino " << dir
->ino
<< " result=" << r
<< dendl
;
14346 void Client::set_filer_flags(int flags
)
14348 std::lock_guard
l(client_lock
);
14349 ceph_assert(flags
== 0 ||
14350 flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
14351 objecter
->add_global_op_flags(flags
);
14354 void Client::clear_filer_flags(int flags
)
14356 std::lock_guard
l(client_lock
);
14357 ceph_assert(flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
14358 objecter
->clear_global_op_flag(flags
);
14361 // called before mount
14362 void Client::set_uuid(const std::string
& uuid
)
14364 std::lock_guard
l(client_lock
);
14365 assert(initialized
);
14366 assert(!uuid
.empty());
14368 metadata
["uuid"] = uuid
;
14372 // called before mount. 0 means infinite
14373 void Client::set_session_timeout(unsigned timeout
)
14375 std::lock_guard
l(client_lock
);
14376 assert(initialized
);
14378 metadata
["timeout"] = stringify(timeout
);
14381 // called before mount
14382 int Client::start_reclaim(const std::string
& uuid
, unsigned flags
,
14383 const std::string
& fs_name
)
14385 std::lock_guard
l(client_lock
);
14393 auto it
= metadata
.find("uuid");
14394 if (it
!= metadata
.end() && it
->second
== uuid
)
14398 int r
= subscribe_mdsmap(fs_name
);
14400 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
14404 if (metadata
.empty())
14405 populate_metadata("");
14407 while (mdsmap
->get_epoch() == 0)
14408 wait_on_list(waiting_for_mdsmap
);
14411 for (unsigned mds
= 0; mds
< mdsmap
->get_num_in_mds(); ) {
14412 if (!mdsmap
->is_up(mds
)) {
14413 ldout(cct
, 10) << "mds." << mds
<< " not active, waiting for new mdsmap" << dendl
;
14414 wait_on_list(waiting_for_mdsmap
);
14418 MetaSession
*session
;
14419 if (!have_open_session(mds
)) {
14420 session
= _get_or_open_mds_session(mds
);
14421 if (session
->state
!= MetaSession::STATE_OPENING
) {
14425 ldout(cct
, 10) << "waiting for session to mds." << mds
<< " to open" << dendl
;
14426 wait_on_context_list(session
->waiting_for_open
);
14427 if (rejected_by_mds
.count(mds
))
14432 session
= &mds_sessions
.at(mds
);
14433 if (!session
->mds_features
.test(CEPHFS_FEATURE_RECLAIM_CLIENT
))
14434 return -EOPNOTSUPP
;
14436 if (session
->reclaim_state
== MetaSession::RECLAIM_NULL
||
14437 session
->reclaim_state
== MetaSession::RECLAIMING
) {
14438 session
->reclaim_state
= MetaSession::RECLAIMING
;
14439 auto m
= make_message
<MClientReclaim
>(uuid
, flags
);
14440 session
->con
->send_message2(std::move(m
));
14441 wait_on_list(waiting_for_reclaim
);
14442 } else if (session
->reclaim_state
== MetaSession::RECLAIM_FAIL
) {
14443 return reclaim_errno
? : -ENOTRECOVERABLE
;
14449 // didn't find target session in any mds
14450 if (reclaim_target_addrs
.empty()) {
14451 if (flags
& CEPH_RECLAIM_RESET
)
14453 return -ENOTRECOVERABLE
;
14456 if (flags
& CEPH_RECLAIM_RESET
)
14459 // use blacklist to check if target session was killed
14460 // (config option mds_session_blacklist_on_evict needs to be true)
14462 if (!objecter
->wait_for_map(reclaim_osd_epoch
, &cond
)) {
14463 ldout(cct
, 10) << __func__
<< ": waiting for OSD epoch " << reclaim_osd_epoch
<< dendl
;
14464 client_lock
.unlock();
14466 client_lock
.lock();
14469 bool blacklisted
= objecter
->with_osdmap(
14470 [this](const OSDMap
&osd_map
) -> bool {
14471 return osd_map
.is_blacklisted(reclaim_target_addrs
);
14474 return -ENOTRECOVERABLE
;
14476 metadata
["reclaiming_uuid"] = uuid
;
14480 void Client::finish_reclaim()
14482 auto it
= metadata
.find("reclaiming_uuid");
14483 if (it
== metadata
.end()) {
14484 for (auto &p
: mds_sessions
)
14485 p
.second
.reclaim_state
= MetaSession::RECLAIM_NULL
;
14489 for (auto &p
: mds_sessions
) {
14490 p
.second
.reclaim_state
= MetaSession::RECLAIM_NULL
;
14491 auto m
= make_message
<MClientReclaim
>("", MClientReclaim::FLAG_FINISH
);
14492 p
.second
.con
->send_message2(std::move(m
));
14495 metadata
["uuid"] = it
->second
;
14496 metadata
.erase(it
);
14499 void Client::handle_client_reclaim_reply(const MConstRef
<MClientReclaimReply
>& reply
)
14501 mds_rank_t from
= mds_rank_t(reply
->get_source().num());
14502 ldout(cct
, 10) << __func__
<< " " << *reply
<< " from mds." << from
<< dendl
;
14504 MetaSession
*session
= _get_mds_session(from
, reply
->get_connection().get());
14506 ldout(cct
, 10) << " discarding reclaim reply from sessionless mds." << from
<< dendl
;
14510 if (reply
->get_result() >= 0) {
14511 session
->reclaim_state
= MetaSession::RECLAIM_OK
;
14512 if (reply
->get_epoch() > reclaim_osd_epoch
)
14513 reclaim_osd_epoch
= reply
->get_epoch();
14514 if (!reply
->get_addrs().empty())
14515 reclaim_target_addrs
= reply
->get_addrs();
14517 session
->reclaim_state
= MetaSession::RECLAIM_FAIL
;
14518 reclaim_errno
= reply
->get_result();
14521 signal_cond_list(waiting_for_reclaim
);
14525 * This is included in cap release messages, to cause
14526 * the MDS to wait until this OSD map epoch. It is necessary
14527 * in corner cases where we cancel RADOS ops, so that
14528 * nobody else tries to do IO to the same objects in
14529 * the same epoch as the cancelled ops.
14531 void Client::set_cap_epoch_barrier(epoch_t e
)
14533 ldout(cct
, 5) << __func__
<< " epoch = " << e
<< dendl
;
14534 cap_epoch_barrier
= e
;
14537 const char** Client::get_tracked_conf_keys() const
14539 static const char* keys
[] = {
14540 "client_cache_size",
14541 "client_cache_mid",
14543 "client_deleg_timeout",
14544 "client_deleg_break_on_open",
14550 void Client::handle_conf_change(const ConfigProxy
& conf
,
14551 const std::set
<std::string
> &changed
)
14553 std::lock_guard
lock(client_lock
);
14555 if (changed
.count("client_cache_mid")) {
14556 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
14558 if (changed
.count("client_acl_type")) {
14560 if (cct
->_conf
->client_acl_type
== "posix_acl")
14561 acl_type
= POSIX_ACL
;
14565 void intrusive_ptr_add_ref(Inode
*in
)
14570 void intrusive_ptr_release(Inode
*in
)
14572 in
->client
->put_inode(in
);
14575 mds_rank_t
Client::_get_random_up_mds() const
14577 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
14579 std::set
<mds_rank_t
> up
;
14580 mdsmap
->get_up_mds_set(up
);
14583 return MDS_RANK_NONE
;
14584 std::set
<mds_rank_t
>::const_iterator p
= up
.begin();
14585 for (int n
= rand() % up
.size(); n
; n
--)
14591 StandaloneClient::StandaloneClient(Messenger
*m
, MonClient
*mc
)
14592 : Client(m
, mc
, new Objecter(m
->cct
, m
, mc
, NULL
, 0, 0))
14594 monclient
->set_messenger(m
);
14595 objecter
->set_client_incarnation(0);
14598 StandaloneClient::~StandaloneClient()
14601 objecter
= nullptr;
14604 int StandaloneClient::init()
14607 objectcacher
->start();
14610 client_lock
.lock();
14611 ceph_assert(!is_initialized());
14613 messenger
->add_dispatcher_tail(objecter
);
14614 messenger
->add_dispatcher_tail(this);
14616 monclient
->set_want_keys(CEPH_ENTITY_TYPE_MDS
| CEPH_ENTITY_TYPE_OSD
);
14617 int r
= monclient
->init();
14619 // need to do cleanup because we're in an intermediate init state
14621 client_lock
.unlock();
14622 objecter
->shutdown();
14623 objectcacher
->stop();
14624 monclient
->shutdown();
14629 client_lock
.unlock();
14635 void StandaloneClient::shutdown()
14637 Client::shutdown();
14638 objecter
->shutdown();
14639 monclient
->shutdown();