1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
18 #include <sys/types.h>
23 #include <sys/param.h>
26 #include <sys/utsname.h>
29 #include <boost/lexical_cast.hpp>
30 #include <boost/fusion/include/std_pair.hpp>
32 #if defined(__FreeBSD__)
33 #define XATTR_CREATE 0x1
34 #define XATTR_REPLACE 0x2
36 #include <sys/xattr.h>
39 #if defined(__linux__)
40 #include <linux/falloc.h>
43 #include <sys/statvfs.h>
45 #include "common/config.h"
46 #include "common/version.h"
48 #include "mon/MonClient.h"
50 #include "messages/MClientCaps.h"
51 #include "messages/MClientLease.h"
52 #include "messages/MClientQuota.h"
53 #include "messages/MClientReclaim.h"
54 #include "messages/MClientReclaimReply.h"
55 #include "messages/MClientReconnect.h"
56 #include "messages/MClientReply.h"
57 #include "messages/MClientRequest.h"
58 #include "messages/MClientRequestForward.h"
59 #include "messages/MClientSession.h"
60 #include "messages/MClientSnap.h"
61 #include "messages/MCommandReply.h"
62 #include "messages/MFSMap.h"
63 #include "messages/MFSMapUser.h"
64 #include "messages/MMDSMap.h"
65 #include "messages/MOSDMap.h"
67 #include "mds/flock.h"
68 #include "mds/cephfs_features.h"
69 #include "osd/OSDMap.h"
70 #include "osdc/Filer.h"
72 #include "common/Cond.h"
73 #include "common/Mutex.h"
74 #include "common/perf_counters.h"
75 #include "common/admin_socket.h"
76 #include "common/errno.h"
77 #include "include/str_list.h"
79 #define dout_subsys ceph_subsys_client
81 #include "include/lru.h"
82 #include "include/compat.h"
83 #include "include/stringify.h"
88 #include "Delegation.h"
90 #include "ClientSnapRealm.h"
92 #include "MetaSession.h"
93 #include "MetaRequest.h"
94 #include "ObjecterWriteback.h"
95 #include "posix_acl.h"
97 #include "include/ceph_assert.h"
98 #include "include/stat.h"
100 #include "include/cephfs/ceph_statx.h"
102 #if HAVE_GETGROUPLIST
109 #define dout_prefix *_dout << "client." << whoami << " "
111 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
113 // FreeBSD fails to define this
117 // Darwin fails to define this
126 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
128 void client_flush_set_callback(void *p
, ObjectCacher::ObjectSet
*oset
)
130 Client
*client
= static_cast<Client
*>(p
);
131 client
->flush_set_callback(oset
);
// Admin-socket command hook: binds this hook to its owning Client so that
// registered commands can be dispatched to it later (see CommandHook::call).
// NOTE(review): the member-initializer list continues on original lines that
// are missing from this mangled extract.
137 Client::CommandHook::CommandHook(Client
*client
) :
// Dispatch one admin-socket command ("mds_requests", "mds_sessions",
// "dump_cache", "kick_stale_sessions", "status") to the owning Client,
// formatting the result into `out` via a Formatter created for `format`.
// All dump work runs under m_client->client_lock.
// NOTE(review): mangled extract — several original lines (braces, the final
// `else`, section close/flush and return) are missing; tokens kept verbatim.
142 bool Client::CommandHook::call(std::string_view command
,
143 const cmdmap_t
& cmdmap
,
144 std::string_view format
, bufferlist
& out
)
146 std::unique_ptr
<Formatter
> f(Formatter::create(format
));
147 f
->open_object_section("result");
// Serialize against all other client activity while dumping state.
148 m_client
->client_lock
.Lock();
149 if (command
== "mds_requests")
150 m_client
->dump_mds_requests(f
.get());
151 else if (command
== "mds_sessions")
152 m_client
->dump_mds_sessions(f
.get());
153 else if (command
== "dump_cache")
154 m_client
->dump_cache(f
.get());
155 else if (command
== "kick_stale_sessions")
156 m_client
->_kick_stale_sessions();
157 else if (command
== "status")
158 m_client
->dump_status(f
.get());
// Unknown command means registration and dispatch went out of sync — fatal.
160 ceph_abort_msg("bad command registered");
161 m_client
->client_lock
.Unlock();
// Per-open-directory readdir state. next_offset starts at 2 because
// offsets 0/1 are reserved for the synthetic "." and ".." entries.
// NOTE(review): the initializer list continues on lines missing from this
// extract (perms and remaining members).
170 dir_result_t::dir_result_t(Inode
*in
, const UserPerm
& perms
)
171 : inode(in
), offset(0), next_offset(2),
172 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
// Reset the fake-inode allocator: repopulate the free interval set and
// clear both allocation cursors. Faked inos are used when the platform
// ino_t is narrower than 64 bits, or when explicitly configured.
// NOTE(review): the declaration of `start` (presumably the reserved base,
// see the 1024~2048 reservation noted in _assign_faked_ino) is on a line
// missing from this extract.
176 void Client::_reset_faked_inos()
179 free_faked_inos
.clear();
180 free_faked_inos
.insert(start
, (uint32_t)-1 - start
+ 1);
181 last_used_faked_ino
= 0;
182 last_used_faked_root
= 0;
// Fake inos are needed when ino_t can't hold a 64-bit Ceph ino, or on request.
183 _use_faked_inos
= sizeof(ino_t
) < 8 || cct
->_conf
->client_use_faked_inos
;
// Allocate the next free fake inode number for `in`, scanning the free
// interval set from the last allocation point and wrapping past the
// 1024~2048 range reserved for _assign_faked_root. Records the mapping
// faked_ino -> real vino in faked_ino_map.
// NOTE(review): mangled extract — brace lines are missing; tokens verbatim.
186 void Client::_assign_faked_ino(Inode
*in
)
188 if (0 == last_used_faked_ino
)
189 last_used_faked_ino
= last_used_faked_ino
+ 2048; // start(1024)~2048 reserved for _assign_faked_root
// Find the first free ino strictly above the cursor; wrap to 2048 if exhausted.
190 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
191 if (it
== free_faked_inos
.end() && last_used_faked_ino
> 0) {
192 last_used_faked_ino
= 2048;
193 it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
195 ceph_assert(it
!= free_faked_inos
.end());
// Jump the cursor to the start of the free interval if it lags behind.
196 if (last_used_faked_ino
< it
.get_start()) {
197 ceph_assert(it
.get_len() > 0);
198 last_used_faked_ino
= it
.get_start();
200 ++last_used_faked_ino
;
201 ceph_assert(it
.get_start() + it
.get_len() > last_used_faked_ino
);
// Claim the ino: remove from the free set and record the reverse mapping.
203 in
->faked_ino
= last_used_faked_ino
;
204 free_faked_inos
.erase(in
->faked_ino
);
205 faked_ino_map
[in
->faked_ino
] = in
->vino();
209 * In the faked mode, if you export multiple subdirectories,
210 * you will see that the inode numbers of the exported subdirectories
211 * are the same. so we distinguish the mount point by reserving
212 * the "fake ids" between "1024~2048" and combining the last
213 * 10bits(0x3ff) of the "root inodes".
215 void Client::_assign_faked_root(Inode
*in
)
217 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
218 if (it
== free_faked_inos
.end() && last_used_faked_root
> 0) {
219 last_used_faked_root
= 0;
220 it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
222 assert(it
!= free_faked_inos
.end());
223 vinodeno_t inode_info
= in
->vino();
224 uint64_t inode_num
= (uint64_t)inode_info
.ino
;
225 ldout(cct
, 10) << "inode_num " << inode_num
<< "inode_num & 0x3ff=" << (inode_num
& 0x3ff)<< dendl
;
226 last_used_faked_root
= it
.get_start() + (inode_num
& 0x3ff); // 0x3ff mask and get_start will not exceed 2048
227 assert(it
.get_start() + it
.get_len() > last_used_faked_root
);
229 in
->faked_ino
= last_used_faked_root
;
230 free_faked_inos
.erase(in
->faked_ino
);
231 faked_ino_map
[in
->faked_ino
] = in
->vino();
234 void Client::_release_faked_ino(Inode
*in
)
236 free_faked_inos
.insert(in
->faked_ino
);
237 faked_ino_map
.erase(in
->faked_ino
);
// Translate a fake ino back to the real vinodeno_t via faked_ino_map;
// unknown inos map to vinodeno_t(0, CEPH_NOSNAP). Caller must hold
// client_lock (see the public map_faked_ino wrapper).
// NOTE(review): mangled extract — the declaration of `vino` and the
// root-ino fast path are on lines missing from this chunk.
240 vinodeno_t
Client::_map_faked_ino(ino_t ino
)
245 else if (faked_ino_map
.count(ino
))
246 vino
= faked_ino_map
[ino
];
248 vino
= vinodeno_t(0, CEPH_NOSNAP
);
249 ldout(cct
, 10) << __func__
<< " " << ino
<< " -> " << vino
<< dendl
;
253 vinodeno_t
Client::map_faked_ino(ino_t ino
)
255 std::lock_guard
lock(client_lock
);
256 return _map_faked_ino(ino
);
// Client constructor: wires the messenger/monitor/objecter trio into the
// dispatcher, sets up finishers, reads mount/cache/ACL configuration, and
// builds the writeback handler, ObjectCacher and Filer used for file I/O.
// NOTE(review): mangled extract — many original lines (member initializers,
// braces, some ObjectCacher arguments) are missing; tokens kept verbatim.
261 Client::Client(Messenger
*m
, MonClient
*mc
, Objecter
*objecter_
)
262 : Dispatcher(m
->cct
),
263 timer(m
->cct
, client_lock
),
264 client_lock("Client::client_lock"),
268 whoami(mc
->get_global_id()),
269 async_ino_invalidator(m
->cct
),
270 async_dentry_invalidator(m
->cct
),
271 interrupt_finisher(m
->cct
),
272 remount_finisher(m
->cct
),
273 objecter_finisher(m
->cct
),
274 m_command_hook(this),
// Mount identity and permission model from configuration.
279 user_id
= cct
->_conf
->client_mount_uid
;
280 group_id
= cct
->_conf
->client_mount_gid
;
281 fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
282 "fuse_default_permissions");
284 if (cct
->_conf
->client_acl_type
== "posix_acl")
285 acl_type
= POSIX_ACL
;
287 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
// Reserve fds 0-9; hand out 10..2^30 to callers.
290 free_fd_set
.insert(10, 1<<30);
292 mdsmap
.reset(new MDSMap
);
// Data-path plumbing: writeback handler feeds the ObjectCacher, which
// shares client_lock and reports flush completions via the callback.
295 writeback_handler
.reset(new ObjecterWriteback(objecter
, &objecter_finisher
,
297 objectcacher
.reset(new ObjectCacher(cct
, "libcephfs", *writeback_handler
, client_lock
,
298 client_flush_set_callback
, // all commit callback
300 cct
->_conf
->client_oc_size
,
301 cct
->_conf
->client_oc_max_objects
,
302 cct
->_conf
->client_oc_max_dirty
,
303 cct
->_conf
->client_oc_target_dirty
,
304 cct
->_conf
->client_oc_max_dirty_age
,
306 objecter_finisher
.start();
307 filer
.reset(new Filer(objecter
, &objecter_finisher
));
308 objecter
->enable_blacklist_events();
// NOTE(review): fragment of the Client destructor — the signature line is
// missing from this extract. Asserts the lock is not held on entry, then
// (per the comment) re-takes it around inode destruction because inode
// teardown can call into ObjectCacher, which asserts client_lock is held.
314 ceph_assert(!client_lock
.is_locked());
316 // It is necessary to hold client_lock, because any inode destruction
317 // may call into ObjectCacher, which asserts that it's lock (which is
318 // client_lock) is held.
321 client_lock
.Unlock();
// Forcibly drop all cached state at unmount: close every open file handle
// and directory, verify the dentry LRU drained, then release the root and
// its synthetic parents until the inode map is empty.
// NOTE(review): mangled extract — loop bodies/braces partially missing.
324 void Client::tear_down_cache()
327 for (ceph::unordered_map
<int, Fh
*>::iterator it
= fd_map
.begin();
331 ldout(cct
, 1) << __func__
<< " forcing close of fh " << it
->first
<< " ino " << fh
->inode
->ino
<< dendl
;
336 while (!opened_dirs
.empty()) {
337 dir_result_t
*dirp
= *opened_dirs
.begin();
338 ldout(cct
, 1) << __func__
<< " forcing close of dir " << dirp
<< " ino " << dirp
->inode
->ino
<< dendl
;
// By now every dentry should have been trimmed.
347 ceph_assert(lru
.lru_get_size() == 0);
// Only root (plus its recorded parents) may remain pinned in the map.
350 ceph_assert(inode_map
.size() <= 1 + root_parents
.size());
351 if (root
&& inode_map
.size() == 1 + root_parents
.size()) {
355 while (!root_parents
.empty())
356 root_parents
.erase(root_parents
.begin());
361 ceph_assert(inode_map
.empty());
// Return the root inode number under client_lock — the faked ino when
// fake-inode mode is on.
// NOTE(review): the non-faked return path is on lines missing from this
// extract.
364 inodeno_t
Client::get_root_ino()
366 std::lock_guard
l(client_lock
);
367 if (use_faked_inos())
368 return root
->faked_ino
;
// Return the root inode under client_lock.
// NOTE(review): the body (reference take + return) is on lines missing
// from this extract.
373 Inode
*Client::get_root()
375 std::lock_guard
l(client_lock
);
// Recursively dump one inode (and, for directories, every dentry and child
// inode) to the log and the Formatter. `did` tracks visited inodes so
// dump_cache's second disconnected pass can skip them; `disconnected`
// marks inodes not reachable from root.
// NOTE(review): mangled extract — braces and some statements missing.
383 void Client::dump_inode(Formatter
*f
, Inode
*in
, set
<Inode
*>& did
, bool disconnected
)
386 in
->make_long_path(path
);
387 ldout(cct
, 1) << "dump_inode: "
388 << (disconnected
? "DISCONNECTED ":"")
389 << "inode " << in
->ino
391 << " ref " << in
->get_num_ref()
395 f
->open_object_section("inode");
396 f
->dump_stream("path") << path
;
398 f
->dump_int("disconnected", 1);
// Directory case: walk all cached dentries and recurse into child inodes.
405 ldout(cct
, 1) << " dir " << in
->dir
<< " size " << in
->dir
->dentries
.size() << dendl
;
406 for (ceph::unordered_map
<string
, Dentry
*>::iterator it
= in
->dir
->dentries
.begin();
407 it
!= in
->dir
->dentries
.end();
409 ldout(cct
, 1) << " " << in
->ino
<< " dn " << it
->first
<< " " << it
->second
<< " ref " << it
->second
->ref
<< dendl
;
411 f
->open_object_section("dentry");
415 if (it
->second
->inode
)
416 dump_inode(f
, it
->second
->inode
.get(), did
, false);
// Dump the whole metadata cache: first the tree reachable from root, then
// a second pass over inode_map to catch disconnected inodes (those not
// already recorded in `did` by the first pass).
// NOTE(review): mangled extract — `did` declaration and braces missing.
421 void Client::dump_cache(Formatter
*f
)
425 ldout(cct
, 1) << __func__
<< dendl
;
428 f
->open_array_section("cache");
431 dump_inode(f
, root
, did
, true);
433 // make a second pass to catch anything disconnected
434 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
435 it
!= inode_map
.end();
437 if (did
.count(it
->second
))
439 dump_inode(f
, it
->second
, did
, true);
// Emit a one-shot status summary (identity, addresses, cache counters,
// map epochs, blacklist state) into the Formatter. Requires client_lock
// to be held by the caller — asserted on entry.
// NOTE(review): mangled extract — braces/section close lines missing.
446 void Client::dump_status(Formatter
*f
)
448 ceph_assert(client_lock
.is_locked_by_me());
450 ldout(cct
, 1) << __func__
<< dendl
;
// Snapshot the OSD map epoch without holding the osdmap internally.
452 const epoch_t osd_epoch
453 = objecter
->with_osdmap(std::mem_fn(&OSDMap::get_epoch
));
456 f
->open_object_section("metadata");
457 for (const auto& kv
: metadata
)
458 f
->dump_string(kv
.first
.c_str(), kv
.second
);
461 f
->dump_int("dentry_count", lru
.lru_get_size());
462 f
->dump_int("dentry_pinned_count", lru
.lru_get_num_pinned());
463 f
->dump_int("id", get_nodeid().v
);
464 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr_legacy());
465 f
->dump_object("inst", inst
);
466 f
->dump_object("addr", inst
.addr
);
467 f
->dump_stream("inst_str") << inst
.name
<< " " << inst
.addr
.get_legacy_str();
468 f
->dump_string("addr_str", inst
.addr
.get_legacy_str());
469 f
->dump_int("inode_count", inode_map
.size());
470 f
->dump_int("mds_epoch", mdsmap
->get_epoch());
471 f
->dump_int("osd_epoch", osd_epoch
);
472 f
->dump_int("osd_epoch_barrier", cap_epoch_barrier
);
473 f
->dump_bool("blacklisted", blacklisted
);
// NOTE(review): interior fragment of Client::init() — its signature and
// surrounding lines are missing from this extract. Visible steps: start
// the object cacher, assert not-yet-initialized, register as a messenger
// dispatcher, and drop client_lock.
480 objectcacher
->start();
483 ceph_assert(!initialized
);
485 messenger
->add_dispatcher_tail(this);
486 client_lock
.Unlock();
// Second-stage initialization: build and register perf counters, register
// as a config observer, and register the admin-socket commands served by
// CommandHook::call. Each registration failure is logged but non-fatal.
// NOTE(review): mangled extract — braces, some register_command arguments
// and the error-check `if (ret < 0)` lines are missing; tokens verbatim.
492 void Client::_finish_init()
496 PerfCountersBuilder
plb(cct
, "client", l_c_first
, l_c_last
);
497 plb
.add_time_avg(l_c_reply
, "reply", "Latency of receiving a reply on metadata request");
498 plb
.add_time_avg(l_c_lat
, "lat", "Latency of processing a metadata request");
499 plb
.add_time_avg(l_c_wrlat
, "wrlat", "Latency of a file data write operation");
500 plb
.add_time_avg(l_c_read
, "rdlat", "Latency of a file data read operation");
501 plb
.add_time_avg(l_c_fsync
, "fsync", "Latency of a file sync operation");
502 logger
.reset(plb
.create_perf_counters());
503 cct
->get_perfcounters_collection()->add(logger
.get());
505 client_lock
.Unlock();
507 cct
->_conf
.add_observer(this);
// Admin-socket commands; these map 1:1 to the branches in CommandHook::call.
509 AdminSocket
* admin_socket
= cct
->get_admin_socket();
510 int ret
= admin_socket
->register_command("mds_requests",
513 "show in-progress mds requests");
515 lderr(cct
) << "error registering admin socket command: "
516 << cpp_strerror(-ret
) << dendl
;
518 ret
= admin_socket
->register_command("mds_sessions",
521 "show mds session state");
523 lderr(cct
) << "error registering admin socket command: "
524 << cpp_strerror(-ret
) << dendl
;
526 ret
= admin_socket
->register_command("dump_cache",
529 "show in-memory metadata cache contents");
531 lderr(cct
) << "error registering admin socket command: "
532 << cpp_strerror(-ret
) << dendl
;
534 ret
= admin_socket
->register_command("kick_stale_sessions",
535 "kick_stale_sessions",
537 "kick sessions that were remote reset");
539 lderr(cct
) << "error registering admin socket command: "
540 << cpp_strerror(-ret
) << dendl
;
542 ret
= admin_socket
->register_command("status",
545 "show overall client status");
547 lderr(cct
) << "error registering admin socket command: "
548 << cpp_strerror(-ret
) << dendl
;
553 client_lock
.Unlock();
// Orderly teardown: unregister observers/admin commands, drain and stop
// every callback finisher that was started, stop the object cacher
// (outside client_lock — it joins its thread), then stop the objecter
// finisher and remove perf counters.
// NOTE(review): mangled extract — braces and some statements missing.
556 void Client::shutdown()
558 ldout(cct
, 1) << __func__
<< dendl
;
560 // If we were not mounted, but were being used for sending
561 // MDS commands, we may have sessions that need closing.
564 client_lock
.Unlock();
566 cct
->_conf
.remove_observer(this);
568 cct
->get_admin_socket()->unregister_commands(&m_command_hook
);
// Each finisher is only drained/stopped if its callback was registered,
// i.e. if it was actually started.
570 if (ino_invalidate_cb
) {
571 ldout(cct
, 10) << "shutdown stopping cache invalidator finisher" << dendl
;
572 async_ino_invalidator
.wait_for_empty();
573 async_ino_invalidator
.stop();
576 if (dentry_invalidate_cb
) {
577 ldout(cct
, 10) << "shutdown stopping dentry invalidator finisher" << dendl
;
578 async_dentry_invalidator
.wait_for_empty();
579 async_dentry_invalidator
.stop();
582 if (switch_interrupt_cb
) {
583 ldout(cct
, 10) << "shutdown stopping interrupt finisher" << dendl
;
584 interrupt_finisher
.wait_for_empty();
585 interrupt_finisher
.stop();
589 ldout(cct
, 10) << "shutdown stopping remount finisher" << dendl
;
590 remount_finisher
.wait_for_empty();
591 remount_finisher
.stop();
594 objectcacher
->stop(); // outside of client_lock! this does a join.
597 ceph_assert(initialized
);
600 client_lock
.Unlock();
602 objecter_finisher
.wait_for_empty();
603 objecter_finisher
.stop();
606 cct
->get_perfcounters_collection()->remove(logger
.get());
612 // ===================
613 // metadata cache stuff
// Shrink the dentry LRU down to client_cache_size (or to zero while
// unmounting), expiring dentries one at a time until no further progress
// is made. Optionally tells the kernel to drop its dcache too, and
// releases root when it is the last unreferenced inode.
// NOTE(review): mangled extract — `last` declaration, trim_dentry call and
// braces are missing.
615 void Client::trim_cache(bool trim_kernel_dcache
)
617 uint64_t max
= cct
->_conf
->client_cache_size
;
618 ldout(cct
, 20) << "trim_cache size " << lru
.lru_get_size() << " max " << max
<< dendl
;
// Loop until a full pass expires nothing (size stops changing).
620 while (lru
.lru_get_size() != last
) {
621 last
= lru
.lru_get_size();
623 if (!unmounting
&& lru
.lru_get_size() <= max
) break;
626 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_get_next_expire());
// Still over budget: ask the kernel to drop its dentries as well.
633 if (trim_kernel_dcache
&& lru
.lru_get_size() > max
)
634 _invalidate_kernel_dcache();
637 if (lru
.lru_get_size() == 0 && root
&& root
->get_num_ref() == 0 && inode_map
.size() == 1 + root_parents
.size()) {
638 ldout(cct
, 15) << "trim_cache trimmed root " << root
<< dendl
;
642 while (!root_parents
.empty())
643 root_parents
.erase(root_parents
.begin());
// Before reconnecting to an MDS, expire every LRU dentry that does NOT
// have caps on that MDS (for itself or its parent dir); dentries that do
// are skipped and reinserted at the LRU midpoint. Finally invalidates the
// kernel dcache if the session still holds caps.
// NOTE(review): mangled extract — `trimmed` counter and some braces are on
// missing lines.
649 void Client::trim_cache_for_reconnect(MetaSession
*s
)
651 mds_rank_t mds
= s
->mds_num
;
652 ldout(cct
, 20) << __func__
<< " mds." << mds
<< dendl
;
655 list
<Dentry
*> skipped
;
656 while (lru
.lru_get_size() > 0) {
657 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_expire());
// Keep dentries whose inode (or parent dir inode) holds caps on this MDS.
661 if ((dn
->inode
&& dn
->inode
->caps
.count(mds
)) ||
662 dn
->dir
->parent_inode
->caps
.count(mds
)) {
666 skipped
.push_back(dn
);
// Put the survivors back into the LRU.
669 for(list
<Dentry
*>::iterator p
= skipped
.begin(); p
!= skipped
.end(); ++p
)
670 lru
.lru_insert_mid(*p
);
672 ldout(cct
, 20) << __func__
<< " mds." << mds
673 << " trimmed " << trimmed
<< " dentries" << dendl
;
675 if (s
->caps
.size() > 0)
676 _invalidate_kernel_dcache();
// Drop one dentry from the cache: bump the parent directory's release
// count (invalidating readdir caching), clear its I_COMPLETE state, and
// unlink the dentry (dropping both dir and dentry references).
// NOTE(review): mangled extract — braces/parts of the log line missing.
679 void Client::trim_dentry(Dentry
*dn
)
681 ldout(cct
, 15) << "trim_dentry unlinking dn " << dn
->name
683 << std::hex
<< dn
->dir
->parent_inode
->ino
<< std::dec
686 Inode
*diri
= dn
->dir
->parent_inode
;
// The directory's cached-complete state is no longer trustworthy.
687 diri
->dir_release_count
++;
688 clear_dir_complete_and_ordered(diri
, true);
690 unlink(dn
, false, false); // drop dir, drop dentry
// Apply MDS-reported size/truncation state to an inode. Size only moves
// forward within the same truncate_seq; a new truncate_seq also updates
// the ObjectCacher's object set, invalidates the truncated tail of cached
// file data, and trims inline data.
// NOTE(review): mangled extract — several lines (in->size assignment,
// braces, the issued-caps guard) are missing; tokens verbatim.
694 void Client::update_inode_file_size(Inode
*in
, int issued
, uint64_t size
,
695 uint64_t truncate_seq
, uint64_t truncate_size
)
697 uint64_t prior_size
= in
->size
;
// Accept the new size if the seq advanced, or grew within the same seq.
699 if (truncate_seq
> in
->truncate_seq
||
700 (truncate_seq
== in
->truncate_seq
&& size
> in
->size
)) {
701 ldout(cct
, 10) << "size " << in
->size
<< " -> " << size
<< dendl
;
703 in
->reported_size
= size
;
704 if (truncate_seq
!= in
->truncate_seq
) {
705 ldout(cct
, 10) << "truncate_seq " << in
->truncate_seq
<< " -> "
706 << truncate_seq
<< dendl
;
707 in
->truncate_seq
= truncate_seq
;
708 in
->oset
.truncate_seq
= truncate_seq
;
710 // truncate cached file data
711 if (prior_size
> size
) {
712 _invalidate_inode_cache(in
, truncate_size
, prior_size
- truncate_size
);
716 // truncate inline data
717 if (in
->inline_version
< CEPH_INLINE_NONE
) {
718 uint32_t len
= in
->inline_data
.length();
720 in
->inline_data
.splice(size
, len
- size
);
// Same-or-newer seq with a changed truncate_size: propagate to the oset.
723 if (truncate_seq
>= in
->truncate_seq
&&
724 in
->truncate_size
!= truncate_size
) {
726 ldout(cct
, 10) << "truncate_size " << in
->truncate_size
<< " -> "
727 << truncate_size
<< dendl
;
728 in
->truncate_size
= truncate_size
;
729 in
->oset
.truncate_size
= truncate_size
;
731 ldout(cct
, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl
;
// Merge MDS-reported ctime/mtime/atime into the inode, arbitrated by
// time_warp_seq. With exclusive-ish caps held locally, only newer values
// (or a newer warp seq) are taken; without them the MDS values win for
// any seq >= ours.
// NOTE(review): mangled extract — the actual mtime/atime assignments and
// several braces are on missing lines; tokens verbatim.
736 void Client::update_inode_file_time(Inode
*in
, int issued
, uint64_t time_warp_seq
,
737 utime_t ctime
, utime_t mtime
, utime_t atime
)
739 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << ccap_string(issued
)
740 << " ctime " << ctime
<< " mtime " << mtime
<< dendl
;
742 if (time_warp_seq
> in
->time_warp_seq
)
743 ldout(cct
, 10) << " mds time_warp_seq " << time_warp_seq
744 << " is higher than local time_warp_seq "
745 << in
->time_warp_seq
<< dendl
;
748 // be careful with size, mtime, atime
// Holding any of these caps means our local times may be authoritative.
749 if (issued
& (CEPH_CAP_FILE_EXCL
|
751 CEPH_CAP_FILE_BUFFER
|
753 CEPH_CAP_XATTR_EXCL
)) {
754 ldout(cct
, 30) << "Yay have enough caps to look at our times" << dendl
;
755 if (ctime
> in
->ctime
)
757 if (time_warp_seq
> in
->time_warp_seq
) {
758 //the mds updated times, so take those!
761 in
->time_warp_seq
= time_warp_seq
;
762 } else if (time_warp_seq
== in
->time_warp_seq
) {
764 if (mtime
> in
->mtime
)
766 if (atime
> in
->atime
)
768 } else if (issued
& CEPH_CAP_FILE_EXCL
) {
769 //ignore mds values as we have a higher seq
772 ldout(cct
, 30) << "Don't have enough caps, just taking mds' time values" << dendl
;
773 if (time_warp_seq
>= in
->time_warp_seq
) {
777 in
->time_warp_seq
= time_warp_seq
;
781 ldout(cct
, 0) << "WARNING: " << *in
<< " mds time_warp_seq "
782 << time_warp_seq
<< " is lower than local time_warp_seq "
// Drop fragmap entries whose frag is no longer a leaf of the (updated)
// dirfragtree; the erase(p++) idiom keeps the iterator valid on std::map.
// NOTE(review): the advancing branch for kept entries (else ++p;) is on a
// line missing from this extract.
788 void Client::_fragmap_remove_non_leaves(Inode
*in
)
790 for (map
<frag_t
,int>::iterator p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
791 if (!in
->dirfragtree
.is_leaf(p
->first
))
792 in
->fragmap
.erase(p
++);
// Drop fragmap entries that point at a stopped MDS rank, using the same
// erase(p++) iteration pattern as _fragmap_remove_non_leaves.
// NOTE(review): the advancing branch for kept entries (else ++p;) is on a
// line missing from this extract.
797 void Client::_fragmap_remove_stopped_mds(Inode
*in
, mds_rank_t mds
)
799 for (auto p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
800 if (p
->second
== mds
)
801 in
->fragmap
.erase(p
++);
// Create or refresh the cached Inode for an MDS-reported InodeStat.
// Field groups are applied only when the stat is strictly newer
// (new_version) or the corresponding SHARED cap is newly issued and we do
// not hold the EXCL cap for that group. Also installs/updates the cap for
// non-snap inodes and may mark an empty directory I_COMPLETE|I_DIR_ORDERED.
// NOTE(review): mangled extract — many lines (declarations of `in` and
// `issued`, root/root_ancestor handling, braces) are missing; tokens
// are kept verbatim below.
806 Inode
* Client::add_update_inode(InodeStat
*st
, utime_t from
,
807 MetaSession
*session
,
808 const UserPerm
& request_perms
)
811 bool was_new
= false;
// Look up or create the inode; new inodes may get a faked ino / root id.
812 if (inode_map
.count(st
->vino
)) {
813 in
= inode_map
[st
->vino
];
814 ldout(cct
, 12) << __func__
<< " had " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
816 in
= new Inode(this, st
->vino
, &st
->layout
);
817 inode_map
[st
->vino
] = in
;
819 if (use_faked_inos())
820 _assign_faked_ino(in
);
824 if (use_faked_inos())
825 _assign_faked_root(root
);
828 } else if (!mounted
) {
829 root_parents
[root_ancestor
] = in
;
// Immutable identity fields.
834 in
->ino
= st
->vino
.ino
;
835 in
->snapid
= st
->vino
.snapid
;
836 in
->mode
= st
->mode
& S_IFMT
;
841 if (in
->is_symlink())
842 in
->symlink
= st
->symlink
;
844 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
845 bool new_version
= false;
846 if (in
->version
== 0 ||
847 ((st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
) &&
848 (in
->version
& ~1) < st
->version
))
// Compute which cap bits the MDS just granted that we didn't hold.
852 in
->caps_issued(&issued
);
853 issued
|= in
->caps_dirty();
854 int new_issued
= ~issued
& (int)st
->cap
.caps
;
// AUTH group (owner/mode/btime) — skip if we hold AUTH_EXCL locally.
856 if ((new_version
|| (new_issued
& CEPH_CAP_AUTH_SHARED
)) &&
857 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
861 in
->btime
= st
->btime
;
862 in
->snap_btime
= st
->snap_btime
;
865 if ((new_version
|| (new_issued
& CEPH_CAP_LINK_SHARED
)) &&
866 !(issued
& CEPH_CAP_LINK_EXCL
)) {
867 in
->nlink
= st
->nlink
;
870 if (new_version
|| (new_issued
& CEPH_CAP_ANY_RD
)) {
871 update_inode_file_time(in
, issued
, st
->time_warp_seq
,
872 st
->ctime
, st
->mtime
, st
->atime
);
876 (new_issued
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
))) {
877 in
->layout
= st
->layout
;
878 update_inode_file_size(in
, issued
, st
->size
, st
->truncate_seq
, st
->truncate_size
);
882 if (new_version
|| (new_issued
& CEPH_CAP_FILE_SHARED
)) {
883 in
->dirstat
= st
->dirstat
;
885 // dir_layout/rstat/quota are not tracked by capability, update them only if
886 // the inode stat is from auth mds
887 if (new_version
|| (st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
)) {
888 in
->dir_layout
= st
->dir_layout
;
889 ldout(cct
, 20) << " dir hash is " << (int)in
->dir_layout
.dl_dir_hash
<< dendl
;
890 in
->rstat
= st
->rstat
;
891 in
->quota
= st
->quota
;
892 in
->dir_pin
= st
->dir_pin
;
894 // move me if/when version reflects fragtree changes.
895 if (in
->dirfragtree
!= st
->dirfragtree
) {
896 in
->dirfragtree
= st
->dirfragtree
;
897 _fragmap_remove_non_leaves(in
);
// Xattrs: decode only when newer and we don't hold XATTR_EXCL.
901 if ((in
->xattr_version
== 0 || !(issued
& CEPH_CAP_XATTR_EXCL
)) &&
902 st
->xattrbl
.length() &&
903 st
->xattr_version
> in
->xattr_version
) {
904 auto p
= st
->xattrbl
.cbegin();
905 decode(in
->xattrs
, p
);
906 in
->xattr_version
= st
->xattr_version
;
909 if (st
->inline_version
> in
->inline_version
) {
910 in
->inline_data
= st
->inline_data
;
911 in
->inline_version
= st
->inline_version
;
914 /* always take a newer change attr */
915 if (st
->change_attr
> in
->change_attr
)
916 in
->change_attr
= st
->change_attr
;
918 if (st
->version
> in
->version
)
919 in
->version
= st
->version
;
922 ldout(cct
, 12) << __func__
<< " adding " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
925 return in
; // as with readdir returning indoes in different snaprealms (no caps!)
// Install/update the cap; snap inodes just accumulate snap_caps instead.
927 if (in
->snapid
== CEPH_NOSNAP
) {
928 add_update_cap(in
, session
, st
->cap
.cap_id
, st
->cap
.caps
, st
->cap
.wanted
,
929 st
->cap
.seq
, st
->cap
.mseq
, inodeno_t(st
->cap
.realm
),
930 st
->cap
.flags
, request_perms
);
931 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
932 in
->max_size
= st
->max_size
;
933 in
->rstat
= st
->rstat
;
936 // setting I_COMPLETE needs to happen after adding the cap
938 (st
->cap
.caps
& CEPH_CAP_FILE_SHARED
) &&
939 (issued
& CEPH_CAP_FILE_EXCL
) == 0 &&
940 in
->dirstat
.nfiles
== 0 &&
941 in
->dirstat
.nsubdirs
== 0) {
942 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in
<< dendl
;
943 in
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
945 ldout(cct
, 10) << " dir is open on empty dir " << in
->ino
<< " with "
946 << in
->dir
->dentries
.size() << " entries, marking all dentries null" << dendl
;
947 in
->dir
->readdir_cache
.clear();
948 for (const auto& p
: in
->dir
->dentries
) {
949 unlink(p
.second
, true, true); // keep dir, keep dentry
951 if (in
->dir
->dentries
.empty())
956 in
->snap_caps
|= st
->cap
.caps
;
964 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
// Insert (or fix up) a single dentry+inode pairing in the cache. An
// existing dentry with the wrong vino is unlinked; a missing/empty dentry
// gets linked to `in` (first unlinking `in`'s old primary dentry, bumping
// the affected directory's ordered count). Finishes by refreshing the
// dentry lease.
// NOTE(review): mangled extract — `dn` declaration, `old_dentry` lookup
// and braces are on missing lines; tokens verbatim.
966 Dentry
*Client::insert_dentry_inode(Dir
*dir
, const string
& dname
, LeaseStat
*dlease
,
967 Inode
*in
, utime_t from
, MetaSession
*session
,
971 if (dir
->dentries
.count(dname
))
972 dn
= dir
->dentries
[dname
];
974 ldout(cct
, 12) << __func__
<< " '" << dname
<< "' vino " << in
->vino()
975 << " in dir " << dir
->parent_inode
->vino() << " dn " << dn
// Existing dentry: keep it if the vino matches, otherwise detach it.
978 if (dn
&& dn
->inode
) {
979 if (dn
->inode
->vino() == in
->vino()) {
981 ldout(cct
, 12) << " had dentry " << dname
982 << " with correct vino " << dn
->inode
->vino()
985 ldout(cct
, 12) << " had dentry " << dname
986 << " with WRONG vino " << dn
->inode
->vino()
988 unlink(dn
, true, true); // keep dir, keep dentry
// Need to (re)link: keep a temporary ref on `in` across the unlink/link.
992 if (!dn
|| !dn
->inode
) {
993 InodeRef
tmp_ref(in
);
995 if (old_dentry
->dir
!= dir
) {
996 Inode
*old_diri
= old_dentry
->dir
->parent_inode
;
997 old_diri
->dir_ordered_count
++;
998 clear_dir_complete_and_ordered(old_diri
, false);
1000 unlink(old_dentry
, dir
== old_dentry
->dir
, false); // drop dentry, keep dir open if its the same dir
1002 Inode
*diri
= dir
->parent_inode
;
1003 diri
->dir_ordered_count
++;
1004 clear_dir_complete_and_ordered(diri
, false);
1005 dn
= link(dir
, dname
, in
, dn
);
1008 update_dentry_lease(dn
, dlease
, from
, session
);
// Record an MDS dentry lease: TTL is the request send time plus the
// granted duration. The lease fields are only refreshed when the new TTL
// extends the current one; the dentry also picks up the parent dir's
// shared_gen for cap-based validity checks.
// NOTE(review): mangled extract — braces and some statements missing.
1012 void Client::update_dentry_lease(Dentry
*dn
, LeaseStat
*dlease
, utime_t from
, MetaSession
*session
)
1014 utime_t dttl
= from
;
1015 dttl
+= (float)dlease
->duration_ms
/ 1000.0;
1019 if (dlease
->mask
& CEPH_LOCK_DN
) {
// Only extend — never shorten — an existing lease.
1020 if (dttl
> dn
->lease_ttl
) {
1021 ldout(cct
, 10) << "got dentry lease on " << dn
->name
1022 << " dur " << dlease
->duration_ms
<< "ms ttl " << dttl
<< dendl
;
1023 dn
->lease_ttl
= dttl
;
1024 dn
->lease_mds
= session
->mds_num
;
1025 dn
->lease_seq
= dlease
->seq
;
1026 dn
->lease_gen
= session
->cap_gen
;
1029 dn
->cap_shared_gen
= dn
->dir
->parent_inode
->shared_gen
;
1034 * update MDS location cache for a single inode
// Update the cached MDS location for one directory fragment from a
// DirStat: record (or erase) the frag->auth mapping, force the fragtree
// leaf if needed (pruning stale non-leaf fragmap entries), and note
// whether the frag is replicated.
// NOTE(review): mangled extract — braces are on missing lines.
1036 void Client::update_dir_dist(Inode
*in
, DirStat
*dst
)
1039 ldout(cct
, 20) << "got dirfrag map for " << in
->ino
<< " frag " << dst
->frag
<< " to mds " << dst
->auth
<< dendl
;
// auth < 0 means "unknown" — drop any stale mapping for this frag.
1040 if (dst
->auth
>= 0) {
1041 in
->fragmap
[dst
->frag
] = dst
->auth
;
1043 in
->fragmap
.erase(dst
->frag
);
1045 if (!in
->dirfragtree
.is_leaf(dst
->frag
)) {
1046 in
->dirfragtree
.force_to_leaf(cct
, dst
->frag
);
1047 _fragmap_remove_non_leaves(in
);
1051 in
->dir_replicated
= !dst
->dist
.empty(); // FIXME that's just one frag!
// Invalidate a directory's cached-listing state. With `complete` true both
// I_COMPLETE and I_DIR_ORDERED are cleared; otherwise only the ordering
// flag. The readdir cache is dropped along with the flags.
// NOTE(review): mangled extract — the `complete` branch structure and
// braces are partially on missing lines.
1054 void Client::clear_dir_complete_and_ordered(Inode
*diri
, bool complete
)
1056 if (diri
->flags
& I_COMPLETE
) {
1058 ldout(cct
, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
1059 diri
->flags
&= ~(I_COMPLETE
| I_DIR_ORDERED
);
1061 if (diri
->flags
& I_DIR_ORDERED
) {
1062 ldout(cct
, 10) << " clearing I_DIR_ORDERED on " << *diri
<< dendl
;
1063 diri
->flags
&= ~I_DIR_ORDERED
;
1067 diri
->dir
->readdir_cache
.clear();
1072 * insert results from readdir or lssnap into the metadata cache.
// Fold a readdir/lssnap reply's extra buffer into the metadata cache:
// decode the DirStat and each (dentry lease, InodeStat) pair, link/update
// dentries, assign readdir offsets (hash order vs frag order), and keep
// the shared readdir cache consistent with dirp's generation counters.
// NOTE(review): mangled extract — numerous lines (features/flags/numdn
// declarations, dname decode, braces, end-of-frag handling) are missing;
// tokens kept verbatim.
1074 void Client::insert_readdir_results(MetaRequest
*request
, MetaSession
*session
, Inode
*diri
) {
1076 auto& reply
= request
->reply
;
1077 ConnectionRef con
= request
->reply
->get_connection();
// With the REPLY_ENCODING feature, decode with all feature bits assumed.
1079 if(session
->mds_features
.test(CEPHFS_FEATURE_REPLY_ENCODING
)) {
1080 features
= (uint64_t)-1;
1083 features
= con
->get_features();
1086 dir_result_t
*dirp
= request
->dirp
;
1089 // the extra buffer list is only set for readdir and lssnap replies
1090 auto p
= reply
->get_extra_bl().cbegin();
// lssnap operates on the synthetic .snap directory of diri.
1093 if (request
->head
.op
== CEPH_MDS_OP_LSSNAP
) {
1095 diri
= open_snapdir(diri
);
1098 // only open dir if we're actually adding stuff to it!
1099 Dir
*dir
= diri
->open_dir();
1103 DirStat
dst(p
, features
);
1109 bool end
= ((unsigned)flags
& CEPH_READDIR_FRAG_END
);
1110 bool hash_order
= ((unsigned)flags
& CEPH_READDIR_HASH_ORDER
);
1112 frag_t fg
= (unsigned)request
->head
.args
.readdir
.frag
;
1113 unsigned readdir_offset
= dirp
->next_offset
;
1114 string readdir_start
= dirp
->last_name
;
1115 ceph_assert(!readdir_start
.empty() || readdir_offset
== 2);
1117 unsigned last_hash
= 0;
1119 if (!readdir_start
.empty()) {
1120 last_hash
= ceph_frag_value(diri
->hash_dentry_name(readdir_start
));
1121 } else if (flags
& CEPH_READDIR_OFFSET_HASH
) {
1122 /* mds understands offset_hash */
1123 last_hash
= (unsigned)request
->head
.args
.readdir
.offset_hash
;
// The MDS may have split/merged frags since we asked; resync position.
1127 if (fg
!= dst
.frag
) {
1128 ldout(cct
, 10) << "insert_trace got new frag " << fg
<< " -> " << dst
.frag
<< dendl
;
1132 readdir_start
.clear();
1133 dirp
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
, false);
1137 ldout(cct
, 10) << __func__
<< " " << numdn
<< " readdir items, end=" << end
1138 << ", hash_order=" << hash_order
1139 << ", readdir_start " << readdir_start
1140 << ", last_hash " << last_hash
1141 << ", next_offset " << readdir_offset
<< dendl
;
// Starting from the very beginning: snapshot the dir's generation counters
// so we can tell later whether the readdir cache stayed valid.
1143 if (diri
->snapid
!= CEPH_SNAPDIR
&&
1144 fg
.is_leftmost() && readdir_offset
== 2 &&
1145 !(hash_order
&& last_hash
)) {
1146 dirp
->release_count
= diri
->dir_release_count
;
1147 dirp
->ordered_count
= diri
->dir_ordered_count
;
1148 dirp
->start_shared_gen
= diri
->shared_gen
;
1149 dirp
->cache_index
= 0;
1152 dirp
->buffer_frag
= fg
;
1154 _readdir_drop_dirp_buffer(dirp
);
1155 dirp
->buffer
.reserve(numdn
);
// Decode and install each returned entry.
1159 for (unsigned i
=0; i
<numdn
; i
++) {
1161 dlease
.decode(p
, features
);
1162 InodeStat
ist(p
, features
);
1164 ldout(cct
, 15) << "" << i
<< ": '" << dname
<< "'" << dendl
;
1166 Inode
*in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1169 if (diri
->dir
->dentries
.count(dname
)) {
1170 Dentry
*olddn
= diri
->dir
->dentries
[dname
];
1171 if (olddn
->inode
!= in
) {
1172 // replace incorrect dentry
1173 unlink(olddn
, true, true); // keep dir, dentry
1174 dn
= link(dir
, dname
, in
, olddn
);
1175 ceph_assert(dn
== olddn
);
1183 dn
= link(dir
, dname
, in
, NULL
);
1186 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
// Offsets encode either (hash, index) or (frag, index).
1188 unsigned hash
= ceph_frag_value(diri
->hash_dentry_name(dname
));
1189 if (hash
!= last_hash
)
1192 dn
->offset
= dir_result_t::make_fpos(hash
, readdir_offset
++, true);
1194 dn
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
++, false);
1196 // add to readdir cache
1197 if (dirp
->release_count
== diri
->dir_release_count
&&
1198 dirp
->ordered_count
== diri
->dir_ordered_count
&&
1199 dirp
->start_shared_gen
== diri
->shared_gen
) {
1200 if (dirp
->cache_index
== dir
->readdir_cache
.size()) {
1202 ceph_assert(!dirp
->inode
->is_complete_and_ordered());
1203 dir
->readdir_cache
.reserve(dirp
->cache_index
+ numdn
);
1205 dir
->readdir_cache
.push_back(dn
);
1206 } else if (dirp
->cache_index
< dir
->readdir_cache
.size()) {
1207 if (dirp
->inode
->is_complete_and_ordered())
1208 ceph_assert(dir
->readdir_cache
[dirp
->cache_index
] == dn
);
1210 dir
->readdir_cache
[dirp
->cache_index
] = dn
;
1212 ceph_abort_msg("unexpected readdir buffer idx");
1214 dirp
->cache_index
++;
1216 // add to cached result list
1217 dirp
->buffer
.push_back(dir_result_t::dentry(dn
->offset
, dname
, in
));
1218 ldout(cct
, 15) << __func__
<< " " << hex
<< dn
->offset
<< dec
<< ": '" << dname
<< "' -> " << in
->ino
<< dendl
;
1222 dirp
->last_name
= dname
;
1224 dirp
->next_offset
= 2;
1226 dirp
->next_offset
= readdir_offset
;
1228 if (dir
->is_empty())
1235 * insert a trace from a MDS reply into the cache.
1237 Inode
* Client::insert_trace(MetaRequest
*request
, MetaSession
*session
)
1239 auto& reply
= request
->reply
;
1240 int op
= request
->get_op();
1242 ldout(cct
, 10) << "insert_trace from " << request
->sent_stamp
<< " mds." << session
->mds_num
1243 << " is_target=" << (int)reply
->head
.is_target
1244 << " is_dentry=" << (int)reply
->head
.is_dentry
1247 auto p
= reply
->get_trace_bl().cbegin();
1248 if (request
->got_unsafe
) {
1249 ldout(cct
, 10) << "insert_trace -- already got unsafe; ignoring" << dendl
;
1250 ceph_assert(p
.end());
1255 ldout(cct
, 10) << "insert_trace -- no trace" << dendl
;
1257 Dentry
*d
= request
->dentry();
1259 Inode
*diri
= d
->dir
->parent_inode
;
1260 diri
->dir_release_count
++;
1261 clear_dir_complete_and_ordered(diri
, true);
1264 if (d
&& reply
->get_result() == 0) {
1265 if (op
== CEPH_MDS_OP_RENAME
) {
1267 Dentry
*od
= request
->old_dentry();
1268 ldout(cct
, 10) << " unlinking rename src dn " << od
<< " for traceless reply" << dendl
;
1270 unlink(od
, true, true); // keep dir, dentry
1271 } else if (op
== CEPH_MDS_OP_RMDIR
||
1272 op
== CEPH_MDS_OP_UNLINK
) {
1274 ldout(cct
, 10) << " unlinking unlink/rmdir dn " << d
<< " for traceless reply" << dendl
;
1275 unlink(d
, true, true); // keep dir, dentry
1281 ConnectionRef con
= request
->reply
->get_connection();
1283 if (session
->mds_features
.test(CEPHFS_FEATURE_REPLY_ENCODING
)) {
1284 features
= (uint64_t)-1;
1287 features
= con
->get_features();
1289 ldout(cct
, 10) << " features 0x" << hex
<< features
<< dec
<< dendl
;
1292 SnapRealm
*realm
= NULL
;
1293 if (reply
->snapbl
.length())
1294 update_snap_trace(reply
->snapbl
, &realm
);
1296 ldout(cct
, 10) << " hrm "
1297 << " is_target=" << (int)reply
->head
.is_target
1298 << " is_dentry=" << (int)reply
->head
.is_dentry
1307 if (reply
->head
.is_dentry
) {
1308 dirst
.decode(p
, features
);
1309 dst
.decode(p
, features
);
1311 dlease
.decode(p
, features
);
1315 if (reply
->head
.is_target
) {
1316 ist
.decode(p
, features
);
1317 if (cct
->_conf
->client_debug_getattr_caps
) {
1318 unsigned wanted
= 0;
1319 if (op
== CEPH_MDS_OP_GETATTR
|| op
== CEPH_MDS_OP_LOOKUP
)
1320 wanted
= request
->head
.args
.getattr
.mask
;
1321 else if (op
== CEPH_MDS_OP_OPEN
|| op
== CEPH_MDS_OP_CREATE
)
1322 wanted
= request
->head
.args
.open
.mask
;
1324 if ((wanted
& CEPH_CAP_XATTR_SHARED
) &&
1325 !(ist
.xattr_version
> 0 && ist
.xattrbl
.length() > 0))
1326 ceph_abort_msg("MDS reply does not contain xattrs");
1329 in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1334 if (reply
->head
.is_dentry
) {
1335 diri
= add_update_inode(&dirst
, request
->sent_stamp
, session
,
1337 update_dir_dist(diri
, &dst
); // dir stat info is attached to ..
1340 Dir
*dir
= diri
->open_dir();
1341 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
,
1342 (op
== CEPH_MDS_OP_RENAME
) ? request
->old_dentry() : NULL
);
1345 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1346 dn
= diri
->dir
->dentries
[dname
];
1348 diri
->dir_ordered_count
++;
1349 clear_dir_complete_and_ordered(diri
, false);
1350 unlink(dn
, true, true); // keep dir, dentry
1353 if (dlease
.duration_ms
> 0) {
1355 Dir
*dir
= diri
->open_dir();
1356 dn
= link(dir
, dname
, NULL
, NULL
);
1358 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
1361 } else if (op
== CEPH_MDS_OP_LOOKUPSNAP
||
1362 op
== CEPH_MDS_OP_MKSNAP
) {
1363 ldout(cct
, 10) << " faking snap lookup weirdness" << dendl
;
1364 // fake it for snap lookup
1365 vinodeno_t vino
= ist
.vino
;
1366 vino
.snapid
= CEPH_SNAPDIR
;
1367 ceph_assert(inode_map
.count(vino
));
1368 diri
= inode_map
[vino
];
1370 string dname
= request
->path
.last_dentry();
1373 dlease
.duration_ms
= 0;
1376 Dir
*dir
= diri
->open_dir();
1377 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
);
1379 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1380 Dentry
*dn
= diri
->dir
->dentries
[dname
];
1382 unlink(dn
, true, true); // keep dir, dentry
1388 if (op
== CEPH_MDS_OP_READDIR
||
1389 op
== CEPH_MDS_OP_LSSNAP
) {
1390 insert_readdir_results(request
, session
, in
);
1391 } else if (op
== CEPH_MDS_OP_LOOKUPNAME
) {
1392 // hack: return parent inode instead
1396 if (request
->dentry() == NULL
&& in
!= request
->inode()) {
1397 // pin the target inode if its parent dentry is not pinned
1398 request
->set_other_inode(in
);
1403 put_snap_realm(realm
);
1405 request
->target
= in
;
1411 mds_rank_t
Client::choose_target_mds(MetaRequest
*req
, Inode
** phash_diri
)
1413 mds_rank_t mds
= MDS_RANK_NONE
;
1415 bool is_hash
= false;
1420 if (req
->resend_mds
>= 0) {
1421 mds
= req
->resend_mds
;
1422 req
->resend_mds
= -1;
1423 ldout(cct
, 10) << __func__
<< " resend_mds specified as mds." << mds
<< dendl
;
1427 if (cct
->_conf
->client_use_random_mds
)
1433 ldout(cct
, 20) << __func__
<< " starting with req->inode " << *in
<< dendl
;
1434 if (req
->path
.depth()) {
1435 hash
= in
->hash_dentry_name(req
->path
[0]);
1436 ldout(cct
, 20) << __func__
<< " inode dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1437 << " on " << req
->path
[0]
1438 << " => " << hash
<< dendl
;
1443 in
= de
->inode
.get();
1444 ldout(cct
, 20) << __func__
<< " starting with req->dentry inode " << *in
<< dendl
;
1446 in
= de
->dir
->parent_inode
;
1447 hash
= in
->hash_dentry_name(de
->name
);
1448 ldout(cct
, 20) << __func__
<< " dentry dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1449 << " on " << de
->name
1450 << " => " << hash
<< dendl
;
1455 if (in
->snapid
!= CEPH_NOSNAP
) {
1456 ldout(cct
, 10) << __func__
<< " " << *in
<< " is snapped, using nonsnap parent" << dendl
;
1457 while (in
->snapid
!= CEPH_NOSNAP
) {
1458 if (in
->snapid
== CEPH_SNAPDIR
)
1459 in
= in
->snapdir_parent
.get();
1460 else if (!in
->dentries
.empty())
1461 /* In most cases there will only be one dentry, so getting it
1462 * will be the correct action. If there are multiple hard links,
1463 * I think the MDS should be able to redirect as needed*/
1464 in
= in
->get_first_parent()->dir
->parent_inode
;
1466 ldout(cct
, 10) << "got unlinked inode, can't look at parent" << dendl
;
1473 ldout(cct
, 20) << __func__
<< " " << *in
<< " is_hash=" << is_hash
1474 << " hash=" << hash
<< dendl
;
1476 if (is_hash
&& S_ISDIR(in
->mode
) && !in
->fragmap
.empty()) {
1477 frag_t fg
= in
->dirfragtree
[hash
];
1478 if (in
->fragmap
.count(fg
)) {
1479 mds
= in
->fragmap
[fg
];
1482 } else if (in
->auth_cap
) {
1483 mds
= in
->auth_cap
->session
->mds_num
;
1486 ldout(cct
, 10) << __func__
<< " from dirfragtree hash" << dendl
;
1491 if (in
->auth_cap
&& req
->auth_is_best()) {
1492 mds
= in
->auth_cap
->session
->mds_num
;
1493 } else if (!in
->caps
.empty()) {
1494 mds
= in
->caps
.begin()->second
.session
->mds_num
;
1498 ldout(cct
, 10) << __func__
<< " from caps on inode " << *in
<< dendl
;
1505 mds
= _get_random_up_mds();
1506 ldout(cct
, 10) << "did not get mds through better means, so chose random mds " << mds
<< dendl
;
1510 ldout(cct
, 20) << "mds is " << mds
<< dendl
;
1515 void Client::connect_mds_targets(mds_rank_t mds
)
1517 ldout(cct
, 10) << __func__
<< " for mds." << mds
<< dendl
;
1518 ceph_assert(mds_sessions
.count(mds
));
1519 const MDSMap::mds_info_t
& info
= mdsmap
->get_mds_info(mds
);
1520 for (set
<mds_rank_t
>::const_iterator q
= info
.export_targets
.begin();
1521 q
!= info
.export_targets
.end();
1523 if (mds_sessions
.count(*q
) == 0 &&
1524 mdsmap
->is_clientreplay_or_active_or_stopping(*q
)) {
1525 ldout(cct
, 10) << "check_mds_sessions opening mds." << mds
1526 << " export target mds." << *q
<< dendl
;
1527 _open_mds_session(*q
);
1532 void Client::dump_mds_sessions(Formatter
*f
)
1534 f
->dump_int("id", get_nodeid().v
);
1535 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr_legacy());
1536 f
->dump_object("inst", inst
);
1537 f
->dump_stream("inst_str") << inst
;
1538 f
->dump_stream("addr_str") << inst
.addr
;
1539 f
->open_array_section("sessions");
1540 for (const auto &p
: mds_sessions
) {
1541 f
->open_object_section("session");
1546 f
->dump_int("mdsmap_epoch", mdsmap
->get_epoch());
1548 void Client::dump_mds_requests(Formatter
*f
)
1550 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
1551 p
!= mds_requests
.end();
1553 f
->open_object_section("request");
1559 int Client::verify_reply_trace(int r
,
1560 MetaRequest
*request
, const MConstRef
<MClientReply
>& reply
,
1561 InodeRef
*ptarget
, bool *pcreated
,
1562 const UserPerm
& perms
)
1564 // check whether this request actually did the create, and set created flag
1565 bufferlist extra_bl
;
1566 inodeno_t created_ino
;
1567 bool got_created_ino
= false;
1568 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
;
1570 extra_bl
= reply
->get_extra_bl();
1571 if (extra_bl
.length() >= 8) {
1572 // if the extra bufferlist has a buffer, we assume its the created inode
1573 // and that this request to create succeeded in actually creating
1574 // the inode (won the race with other create requests)
1575 decode(created_ino
, extra_bl
);
1576 got_created_ino
= true;
1577 ldout(cct
, 10) << "make_request created ino " << created_ino
<< dendl
;
1581 *pcreated
= got_created_ino
;
1583 if (request
->target
) {
1584 *ptarget
= request
->target
;
1585 ldout(cct
, 20) << "make_request target is " << *ptarget
->get() << dendl
;
1587 if (got_created_ino
&& (p
= inode_map
.find(vinodeno_t(created_ino
, CEPH_NOSNAP
))) != inode_map
.end()) {
1588 (*ptarget
) = p
->second
;
1589 ldout(cct
, 20) << "make_request created, target is " << *ptarget
->get() << dendl
;
1591 // we got a traceless reply, and need to look up what we just
1592 // created. for now, do this by name. someday, do this by the
1593 // ino... which we know! FIXME.
1595 Dentry
*d
= request
->dentry();
1598 ldout(cct
, 10) << "make_request got traceless reply, looking up #"
1599 << d
->dir
->parent_inode
->ino
<< "/" << d
->name
1600 << " got_ino " << got_created_ino
1601 << " ino " << created_ino
1603 r
= _do_lookup(d
->dir
->parent_inode
, d
->name
, request
->regetattr_mask
,
1606 // if the dentry is not linked, just do our best. see #5021.
1607 ceph_abort_msg("how did this happen? i want logs!");
1610 Inode
*in
= request
->inode();
1611 ldout(cct
, 10) << "make_request got traceless reply, forcing getattr on #"
1612 << in
->ino
<< dendl
;
1613 r
= _getattr(in
, request
->regetattr_mask
, perms
, true);
1617 // verify ino returned in reply and trace_dist are the same
1618 if (got_created_ino
&&
1619 created_ino
.val
!= target
->ino
.val
) {
1620 ldout(cct
, 5) << "create got ino " << created_ino
<< " but then failed on lookup; EINTR?" << dendl
;
1624 ptarget
->swap(target
);
1636 * Blocking helper to make an MDS request.
1638 * If the ptarget flag is set, behavior changes slightly: the caller
1639 * expects to get a pointer to the inode we are creating or operating
1640 * on. As a result, we will follow up any traceless mutation reply
1641 * with a getattr or lookup to transparently handle a traceless reply
1642 * from the MDS (as when the MDS restarts and the client has to replay
1645 * @param request the MetaRequest to execute
1646 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1647 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1648 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1649 * @param use_mds [optional] prefer a specific mds (-1 for default)
1650 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1652 int Client::make_request(MetaRequest
*request
,
1653 const UserPerm
& perms
,
1654 InodeRef
*ptarget
, bool *pcreated
,
1660 // assign a unique tid
1661 ceph_tid_t tid
= ++last_tid
;
1662 request
->set_tid(tid
);
1665 request
->op_stamp
= ceph_clock_now();
1668 mds_requests
[tid
] = request
->get();
1669 if (oldest_tid
== 0 && request
->get_op() != CEPH_MDS_OP_SETFILELOCK
)
1672 request
->set_caller_perms(perms
);
1674 if (cct
->_conf
->client_inject_fixed_oldest_tid
) {
1675 ldout(cct
, 20) << __func__
<< " injecting fixed oldest_client_tid(1)" << dendl
;
1676 request
->set_oldest_client_tid(1);
1678 request
->set_oldest_client_tid(oldest_tid
);
1683 request
->resend_mds
= use_mds
;
1686 if (request
->aborted())
1690 request
->abort(-EBLACKLISTED
);
1696 request
->caller_cond
= &caller_cond
;
1699 Inode
*hash_diri
= NULL
;
1700 mds_rank_t mds
= choose_target_mds(request
, &hash_diri
);
1701 int mds_state
= (mds
== MDS_RANK_NONE
) ? MDSMap::STATE_NULL
: mdsmap
->get_state(mds
);
1702 if (mds_state
!= MDSMap::STATE_ACTIVE
&& mds_state
!= MDSMap::STATE_STOPPING
) {
1703 if (mds_state
== MDSMap::STATE_NULL
&& mds
>= mdsmap
->get_max_mds()) {
1705 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, remove it from fragmap" << dendl
;
1706 _fragmap_remove_stopped_mds(hash_diri
, mds
);
1708 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, trying a random mds" << dendl
;
1709 request
->resend_mds
= _get_random_up_mds();
1712 ldout(cct
, 10) << " target mds." << mds
<< " not active, waiting for new mdsmap" << dendl
;
1713 wait_on_list(waiting_for_mdsmap
);
1719 MetaSession
*session
= NULL
;
1720 if (!have_open_session(mds
)) {
1721 session
= _get_or_open_mds_session(mds
);
1724 if (session
->state
== MetaSession::STATE_OPENING
) {
1725 ldout(cct
, 10) << "waiting for session to mds." << mds
<< " to open" << dendl
;
1726 wait_on_context_list(session
->waiting_for_open
);
1727 // Abort requests on REJECT from MDS
1728 if (rejected_by_mds
.count(mds
)) {
1729 request
->abort(-EPERM
);
1735 if (!have_open_session(mds
))
1738 session
= &mds_sessions
.at(mds
);
1742 send_request(request
, session
);
1745 ldout(cct
, 20) << "awaiting reply|forward|kick on " << &caller_cond
<< dendl
;
1746 request
->kick
= false;
1747 while (!request
->reply
&& // reply
1748 request
->resend_mds
< 0 && // forward
1750 caller_cond
.Wait(client_lock
);
1751 request
->caller_cond
= NULL
;
1753 // did we get a reply?
1758 if (!request
->reply
) {
1759 ceph_assert(request
->aborted());
1760 ceph_assert(!request
->got_unsafe
);
1761 r
= request
->get_abort_code();
1762 request
->item
.remove_myself();
1763 unregister_request(request
);
1764 put_request(request
);
1769 auto reply
= std::move(request
->reply
);
1770 r
= reply
->get_result();
1772 request
->success
= true;
1774 // kick dispatcher (we've got it!)
1775 ceph_assert(request
->dispatch_cond
);
1776 request
->dispatch_cond
->Signal();
1777 ldout(cct
, 20) << "sendrecv kickback on tid " << tid
<< " " << request
->dispatch_cond
<< dendl
;
1778 request
->dispatch_cond
= 0;
1780 if (r
>= 0 && ptarget
)
1781 r
= verify_reply_trace(r
, request
, reply
, ptarget
, pcreated
, perms
);
1784 *pdirbl
= reply
->get_extra_bl();
1787 utime_t lat
= ceph_clock_now();
1788 lat
-= request
->sent_stamp
;
1789 ldout(cct
, 20) << "lat " << lat
<< dendl
;
1790 logger
->tinc(l_c_lat
, lat
);
1791 logger
->tinc(l_c_reply
, lat
);
1793 put_request(request
);
1797 void Client::unregister_request(MetaRequest
*req
)
1799 mds_requests
.erase(req
->tid
);
1800 if (req
->tid
== oldest_tid
) {
1801 map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.upper_bound(oldest_tid
);
1803 if (p
== mds_requests
.end()) {
1807 if (p
->second
->get_op() != CEPH_MDS_OP_SETFILELOCK
) {
1808 oldest_tid
= p
->first
;
1817 void Client::put_request(MetaRequest
*request
)
1819 if (request
->_put()) {
1821 if (request
->success
)
1822 op
= request
->get_op();
1824 request
->take_other_inode(&other_in
);
1828 (op
== CEPH_MDS_OP_RMDIR
||
1829 op
== CEPH_MDS_OP_RENAME
||
1830 op
== CEPH_MDS_OP_RMSNAP
)) {
1831 _try_to_trim_inode(other_in
.get(), false);
1836 int Client::encode_inode_release(Inode
*in
, MetaRequest
*req
,
1837 mds_rank_t mds
, int drop
,
1838 int unless
, int force
)
1840 ldout(cct
, 20) << __func__
<< " enter(in:" << *in
<< ", req:" << req
1841 << " mds:" << mds
<< ", drop:" << drop
<< ", unless:" << unless
1842 << ", have:" << ", force:" << force
<< ")" << dendl
;
1844 auto it
= in
->caps
.find(mds
);
1845 if (it
!= in
->caps
.end()) {
1846 Cap
&cap
= it
->second
;
1847 drop
&= ~(in
->dirty_caps
| get_caps_used(in
));
1848 if ((drop
& cap
.issued
) &&
1849 !(unless
& cap
.issued
)) {
1850 ldout(cct
, 25) << "Dropping caps. Initial " << ccap_string(cap
.issued
) << dendl
;
1851 cap
.issued
&= ~drop
;
1852 cap
.implemented
&= ~drop
;
1854 ldout(cct
, 25) << "Now have: " << ccap_string(cap
.issued
) << dendl
;
1859 ceph_mds_request_release rel
;
1861 rel
.cap_id
= cap
.cap_id
;
1863 rel
.issue_seq
= cap
.issue_seq
;
1864 rel
.mseq
= cap
.mseq
;
1865 rel
.caps
= cap
.implemented
;
1866 rel
.wanted
= cap
.wanted
;
1869 req
->cap_releases
.push_back(MClientRequest::Release(rel
,""));
1872 ldout(cct
, 25) << __func__
<< " exit(in:" << *in
<< ") released:"
1873 << released
<< dendl
;
1877 void Client::encode_dentry_release(Dentry
*dn
, MetaRequest
*req
,
1878 mds_rank_t mds
, int drop
, int unless
)
1880 ldout(cct
, 20) << __func__
<< " enter(dn:"
1881 << dn
<< ")" << dendl
;
1884 released
= encode_inode_release(dn
->dir
->parent_inode
, req
,
1885 mds
, drop
, unless
, 1);
1886 if (released
&& dn
->lease_mds
== mds
) {
1887 ldout(cct
, 25) << "preemptively releasing dn to mds" << dendl
;
1888 auto& rel
= req
->cap_releases
.back();
1889 rel
.item
.dname_len
= dn
->name
.length();
1890 rel
.item
.dname_seq
= dn
->lease_seq
;
1891 rel
.dname
= dn
->name
;
1893 ldout(cct
, 25) << __func__
<< " exit(dn:"
1894 << dn
<< ")" << dendl
;
1899 * This requires the MClientRequest *request member to be set.
1900 * It will error out horribly without one.
1901 * Additionally, if you set any *drop member, you'd better have
1902 * set the corresponding dentry!
1904 void Client::encode_cap_releases(MetaRequest
*req
, mds_rank_t mds
)
1906 ldout(cct
, 20) << __func__
<< " enter (req: "
1907 << req
<< ", mds: " << mds
<< ")" << dendl
;
1908 if (req
->inode_drop
&& req
->inode())
1909 encode_inode_release(req
->inode(), req
,
1910 mds
, req
->inode_drop
,
1913 if (req
->old_inode_drop
&& req
->old_inode())
1914 encode_inode_release(req
->old_inode(), req
,
1915 mds
, req
->old_inode_drop
,
1916 req
->old_inode_unless
);
1917 if (req
->other_inode_drop
&& req
->other_inode())
1918 encode_inode_release(req
->other_inode(), req
,
1919 mds
, req
->other_inode_drop
,
1920 req
->other_inode_unless
);
1922 if (req
->dentry_drop
&& req
->dentry())
1923 encode_dentry_release(req
->dentry(), req
,
1924 mds
, req
->dentry_drop
,
1925 req
->dentry_unless
);
1927 if (req
->old_dentry_drop
&& req
->old_dentry())
1928 encode_dentry_release(req
->old_dentry(), req
,
1929 mds
, req
->old_dentry_drop
,
1930 req
->old_dentry_unless
);
1931 ldout(cct
, 25) << __func__
<< " exit (req: "
1932 << req
<< ", mds " << mds
<<dendl
;
1935 bool Client::have_open_session(mds_rank_t mds
)
1937 const auto &it
= mds_sessions
.find(mds
);
1938 return it
!= mds_sessions
.end() &&
1939 (it
->second
.state
== MetaSession::STATE_OPEN
||
1940 it
->second
.state
== MetaSession::STATE_STALE
);
// Look up the MetaSession for rank `mds`, validating that it is bound to the
// given Connection.  Used by message handlers to discard messages that arrive
// on a stale/replaced connection for the same rank.
// NOTE(review): the lines completing this function (the not-found return and
// the success return) were dropped by the extraction -- confirm against the
// full source before editing.
1943 MetaSession
*Client::_get_mds_session(mds_rank_t mds
, Connection
*con
)
// find the session registered for this rank
1945 const auto &it
= mds_sessions
.find(mds
);
// no session at all, or the session's connection differs from `con`
1946 if (it
== mds_sessions
.end() || it
->second
.con
!= con
) {
1953 MetaSession
*Client::_get_or_open_mds_session(mds_rank_t mds
)
1955 auto it
= mds_sessions
.find(mds
);
1956 return it
== mds_sessions
.end() ? _open_mds_session(mds
) : &it
->second
;
1960 * Populate a map of strings with client-identifying metadata,
1961 * such as the hostname. Call this once at initialization.
1963 void Client::populate_metadata(const std::string
&mount_root
)
1969 metadata
["hostname"] = u
.nodename
;
1970 ldout(cct
, 20) << __func__
<< " read hostname '" << u
.nodename
<< "'" << dendl
;
1972 ldout(cct
, 1) << __func__
<< " failed to read hostname (" << cpp_strerror(r
) << ")" << dendl
;
1975 metadata
["pid"] = stringify(getpid());
1977 // Ceph entity id (the '0' in "client.0")
1978 metadata
["entity_id"] = cct
->_conf
->name
.get_id();
1980 // Our mount position
1981 if (!mount_root
.empty()) {
1982 metadata
["root"] = mount_root
;
1986 metadata
["ceph_version"] = pretty_version_to_str();
1987 metadata
["ceph_sha1"] = git_version_to_str();
1989 // Apply any metadata from the user's configured overrides
1990 std::vector
<std::string
> tokens
;
1991 get_str_vec(cct
->_conf
->client_metadata
, ",", tokens
);
1992 for (const auto &i
: tokens
) {
1993 auto eqpos
= i
.find("=");
1994 // Throw out anything that isn't of the form "<str>=<str>"
1995 if (eqpos
== 0 || eqpos
== std::string::npos
|| eqpos
== i
.size()) {
1996 lderr(cct
) << "Invalid metadata keyval pair: '" << i
<< "'" << dendl
;
1999 metadata
[i
.substr(0, eqpos
)] = i
.substr(eqpos
+ 1);
2004 * Optionally add or override client metadata fields.
// Add or override a single client-metadata key/value pair (public API,
// takes the client lock itself -- unlike populate_metadata()).
// NOTE(review): the tail of this function (presumably `metadata[k] = v;`
// and the closing braces) was dropped by the extraction -- confirm against
// the full source before editing.
2006 void Client::update_metadata(std::string
const &k
, std::string
const &v
)
// external entry point: serialize against other client operations
2008 std::lock_guard
l(client_lock
);
2009 ceph_assert(initialized
);
// warn when an existing field is being overridden
2011 auto it
= metadata
.find(k
);
2012 if (it
!= metadata
.end()) {
2013 ldout(cct
, 1) << __func__
<< " warning, overriding metadata field '" << k
2014 << "' from '" << it
->second
<< "' to '" << v
<< "'" << dendl
;
2020 MetaSession
*Client::_open_mds_session(mds_rank_t mds
)
2022 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
2023 auto addrs
= mdsmap
->get_addrs(mds
);
2024 auto em
= mds_sessions
.emplace(std::piecewise_construct
,
2025 std::forward_as_tuple(mds
),
2026 std::forward_as_tuple(mds
, messenger
->connect_to_mds(addrs
), addrs
));
2027 ceph_assert(em
.second
); /* not already present */
2028 MetaSession
*session
= &em
.first
->second
;
2030 // Maybe skip sending a request to open if this MDS daemon
2031 // has previously sent us a REJECT.
2032 if (rejected_by_mds
.count(mds
)) {
2033 if (rejected_by_mds
[mds
] == session
->addrs
) {
2034 ldout(cct
, 4) << __func__
<< " mds." << mds
<< " skipping "
2035 "because we were rejected" << dendl
;
2038 ldout(cct
, 4) << __func__
<< " mds." << mds
<< " old inst "
2039 "rejected us, trying with new inst" << dendl
;
2040 rejected_by_mds
.erase(mds
);
2044 auto m
= MClientSession::create(CEPH_SESSION_REQUEST_OPEN
);
2045 m
->metadata
= metadata
;
2046 m
->supported_features
= feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED
);
2047 session
->con
->send_message2(std::move(m
));
2051 void Client::_close_mds_session(MetaSession
*s
)
2053 ldout(cct
, 2) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2054 s
->state
= MetaSession::STATE_CLOSING
;
2055 s
->con
->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
2058 void Client::_closed_mds_session(MetaSession
*s
)
2060 ldout(cct
, 5) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2061 s
->state
= MetaSession::STATE_CLOSED
;
2062 s
->con
->mark_down();
2063 signal_context_list(s
->waiting_for_open
);
2064 mount_cond
.Signal();
2065 remove_session_caps(s
);
2066 kick_requests_closed(s
);
2067 mds_sessions
.erase(s
->mds_num
);
2070 void Client::handle_client_session(const MConstRef
<MClientSession
>& m
)
2072 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2073 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << from
<< dendl
;
2075 MetaSession
*session
= _get_mds_session(from
, m
->get_connection().get());
2077 ldout(cct
, 10) << " discarding session message from sessionless mds " << m
->get_source_inst() << dendl
;
2081 switch (m
->get_op()) {
2082 case CEPH_SESSION_OPEN
:
2084 feature_bitset_t
missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED
);
2085 missing_features
-= m
->supported_features
;
2086 if (!missing_features
.empty()) {
2087 lderr(cct
) << "mds." << from
<< " lacks required features '"
2088 << missing_features
<< "', closing session " << dendl
;
2089 rejected_by_mds
[session
->mds_num
] = session
->addrs
;
2090 _close_mds_session(session
);
2091 _closed_mds_session(session
);
2094 session
->mds_features
= std::move(m
->supported_features
);
2096 renew_caps(session
);
2097 session
->state
= MetaSession::STATE_OPEN
;
2099 mount_cond
.Signal();
2101 connect_mds_targets(from
);
2102 signal_context_list(session
->waiting_for_open
);
2106 case CEPH_SESSION_CLOSE
:
2107 _closed_mds_session(session
);
2110 case CEPH_SESSION_RENEWCAPS
:
2111 if (session
->cap_renew_seq
== m
->get_seq()) {
2112 bool was_stale
= ceph_clock_now() >= session
->cap_ttl
;
2114 session
->last_cap_renew_request
+ mdsmap
->get_session_timeout();
2116 wake_up_session_caps(session
, false);
2120 case CEPH_SESSION_STALE
:
2121 // invalidate session caps/leases
2123 session
->cap_ttl
= ceph_clock_now();
2124 session
->cap_ttl
-= 1;
2125 renew_caps(session
);
2128 case CEPH_SESSION_RECALL_STATE
:
2129 trim_caps(session
, m
->get_max_caps());
2132 case CEPH_SESSION_FLUSHMSG
:
2133 /* flush cap release */
2134 if (auto& m
= session
->release
; m
) {
2135 session
->con
->send_message2(std::move(m
));
2137 session
->con
->send_message2(MClientSession::create(CEPH_SESSION_FLUSHMSG_ACK
, m
->get_seq()));
2140 case CEPH_SESSION_FORCE_RO
:
2141 force_session_readonly(session
);
2144 case CEPH_SESSION_REJECT
:
2146 std::string_view error_str
;
2147 auto it
= m
->metadata
.find("error_string");
2148 if (it
!= m
->metadata
.end())
2149 error_str
= it
->second
;
2151 error_str
= "unknown error";
2152 lderr(cct
) << "mds." << from
<< " rejected us (" << error_str
<< ")" << dendl
;
2154 rejected_by_mds
[session
->mds_num
] = session
->addrs
;
2155 _closed_mds_session(session
);
// Scan all MDS sessions for one in the STALE state.  Caller must already
// hold client_lock (asserted below).
// NOTE(review): the return statements and closing braces were dropped by the
// extraction (presumably `return true;` inside the loop, `return false;`
// after it) -- confirm against the full source before editing.
2164 bool Client::_any_stale_sessions() const
// internal helper: lock must be held by the caller, not taken here
2166 ceph_assert(client_lock
.is_locked_by_me());
2168 for (const auto &p
: mds_sessions
) {
2169 if (p
.second
.state
== MetaSession::STATE_STALE
) {
// Walk all MDS sessions and tear down any that have gone STALE via
// _closed_mds_session().  The loop header advances the iterator manually
// (empty third clause) because _closed_mds_session() erases the session
// from mds_sessions, which would invalidate an iterator still pointing at
// it.  NOTE(review): the line that advances the iterator (original line
// 2183, presumably `++it;` before the erase) was dropped by the
// extraction -- confirm against the full source before editing.
2177 void Client::_kick_stale_sessions()
2179 ldout(cct
, 1) << __func__
<< dendl
;
2181 for (auto it
= mds_sessions
.begin(); it
!= mds_sessions
.end(); ) {
2182 MetaSession
&s
= it
->second
;
2184 if (s
.state
== MetaSession::STATE_STALE
)
2185 _closed_mds_session(&s
);
2189 void Client::send_request(MetaRequest
*request
, MetaSession
*session
,
2190 bool drop_cap_releases
)
2193 mds_rank_t mds
= session
->mds_num
;
2194 ldout(cct
, 10) << __func__
<< " rebuilding request " << request
->get_tid()
2195 << " for mds." << mds
<< dendl
;
2196 auto r
= build_client_request(request
);
2197 if (request
->dentry()) {
2198 r
->set_dentry_wanted();
2200 if (request
->got_unsafe
) {
2201 r
->set_replayed_op();
2202 if (request
->target
)
2203 r
->head
.ino
= request
->target
->ino
;
2205 encode_cap_releases(request
, mds
);
2206 if (drop_cap_releases
) // we haven't send cap reconnect yet, drop cap releases
2207 request
->cap_releases
.clear();
2209 r
->releases
.swap(request
->cap_releases
);
2211 r
->set_mdsmap_epoch(mdsmap
->get_epoch());
2212 if (r
->head
.op
== CEPH_MDS_OP_SETXATTR
) {
2213 objecter
->with_osdmap([r
](const OSDMap
& o
) {
2214 r
->set_osdmap_epoch(o
.get_epoch());
2218 if (request
->mds
== -1) {
2219 request
->sent_stamp
= ceph_clock_now();
2220 ldout(cct
, 20) << __func__
<< " set sent_stamp to " << request
->sent_stamp
<< dendl
;
2224 Inode
*in
= request
->inode();
2226 auto it
= in
->caps
.find(mds
);
2227 if (it
!= in
->caps
.end()) {
2228 request
->sent_on_mseq
= it
->second
.mseq
;
2232 session
->requests
.push_back(&request
->item
);
2234 ldout(cct
, 10) << __func__
<< " " << *r
<< " to mds." << mds
<< dendl
;
2235 session
->con
->send_message2(std::move(r
));
2238 MClientRequest::ref
Client::build_client_request(MetaRequest
*request
)
2240 auto req
= MClientRequest::create(request
->get_op());
2241 req
->set_tid(request
->tid
);
2242 req
->set_stamp(request
->op_stamp
);
2243 memcpy(&req
->head
, &request
->head
, sizeof(ceph_mds_request_head
));
2245 // if the filepath's haven't been set, set them!
2246 if (request
->path
.empty()) {
2247 Inode
*in
= request
->inode();
2248 Dentry
*de
= request
->dentry();
2250 in
->make_nosnap_relative_path(request
->path
);
2253 de
->inode
->make_nosnap_relative_path(request
->path
);
2255 de
->dir
->parent_inode
->make_nosnap_relative_path(request
->path
);
2256 request
->path
.push_dentry(de
->name
);
2258 else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2259 << " No path, inode, or appropriately-endowed dentry given!"
2261 } else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2262 << " No path, inode, or dentry given!"
2265 req
->set_filepath(request
->get_filepath());
2266 req
->set_filepath2(request
->get_filepath2());
2267 req
->set_data(request
->data
);
2268 req
->set_retry_attempt(request
->retry_attempt
++);
2269 req
->head
.num_fwd
= request
->num_fwd
;
2271 int gid_count
= request
->perms
.get_gids(&_gids
);
2272 req
->set_gid_list(gid_count
, _gids
);
2278 void Client::handle_client_request_forward(const MConstRef
<MClientRequestForward
>& fwd
)
2280 mds_rank_t mds
= mds_rank_t(fwd
->get_source().num());
2281 MetaSession
*session
= _get_mds_session(mds
, fwd
->get_connection().get());
2285 ceph_tid_t tid
= fwd
->get_tid();
2287 if (mds_requests
.count(tid
) == 0) {
2288 ldout(cct
, 10) << __func__
<< " no pending request on tid " << tid
<< dendl
;
2292 MetaRequest
*request
= mds_requests
[tid
];
2293 ceph_assert(request
);
2295 // reset retry counter
2296 request
->retry_attempt
= 0;
2298 // request not forwarded, or dest mds has no session.
2300 ldout(cct
, 10) << __func__
<< " tid " << tid
2301 << " fwd " << fwd
->get_num_fwd()
2302 << " to mds." << fwd
->get_dest_mds()
2303 << ", resending to " << fwd
->get_dest_mds()
2307 request
->item
.remove_myself();
2308 request
->num_fwd
= fwd
->get_num_fwd();
2309 request
->resend_mds
= fwd
->get_dest_mds();
2310 request
->caller_cond
->Signal();
// Return whether the request's op mutates a directory's contents (create,
// link, unlink, rename, mkdir, rmdir, symlink).  Used when handling unsafe
// replies to track pending ops against the parent directory inode.
// NOTE(review): the return statements and closing brace were dropped by the
// extraction (presumably `return true;` / `return false;`) -- confirm
// against the full source before editing.
2313 bool Client::is_dir_operation(MetaRequest
*req
)
2315 int op
= req
->get_op();
2316 if (op
== CEPH_MDS_OP_MKNOD
|| op
== CEPH_MDS_OP_LINK
||
2317 op
== CEPH_MDS_OP_UNLINK
|| op
== CEPH_MDS_OP_RENAME
||
2318 op
== CEPH_MDS_OP_MKDIR
|| op
== CEPH_MDS_OP_RMDIR
||
2319 op
== CEPH_MDS_OP_SYMLINK
|| op
== CEPH_MDS_OP_CREATE
)
2324 void Client::handle_client_reply(const MConstRef
<MClientReply
>& reply
)
2326 mds_rank_t mds_num
= mds_rank_t(reply
->get_source().num());
2327 MetaSession
*session
= _get_mds_session(mds_num
, reply
->get_connection().get());
2332 ceph_tid_t tid
= reply
->get_tid();
2333 bool is_safe
= reply
->is_safe();
2335 if (mds_requests
.count(tid
) == 0) {
2336 lderr(cct
) << __func__
<< " no pending request on tid " << tid
2337 << " safe is:" << is_safe
<< dendl
;
2340 MetaRequest
*request
= mds_requests
.at(tid
);
2342 ldout(cct
, 20) << __func__
<< " got a reply. Safe:" << is_safe
2343 << " tid " << tid
<< dendl
;
2345 if (request
->got_unsafe
&& !is_safe
) {
2346 //duplicate response
2347 ldout(cct
, 0) << "got a duplicate reply on tid " << tid
<< " from mds "
2348 << mds_num
<< " safe:" << is_safe
<< dendl
;
2352 if (-ESTALE
== reply
->get_result()) { // see if we can get to proper MDS
2353 ldout(cct
, 20) << "got ESTALE on tid " << request
->tid
2354 << " from mds." << request
->mds
<< dendl
;
2355 request
->send_to_auth
= true;
2356 request
->resend_mds
= choose_target_mds(request
);
2357 Inode
*in
= request
->inode();
2358 std::map
<mds_rank_t
, Cap
>::const_iterator it
;
2359 if (request
->resend_mds
>= 0 &&
2360 request
->resend_mds
== request
->mds
&&
2362 (it
= in
->caps
.find(request
->resend_mds
)) != in
->caps
.end() ||
2363 request
->sent_on_mseq
== it
->second
.mseq
)) {
2364 ldout(cct
, 20) << "have to return ESTALE" << dendl
;
2366 request
->caller_cond
->Signal();
2371 ceph_assert(!request
->reply
);
2372 request
->reply
= reply
;
2373 insert_trace(request
, session
);
2375 // Handle unsafe reply
2377 request
->got_unsafe
= true;
2378 session
->unsafe_requests
.push_back(&request
->unsafe_item
);
2379 if (is_dir_operation(request
)) {
2380 Inode
*dir
= request
->inode();
2382 dir
->unsafe_ops
.push_back(&request
->unsafe_dir_item
);
2384 if (request
->target
) {
2385 InodeRef
&in
= request
->target
;
2386 in
->unsafe_ops
.push_back(&request
->unsafe_target_item
);
2390 // Only signal the caller once (on the first reply):
2391 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2392 if (!is_safe
|| !request
->got_unsafe
) {
2394 request
->dispatch_cond
= &cond
;
2397 ldout(cct
, 20) << __func__
<< " signalling caller " << (void*)request
->caller_cond
<< dendl
;
2398 request
->caller_cond
->Signal();
2400 // wake for kick back
2401 while (request
->dispatch_cond
) {
2402 ldout(cct
, 20) << __func__
<< " awaiting kickback on tid " << tid
<< " " << &cond
<< dendl
;
2403 cond
.Wait(client_lock
);
2408 // the filesystem change is committed to disk
2409 // we're done, clean up
2410 if (request
->got_unsafe
) {
2411 request
->unsafe_item
.remove_myself();
2412 request
->unsafe_dir_item
.remove_myself();
2413 request
->unsafe_target_item
.remove_myself();
2414 signal_cond_list(request
->waitfor_safe
);
2416 request
->item
.remove_myself();
2417 unregister_request(request
);
2420 mount_cond
.Signal();
2423 void Client::_handle_full_flag(int64_t pool
)
2425 ldout(cct
, 1) << __func__
<< ": FULL: cancelling outstanding operations "
2426 << "on " << pool
<< dendl
;
2427 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2428 // to do this rather than blocking, because otherwise when we fill up we
2429 // potentially lock caps forever on files with dirty pages, and we need
2430 // to be able to release those caps to the MDS so that it can delete files
2431 // and free up space.
2432 epoch_t cancelled_epoch
= objecter
->op_cancel_writes(-ENOSPC
, pool
);
2434 // For all inodes with layouts in this pool and a pending flush write op
2435 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2436 // from ObjectCacher so that it doesn't re-issue the write in response to
2437 // the ENOSPC error.
2438 // Fortunately since we're cancelling everything in a given pool, we don't
2439 // need to know which ops belong to which ObjectSet, we can just blow all
2440 // the un-flushed cached data away and mark any dirty inodes' async_err
2441 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2442 // affecting this pool, and all the objectsets we're purging were also
2444 for (unordered_map
<vinodeno_t
,Inode
*>::iterator i
= inode_map
.begin();
2445 i
!= inode_map
.end(); ++i
)
2447 Inode
*inode
= i
->second
;
2448 if (inode
->oset
.dirty_or_tx
2449 && (pool
== -1 || inode
->layout
.pool_id
== pool
)) {
2450 ldout(cct
, 4) << __func__
<< ": FULL: inode 0x" << std::hex
<< i
->first
<< std::dec
2451 << " has dirty objects, purging and setting ENOSPC" << dendl
;
2452 objectcacher
->purge_set(&inode
->oset
);
2453 inode
->set_async_err(-ENOSPC
);
2457 if (cancelled_epoch
!= (epoch_t
)-1) {
2458 set_cap_epoch_barrier(cancelled_epoch
);
2462 void Client::handle_osd_map(const MConstRef
<MOSDMap
>& m
)
2464 std::set
<entity_addr_t
> new_blacklists
;
2465 objecter
->consume_blacklist_events(&new_blacklists
);
2467 const auto myaddrs
= messenger
->get_myaddrs();
2468 bool new_blacklist
= false;
2469 bool prenautilus
= objecter
->with_osdmap(
2470 [&](const OSDMap
& o
) {
2471 return o
.require_osd_release
< CEPH_RELEASE_NAUTILUS
;
2474 for (auto a
: myaddrs
.v
) {
2475 // blacklist entries are always TYPE_ANY for nautilus+
2476 a
.set_type(entity_addr_t::TYPE_ANY
);
2477 if (new_blacklists
.count(a
)) {
2478 new_blacklist
= true;
2482 // ...except pre-nautilus, they were TYPE_LEGACY
2483 a
.set_type(entity_addr_t::TYPE_LEGACY
);
2484 if (new_blacklists
.count(a
)) {
2485 new_blacklist
= true;
2491 if (new_blacklist
) {
2492 auto epoch
= objecter
->with_osdmap([](const OSDMap
&o
){
2493 return o
.get_epoch();
2495 lderr(cct
) << "I was blacklisted at osd epoch " << epoch
<< dendl
;
2498 _abort_mds_sessions(-EBLACKLISTED
);
2500 // Since we know all our OSD ops will fail, cancel them all preemtively,
2501 // so that on an unhealthy cluster we can umount promptly even if e.g.
2502 // some PGs were inaccessible.
2503 objecter
->op_cancel_writes(-EBLACKLISTED
);
2505 } else if (blacklisted
) {
2506 // Handle case where we were blacklisted but no longer are
2507 blacklisted
= objecter
->with_osdmap([myaddrs
](const OSDMap
&o
){
2508 return o
.is_blacklisted(myaddrs
);});
2511 // Always subscribe to next osdmap for blacklisted client
2512 // until this client is not blacklisted.
2514 objecter
->maybe_request_map();
2517 if (objecter
->osdmap_full_flag()) {
2518 _handle_full_flag(-1);
2520 // Accumulate local list of full pools so that I can drop
2521 // the objecter lock before re-entering objecter in
2523 std::vector
<int64_t> full_pools
;
2525 objecter
->with_osdmap([&full_pools
](const OSDMap
&o
) {
2526 for (const auto& kv
: o
.get_pools()) {
2527 if (kv
.second
.has_flag(pg_pool_t::FLAG_FULL
)) {
2528 full_pools
.push_back(kv
.first
);
2533 for (auto p
: full_pools
)
2534 _handle_full_flag(p
);
2536 // Subscribe to subsequent maps to watch for the full flag going
2537 // away. For the global full flag objecter does this for us, but
2538 // it pays no attention to the per-pool full flag so in this branch
2539 // we do it ourselves.
2540 if (!full_pools
.empty()) {
2541 objecter
->maybe_request_map();
2547 // ------------------------
2548 // incoming messages
2551 bool Client::ms_dispatch2(const MessageRef
&m
)
2553 std::lock_guard
l(client_lock
);
2555 ldout(cct
, 10) << "inactive, discarding " << *m
<< dendl
;
2559 switch (m
->get_type()) {
2560 // mounting and mds sessions
2561 case CEPH_MSG_MDS_MAP
:
2562 handle_mds_map(MMDSMap::msgref_cast(m
));
2564 case CEPH_MSG_FS_MAP
:
2565 handle_fs_map(MFSMap::msgref_cast(m
));
2567 case CEPH_MSG_FS_MAP_USER
:
2568 handle_fs_map_user(MFSMapUser::msgref_cast(m
));
2570 case CEPH_MSG_CLIENT_SESSION
:
2571 handle_client_session(MClientSession::msgref_cast(m
));
2574 case CEPH_MSG_OSD_MAP
:
2575 handle_osd_map(MOSDMap::msgref_cast(m
));
2579 case CEPH_MSG_CLIENT_REQUEST_FORWARD
:
2580 handle_client_request_forward(MClientRequestForward::msgref_cast(m
));
2582 case CEPH_MSG_CLIENT_REPLY
:
2583 handle_client_reply(MClientReply::msgref_cast(m
));
2587 case CEPH_MSG_CLIENT_RECLAIM_REPLY
:
2588 handle_client_reclaim_reply(MClientReclaimReply::msgref_cast(m
));
2591 case CEPH_MSG_CLIENT_SNAP
:
2592 handle_snap(MClientSnap::msgref_cast(m
));
2594 case CEPH_MSG_CLIENT_CAPS
:
2595 handle_caps(MClientCaps::msgref_cast(m
));
2597 case CEPH_MSG_CLIENT_LEASE
:
2598 handle_lease(MClientLease::msgref_cast(m
));
2600 case MSG_COMMAND_REPLY
:
2601 if (m
->get_source().type() == CEPH_ENTITY_TYPE_MDS
) {
2602 handle_command_reply(MCommandReply::msgref_cast(m
));
2607 case CEPH_MSG_CLIENT_QUOTA
:
2608 handle_quota(MClientQuota::msgref_cast(m
));
2617 ldout(cct
, 10) << "unmounting: trim pass, size was " << lru
.lru_get_size()
2618 << "+" << inode_map
.size() << dendl
;
2619 long unsigned size
= lru
.lru_get_size() + inode_map
.size();
2621 if (size
< lru
.lru_get_size() + inode_map
.size()) {
2622 ldout(cct
, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl
;
2623 mount_cond
.Signal();
2625 ldout(cct
, 10) << "unmounting: trim pass, size still " << lru
.lru_get_size()
2626 << "+" << inode_map
.size() << dendl
;
2633 void Client::handle_fs_map(const MConstRef
<MFSMap
>& m
)
2635 fsmap
.reset(new FSMap(m
->get_fsmap()));
2637 signal_cond_list(waiting_for_fsmap
);
2639 monclient
->sub_got("fsmap", fsmap
->get_epoch());
2642 void Client::handle_fs_map_user(const MConstRef
<MFSMapUser
>& m
)
2644 fsmap_user
.reset(new FSMapUser
);
2645 *fsmap_user
= m
->get_fsmap();
2647 monclient
->sub_got("fsmap.user", fsmap_user
->get_epoch());
2648 signal_cond_list(waiting_for_fsmap
);
2651 void Client::handle_mds_map(const MConstRef
<MMDSMap
>& m
)
2653 mds_gid_t old_inc
, new_inc
;
2654 if (m
->get_epoch() <= mdsmap
->get_epoch()) {
2655 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch()
2656 << " is identical to or older than our "
2657 << mdsmap
->get_epoch() << dendl
;
2661 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch() << dendl
;
2663 std::unique_ptr
<MDSMap
> oldmap(new MDSMap
);
2664 oldmap
.swap(mdsmap
);
2666 mdsmap
->decode(m
->get_encoded());
2668 // Cancel any commands for missing or laggy GIDs
2669 std::list
<ceph_tid_t
> cancel_ops
;
2670 auto &commands
= command_table
.get_commands();
2671 for (const auto &i
: commands
) {
2672 auto &op
= i
.second
;
2673 const mds_gid_t op_mds_gid
= op
.mds_gid
;
2674 if (mdsmap
->is_dne_gid(op_mds_gid
) || mdsmap
->is_laggy_gid(op_mds_gid
)) {
2675 ldout(cct
, 1) << __func__
<< ": cancelling command op " << i
.first
<< dendl
;
2676 cancel_ops
.push_back(i
.first
);
2678 std::ostringstream ss
;
2679 ss
<< "MDS " << op_mds_gid
<< " went away";
2680 *(op
.outs
) = ss
.str();
2682 op
.con
->mark_down();
2684 op
.on_finish
->complete(-ETIMEDOUT
);
2689 for (std::list
<ceph_tid_t
>::iterator i
= cancel_ops
.begin();
2690 i
!= cancel_ops
.end(); ++i
) {
2691 command_table
.erase(*i
);
2695 for (auto p
= mds_sessions
.begin(); p
!= mds_sessions
.end(); ) {
2696 mds_rank_t mds
= p
->first
;
2697 MetaSession
*session
= &p
->second
;
2700 int oldstate
= oldmap
->get_state(mds
);
2701 int newstate
= mdsmap
->get_state(mds
);
2702 if (!mdsmap
->is_up(mds
)) {
2703 session
->con
->mark_down();
2704 } else if (mdsmap
->get_addrs(mds
) != session
->addrs
) {
2705 old_inc
= oldmap
->get_incarnation(mds
);
2706 new_inc
= mdsmap
->get_incarnation(mds
);
2707 if (old_inc
!= new_inc
) {
2708 ldout(cct
, 1) << "mds incarnation changed from "
2709 << old_inc
<< " to " << new_inc
<< dendl
;
2710 oldstate
= MDSMap::STATE_NULL
;
2712 session
->con
->mark_down();
2713 session
->addrs
= mdsmap
->get_addrs(mds
);
2714 // When new MDS starts to take over, notify kernel to trim unused entries
2715 // in its dcache/icache. Hopefully, the kernel will release some unused
2716 // inodes before the new MDS enters reconnect state.
2717 trim_cache_for_reconnect(session
);
2718 } else if (oldstate
== newstate
)
2719 continue; // no change
2721 session
->mds_state
= newstate
;
2722 if (newstate
== MDSMap::STATE_RECONNECT
) {
2723 session
->con
= messenger
->connect_to_mds(session
->addrs
);
2724 send_reconnect(session
);
2725 } else if (newstate
> MDSMap::STATE_RECONNECT
) {
2726 if (oldstate
< MDSMap::STATE_RECONNECT
) {
2727 ldout(cct
, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl
;
2728 _closed_mds_session(session
);
2731 if (newstate
>= MDSMap::STATE_ACTIVE
) {
2732 if (oldstate
< MDSMap::STATE_ACTIVE
) {
2733 // kick new requests
2734 kick_requests(session
);
2735 kick_flushing_caps(session
);
2736 signal_context_list(session
->waiting_for_open
);
2737 wake_up_session_caps(session
, true);
2739 connect_mds_targets(mds
);
2741 } else if (newstate
== MDSMap::STATE_NULL
&&
2742 mds
>= mdsmap
->get_max_mds()) {
2743 _closed_mds_session(session
);
2747 // kick any waiting threads
2748 signal_cond_list(waiting_for_mdsmap
);
2750 monclient
->sub_got("mdsmap", mdsmap
->get_epoch());
2753 void Client::send_reconnect(MetaSession
*session
)
2755 mds_rank_t mds
= session
->mds_num
;
2756 ldout(cct
, 10) << __func__
<< " to mds." << mds
<< dendl
;
2758 // trim unused caps to reduce MDS's cache rejoin time
2759 trim_cache_for_reconnect(session
);
2761 session
->readonly
= false;
2763 session
->release
.reset();
2765 // reset my cap seq number
2767 //connect to the mds' offload targets
2768 connect_mds_targets(mds
);
2769 //make sure unsafe requests get saved
2770 resend_unsafe_requests(session
);
2772 early_kick_flushing_caps(session
);
2774 auto m
= MClientReconnect::create();
2775 bool allow_multi
= session
->mds_features
.test(CEPHFS_FEATURE_MULTI_RECONNECT
);
2777 // i have an open session.
2778 ceph::unordered_set
<inodeno_t
> did_snaprealm
;
2779 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
= inode_map
.begin();
2780 p
!= inode_map
.end();
2782 Inode
*in
= p
->second
;
2783 auto it
= in
->caps
.find(mds
);
2784 if (it
!= in
->caps
.end()) {
2786 m
->get_approx_size() >= (std::numeric_limits
<int>::max() >> 1)) {
2788 session
->con
->send_message2(std::move(m
));
2790 m
= MClientReconnect::create();
2793 Cap
&cap
= it
->second
;
2794 ldout(cct
, 10) << " caps on " << p
->first
2795 << " " << ccap_string(cap
.issued
)
2796 << " wants " << ccap_string(in
->caps_wanted())
2799 in
->make_long_path(path
);
2800 ldout(cct
, 10) << " path " << path
<< dendl
;
2803 _encode_filelocks(in
, flockbl
);
2805 cap
.seq
= 0; // reset seq.
2806 cap
.issue_seq
= 0; // reset seq.
2807 cap
.mseq
= 0; // reset seq.
2808 // cap gen should catch up with session cap_gen
2809 if (cap
.gen
< session
->cap_gen
) {
2810 cap
.gen
= session
->cap_gen
;
2811 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
2813 cap
.issued
= cap
.implemented
;
2815 snapid_t snap_follows
= 0;
2816 if (!in
->cap_snaps
.empty())
2817 snap_follows
= in
->cap_snaps
.begin()->first
;
2819 m
->add_cap(p
->first
.ino
,
2821 path
.get_ino(), path
.get_path(), // ino
2822 in
->caps_wanted(), // wanted
2823 cap
.issued
, // issued
2828 if (did_snaprealm
.count(in
->snaprealm
->ino
) == 0) {
2829 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
2830 m
->add_snaprealm(in
->snaprealm
->ino
, in
->snaprealm
->seq
, in
->snaprealm
->parent
);
2831 did_snaprealm
.insert(in
->snaprealm
->ino
);
2837 m
->set_encoding_version(0); // use connection features to choose encoding
2838 session
->con
->send_message2(std::move(m
));
2840 mount_cond
.Signal();
2842 if (session
->reclaim_state
== MetaSession::RECLAIMING
)
2843 signal_cond_list(waiting_for_reclaim
);
2847 void Client::kick_requests(MetaSession
*session
)
2849 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
2850 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2851 p
!= mds_requests
.end();
2853 MetaRequest
*req
= p
->second
;
2854 if (req
->got_unsafe
)
2856 if (req
->aborted()) {
2857 if (req
->caller_cond
) {
2859 req
->caller_cond
->Signal();
2863 if (req
->retry_attempt
> 0)
2864 continue; // new requests only
2865 if (req
->mds
== session
->mds_num
) {
2866 send_request(p
->second
, session
);
2871 void Client::resend_unsafe_requests(MetaSession
*session
)
2873 for (xlist
<MetaRequest
*>::iterator iter
= session
->unsafe_requests
.begin();
2876 send_request(*iter
, session
);
2878 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2879 // process completed requests in clientreplay stage.
2880 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2881 p
!= mds_requests
.end();
2883 MetaRequest
*req
= p
->second
;
2884 if (req
->got_unsafe
)
2888 if (req
->retry_attempt
== 0)
2889 continue; // old requests only
2890 if (req
->mds
== session
->mds_num
)
2891 send_request(req
, session
, true);
2895 void Client::wait_unsafe_requests()
2897 list
<MetaRequest
*> last_unsafe_reqs
;
2898 for (const auto &p
: mds_sessions
) {
2899 const MetaSession
&s
= p
.second
;
2900 if (!s
.unsafe_requests
.empty()) {
2901 MetaRequest
*req
= s
.unsafe_requests
.back();
2903 last_unsafe_reqs
.push_back(req
);
2907 for (list
<MetaRequest
*>::iterator p
= last_unsafe_reqs
.begin();
2908 p
!= last_unsafe_reqs
.end();
2910 MetaRequest
*req
= *p
;
2911 if (req
->unsafe_item
.is_on_list())
2912 wait_on_list(req
->waitfor_safe
);
2917 void Client::kick_requests_closed(MetaSession
*session
)
2919 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
2920 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2921 p
!= mds_requests
.end(); ) {
2922 MetaRequest
*req
= p
->second
;
2924 if (req
->mds
== session
->mds_num
) {
2925 if (req
->caller_cond
) {
2927 req
->caller_cond
->Signal();
2929 req
->item
.remove_myself();
2930 if (req
->got_unsafe
) {
2931 lderr(cct
) << __func__
<< " removing unsafe request " << req
->get_tid() << dendl
;
2932 req
->unsafe_item
.remove_myself();
2933 if (is_dir_operation(req
)) {
2934 Inode
*dir
= req
->inode();
2936 dir
->set_async_err(-EIO
);
2937 lderr(cct
) << "kick_requests_closed drop req of inode(dir) : "
2938 << dir
->ino
<< " " << req
->get_tid() << dendl
;
2939 req
->unsafe_dir_item
.remove_myself();
2942 InodeRef
&in
= req
->target
;
2943 in
->set_async_err(-EIO
);
2944 lderr(cct
) << "kick_requests_closed drop req of inode : "
2945 << in
->ino
<< " " << req
->get_tid() << dendl
;
2946 req
->unsafe_target_item
.remove_myself();
2948 signal_cond_list(req
->waitfor_safe
);
2949 unregister_request(req
);
2953 ceph_assert(session
->requests
.empty());
2954 ceph_assert(session
->unsafe_requests
.empty());
2964 void Client::got_mds_push(MetaSession
*s
)
2967 ldout(cct
, 10) << " mds." << s
->mds_num
<< " seq now " << s
->seq
<< dendl
;
2968 if (s
->state
== MetaSession::STATE_CLOSING
) {
2969 s
->con
->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
2973 void Client::handle_lease(const MConstRef
<MClientLease
>& m
)
2975 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
2977 ceph_assert(m
->get_action() == CEPH_MDS_LEASE_REVOKE
);
2979 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
2980 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
2985 got_mds_push(session
);
2987 ceph_seq_t seq
= m
->get_seq();
2990 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
2991 if (inode_map
.count(vino
) == 0) {
2992 ldout(cct
, 10) << " don't have vino " << vino
<< dendl
;
2995 in
= inode_map
[vino
];
2997 if (m
->get_mask() & CEPH_LOCK_DN
) {
2998 if (!in
->dir
|| in
->dir
->dentries
.count(m
->dname
) == 0) {
2999 ldout(cct
, 10) << " don't have dir|dentry " << m
->get_ino() << "/" << m
->dname
<<dendl
;
3002 Dentry
*dn
= in
->dir
->dentries
[m
->dname
];
3003 ldout(cct
, 10) << " revoked DN lease on " << dn
<< dendl
;
3009 auto reply
= MClientLease::create(CEPH_MDS_LEASE_RELEASE
, seq
, m
->get_mask(), m
->get_ino(), m
->get_first(), m
->get_last(), m
->dname
);
3010 m
->get_connection()->send_message2(std::move(reply
));
3014 void Client::put_inode(Inode
*in
, int n
)
3016 ldout(cct
, 10) << __func__
<< " on " << *in
<< dendl
;
3017 int left
= in
->_put(n
);
3020 remove_all_caps(in
);
3022 ldout(cct
, 10) << __func__
<< " deleting " << *in
<< dendl
;
3023 bool unclean
= objectcacher
->release_set(&in
->oset
);
3024 ceph_assert(!unclean
);
3025 inode_map
.erase(in
->vino());
3026 if (use_faked_inos())
3027 _release_faked_ino(in
);
3032 while (!root_parents
.empty())
3033 root_parents
.erase(root_parents
.begin());
3040 void Client::close_dir(Dir
*dir
)
3042 Inode
*in
= dir
->parent_inode
;
3043 ldout(cct
, 15) << __func__
<< " dir " << dir
<< " on " << in
<< dendl
;
3044 ceph_assert(dir
->is_empty());
3045 ceph_assert(in
->dir
== dir
);
3046 ceph_assert(in
->dentries
.size() < 2); // dirs can't be hard-linked
3047 if (!in
->dentries
.empty())
3048 in
->get_first_parent()->put(); // unpin dentry
3052 put_inode(in
); // unpin inode
3056 * Don't call this with in==NULL, use get_or_create for that
3057 * leave dn set to default NULL unless you're trying to add
3058 * a new inode to a pre-created Dentry
3060 Dentry
* Client::link(Dir
*dir
, const string
& name
, Inode
*in
, Dentry
*dn
)
3063 // create a new Dentry
3064 dn
= new Dentry(dir
, name
);
3066 lru
.lru_insert_mid(dn
); // mid or top?
3068 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
3069 << " dn " << dn
<< " (new dn)" << dendl
;
3071 ceph_assert(!dn
->inode
);
3072 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
3073 << " dn " << dn
<< " (old dn)" << dendl
;
3076 if (in
) { // link to inode
3078 // only one parent for directories!
3079 if (in
->is_dir() && !in
->dentries
.empty()) {
3080 tmp_ref
= in
; // prevent unlink below from freeing the inode.
3081 Dentry
*olddn
= in
->get_first_parent();
3082 ceph_assert(olddn
->dir
!= dir
|| olddn
->name
!= name
);
3083 Inode
*old_diri
= olddn
->dir
->parent_inode
;
3084 old_diri
->dir_release_count
++;
3085 clear_dir_complete_and_ordered(old_diri
, true);
3086 unlink(olddn
, true, true); // keep dir, dentry
3090 ldout(cct
, 20) << "link inode " << in
<< " parents now " << in
->dentries
<< dendl
;
3096 void Client::unlink(Dentry
*dn
, bool keepdir
, bool keepdentry
)
3098 InodeRef
in(dn
->inode
);
3099 ldout(cct
, 15) << "unlink dir " << dn
->dir
->parent_inode
<< " '" << dn
->name
<< "' dn " << dn
3100 << " inode " << dn
->inode
<< dendl
;
3102 // unlink from inode
3105 ldout(cct
, 20) << "unlink inode " << in
<< " parents now " << in
->dentries
<< dendl
;
3111 ldout(cct
, 15) << "unlink removing '" << dn
->name
<< "' dn " << dn
<< dendl
;
3121 if (dir
->is_empty() && !keepdir
)
3127 * For asynchronous flushes, check for errors from the IO and
3128 * update the inode if necessary
3130 class C_Client_FlushComplete
: public Context
{
3135 C_Client_FlushComplete(Client
*c
, Inode
*in
) : client(c
), inode(in
) { }
3136 void finish(int r
) override
{
3137 ceph_assert(client
->client_lock
.is_locked_by_me());
3139 client_t
const whoami
= client
->whoami
; // For the benefit of ldout prefix
3140 ldout(client
->cct
, 1) << "I/O error from flush on inode " << inode
3141 << " 0x" << std::hex
<< inode
->ino
<< std::dec
3142 << ": " << r
<< "(" << cpp_strerror(r
) << ")" << dendl
;
3143 inode
->set_async_err(r
);
3153 void Client::get_cap_ref(Inode
*in
, int cap
)
3155 if ((cap
& CEPH_CAP_FILE_BUFFER
) &&
3156 in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] == 0) {
3157 ldout(cct
, 5) << __func__
<< " got first FILE_BUFFER ref on " << *in
<< dendl
;
3160 if ((cap
& CEPH_CAP_FILE_CACHE
) &&
3161 in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
3162 ldout(cct
, 5) << __func__
<< " got first FILE_CACHE ref on " << *in
<< dendl
;
3165 in
->get_cap_ref(cap
);
3168 void Client::put_cap_ref(Inode
*in
, int cap
)
3170 int last
= in
->put_cap_ref(cap
);
3173 int drop
= last
& ~in
->caps_issued();
3174 if (in
->snapid
== CEPH_NOSNAP
) {
3175 if ((last
& CEPH_CAP_FILE_WR
) &&
3176 !in
->cap_snaps
.empty() &&
3177 in
->cap_snaps
.rbegin()->second
.writing
) {
3178 ldout(cct
, 10) << __func__
<< " finishing pending cap_snap on " << *in
<< dendl
;
3179 in
->cap_snaps
.rbegin()->second
.writing
= 0;
3180 finish_cap_snap(in
, in
->cap_snaps
.rbegin()->second
, get_caps_used(in
));
3181 signal_cond_list(in
->waitfor_caps
); // wake up blocked sync writers
3183 if (last
& CEPH_CAP_FILE_BUFFER
) {
3184 for (auto &p
: in
->cap_snaps
)
3185 p
.second
.dirty_data
= 0;
3186 signal_cond_list(in
->waitfor_commit
);
3187 ldout(cct
, 5) << __func__
<< " dropped last FILE_BUFFER ref on " << *in
<< dendl
;
3191 if (last
& CEPH_CAP_FILE_CACHE
) {
3192 ldout(cct
, 5) << __func__
<< " dropped last FILE_CACHE ref on " << *in
<< dendl
;
3198 put_inode(in
, put_nref
);
3202 int Client::get_caps(Inode
*in
, int need
, int want
, int *phave
, loff_t endoff
)
3204 int r
= check_pool_perm(in
, need
);
3209 int file_wanted
= in
->caps_file_wanted();
3210 if ((file_wanted
& need
) != need
) {
3211 ldout(cct
, 10) << "get_caps " << *in
<< " need " << ccap_string(need
)
3212 << " file_wanted " << ccap_string(file_wanted
) << ", EBADF "
3218 int have
= in
->caps_issued(&implemented
);
3220 bool waitfor_caps
= false;
3221 bool waitfor_commit
= false;
3223 if (have
& need
& CEPH_CAP_FILE_WR
) {
3225 (endoff
>= (loff_t
)in
->max_size
||
3226 endoff
> (loff_t
)(in
->size
<< 1)) &&
3227 endoff
> (loff_t
)in
->wanted_max_size
) {
3228 ldout(cct
, 10) << "wanted_max_size " << in
->wanted_max_size
<< " -> " << endoff
<< dendl
;
3229 in
->wanted_max_size
= endoff
;
3233 if (endoff
>= 0 && endoff
> (loff_t
)in
->max_size
) {
3234 ldout(cct
, 10) << "waiting on max_size, endoff " << endoff
<< " max_size " << in
->max_size
<< " on " << *in
<< dendl
;
3235 waitfor_caps
= true;
3237 if (!in
->cap_snaps
.empty()) {
3238 if (in
->cap_snaps
.rbegin()->second
.writing
) {
3239 ldout(cct
, 10) << "waiting on cap_snap write to complete" << dendl
;
3240 waitfor_caps
= true;
3242 for (auto &p
: in
->cap_snaps
) {
3243 if (p
.second
.dirty_data
) {
3244 waitfor_commit
= true;
3248 if (waitfor_commit
) {
3249 _flush(in
, new C_Client_FlushComplete(this, in
));
3250 ldout(cct
, 10) << "waiting for WRBUFFER to get dropped" << dendl
;
3255 if (!waitfor_caps
&& !waitfor_commit
) {
3256 if ((have
& need
) == need
) {
3257 int revoking
= implemented
& ~have
;
3258 ldout(cct
, 10) << "get_caps " << *in
<< " have " << ccap_string(have
)
3259 << " need " << ccap_string(need
) << " want " << ccap_string(want
)
3260 << " revoking " << ccap_string(revoking
)
3262 if ((revoking
& want
) == 0) {
3263 *phave
= need
| (have
& want
);
3264 in
->get_cap_ref(need
);
3268 ldout(cct
, 10) << "waiting for caps " << *in
<< " need " << ccap_string(need
) << " want " << ccap_string(want
) << dendl
;
3269 waitfor_caps
= true;
3272 if ((need
& CEPH_CAP_FILE_WR
) && in
->auth_cap
&&
3273 in
->auth_cap
->session
->readonly
)
3276 if (in
->flags
& I_CAP_DROPPED
) {
3277 int mds_wanted
= in
->caps_mds_wanted();
3278 if ((mds_wanted
& need
) != need
) {
3279 int ret
= _renew_caps(in
);
3284 if (!(file_wanted
& ~mds_wanted
))
3285 in
->flags
&= ~I_CAP_DROPPED
;
3289 wait_on_list(in
->waitfor_caps
);
3290 else if (waitfor_commit
)
3291 wait_on_list(in
->waitfor_commit
);
3295 int Client::get_caps_used(Inode
*in
)
3297 unsigned used
= in
->caps_used();
3298 if (!(used
& CEPH_CAP_FILE_CACHE
) &&
3299 !objectcacher
->set_is_empty(&in
->oset
))
3300 used
|= CEPH_CAP_FILE_CACHE
;
3304 void Client::cap_delay_requeue(Inode
*in
)
3306 ldout(cct
, 10) << __func__
<< " on " << *in
<< dendl
;
3307 in
->hold_caps_until
= ceph_clock_now();
3308 in
->hold_caps_until
+= cct
->_conf
->client_caps_release_delay
;
3309 delayed_list
.push_back(&in
->delay_cap_item
);
3312 void Client::send_cap(Inode
*in
, MetaSession
*session
, Cap
*cap
,
3313 int flags
, int used
, int want
, int retain
,
3314 int flush
, ceph_tid_t flush_tid
)
3316 int held
= cap
->issued
| cap
->implemented
;
3317 int revoking
= cap
->implemented
& ~cap
->issued
;
3318 retain
&= ~revoking
;
3319 int dropping
= cap
->issued
& ~retain
;
3320 int op
= CEPH_CAP_OP_UPDATE
;
3322 ldout(cct
, 10) << __func__
<< " " << *in
3323 << " mds." << session
->mds_num
<< " seq " << cap
->seq
3324 << " used " << ccap_string(used
)
3325 << " want " << ccap_string(want
)
3326 << " flush " << ccap_string(flush
)
3327 << " retain " << ccap_string(retain
)
3328 << " held "<< ccap_string(held
)
3329 << " revoking " << ccap_string(revoking
)
3330 << " dropping " << ccap_string(dropping
)
3333 if (cct
->_conf
->client_inject_release_failure
&& revoking
) {
3334 const int would_have_issued
= cap
->issued
& retain
;
3335 const int would_have_implemented
= cap
->implemented
& (cap
->issued
| used
);
3337 // - tell the server we think issued is whatever they issued plus whatever we implemented
3338 // - leave what we have implemented in place
3339 ldout(cct
, 20) << __func__
<< " injecting failure to release caps" << dendl
;
3340 cap
->issued
= cap
->issued
| cap
->implemented
;
3342 // Make an exception for revoking xattr caps: we are injecting
3343 // failure to release other caps, but allow xattr because client
3344 // will block on xattr ops if it can't release these to MDS (#9800)
3345 const int xattr_mask
= CEPH_CAP_XATTR_SHARED
| CEPH_CAP_XATTR_EXCL
;
3346 cap
->issued
^= xattr_mask
& revoking
;
3347 cap
->implemented
^= xattr_mask
& revoking
;
3349 ldout(cct
, 20) << __func__
<< " issued " << ccap_string(cap
->issued
) << " vs " << ccap_string(would_have_issued
) << dendl
;
3350 ldout(cct
, 20) << __func__
<< " implemented " << ccap_string(cap
->implemented
) << " vs " << ccap_string(would_have_implemented
) << dendl
;
3353 cap
->issued
&= retain
;
3354 cap
->implemented
&= cap
->issued
| used
;
3357 snapid_t follows
= 0;
3360 follows
= in
->snaprealm
->get_snap_context().seq
;
3362 auto m
= MClientCaps::create(op
,
3365 cap
->cap_id
, cap
->seq
,
3371 m
->caller_uid
= in
->cap_dirtier_uid
;
3372 m
->caller_gid
= in
->cap_dirtier_gid
;
3374 m
->head
.issue_seq
= cap
->issue_seq
;
3375 m
->set_tid(flush_tid
);
3377 m
->head
.uid
= in
->uid
;
3378 m
->head
.gid
= in
->gid
;
3379 m
->head
.mode
= in
->mode
;
3381 m
->head
.nlink
= in
->nlink
;
3383 if (flush
& CEPH_CAP_XATTR_EXCL
) {
3384 encode(in
->xattrs
, m
->xattrbl
);
3385 m
->head
.xattr_version
= in
->xattr_version
;
3389 m
->max_size
= in
->max_size
;
3390 m
->truncate_seq
= in
->truncate_seq
;
3391 m
->truncate_size
= in
->truncate_size
;
3392 m
->mtime
= in
->mtime
;
3393 m
->atime
= in
->atime
;
3394 m
->ctime
= in
->ctime
;
3395 m
->btime
= in
->btime
;
3396 m
->time_warp_seq
= in
->time_warp_seq
;
3397 m
->change_attr
= in
->change_attr
;
3399 if (!(flags
& MClientCaps::FLAG_PENDING_CAPSNAP
) &&
3400 !in
->cap_snaps
.empty() &&
3401 in
->cap_snaps
.rbegin()->second
.flush_tid
== 0)
3402 flags
|= MClientCaps::FLAG_PENDING_CAPSNAP
;
3405 if (flush
& CEPH_CAP_FILE_WR
) {
3406 m
->inline_version
= in
->inline_version
;
3407 m
->inline_data
= in
->inline_data
;
3410 in
->reported_size
= in
->size
;
3411 m
->set_snap_follows(follows
);
3413 if (cap
== in
->auth_cap
) {
3414 m
->set_max_size(in
->wanted_max_size
);
3415 in
->requested_max_size
= in
->wanted_max_size
;
3416 ldout(cct
, 15) << "auth cap, setting max_size = " << in
->requested_max_size
<< dendl
;
3419 if (!session
->flushing_caps_tids
.empty())
3420 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
3422 session
->con
->send_message2(std::move(m
));
3425 static bool is_max_size_approaching(Inode
*in
)
3427 /* mds will adjust max size according to the reported size */
3428 if (in
->flushing_caps
& CEPH_CAP_FILE_WR
)
3430 if (in
->size
>= in
->max_size
)
3432 /* half of previous max_size increment has been used */
3433 if (in
->max_size
> in
->reported_size
&&
3434 (in
->size
<< 1) >= in
->max_size
+ in
->reported_size
)
3439 static int adjust_caps_used_for_lazyio(int used
, int issued
, int implemented
)
3441 if (!(used
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
)))
3443 if (!(implemented
& CEPH_CAP_FILE_LAZYIO
))
3446 if (issued
& CEPH_CAP_FILE_LAZYIO
) {
3447 if (!(issued
& CEPH_CAP_FILE_CACHE
)) {
3448 used
&= ~CEPH_CAP_FILE_CACHE
;
3449 used
|= CEPH_CAP_FILE_LAZYIO
;
3451 if (!(issued
& CEPH_CAP_FILE_BUFFER
)) {
3452 used
&= ~CEPH_CAP_FILE_BUFFER
;
3453 used
|= CEPH_CAP_FILE_LAZYIO
;
3456 if (!(implemented
& CEPH_CAP_FILE_CACHE
)) {
3457 used
&= ~CEPH_CAP_FILE_CACHE
;
3458 used
|= CEPH_CAP_FILE_LAZYIO
;
3460 if (!(implemented
& CEPH_CAP_FILE_BUFFER
)) {
3461 used
&= ~CEPH_CAP_FILE_BUFFER
;
3462 used
|= CEPH_CAP_FILE_LAZYIO
;
3471 * Examine currently used and wanted versus held caps. Release, flush or ack
3472 * revoked caps to the MDS as appropriate.
3474 * @param in the inode to check
3475 * @param flags flags to apply to cap check
3477 void Client::check_caps(Inode
*in
, unsigned flags
)
3479 unsigned wanted
= in
->caps_wanted();
3480 unsigned used
= get_caps_used(in
);
3484 int issued
= in
->caps_issued(&implemented
);
3485 int revoking
= implemented
& ~issued
;
3487 int orig_used
= used
;
3488 used
= adjust_caps_used_for_lazyio(used
, issued
, implemented
);
3490 int retain
= wanted
| used
| CEPH_CAP_PIN
;
3491 if (!unmounting
&& in
->nlink
> 0) {
3493 retain
|= CEPH_CAP_ANY
;
3494 } else if (in
->is_dir() &&
3495 (issued
& CEPH_CAP_FILE_SHARED
) &&
3496 (in
->flags
& I_COMPLETE
)) {
3497 // we do this here because we don't want to drop to Fs (and then
3498 // drop the Fs if we do a create!) if that alone makes us send lookups
3499 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3500 wanted
= CEPH_CAP_ANY_SHARED
| CEPH_CAP_FILE_EXCL
;
3503 retain
|= CEPH_CAP_ANY_SHARED
;
3504 // keep RD only if we didn't have the file open RW,
3505 // because then the mds would revoke it anyway to
3506 // journal max_size=0.
3507 if (in
->max_size
== 0)
3508 retain
|= CEPH_CAP_ANY_RD
;
3512 ldout(cct
, 10) << __func__
<< " on " << *in
3513 << " wanted " << ccap_string(wanted
)
3514 << " used " << ccap_string(used
)
3515 << " issued " << ccap_string(issued
)
3516 << " revoking " << ccap_string(revoking
)
3517 << " flags=" << flags
3520 if (in
->snapid
!= CEPH_NOSNAP
)
3521 return; //snap caps last forever, can't write
3523 if (in
->caps
.empty())
3524 return; // guard if at end of func
3526 if (!(orig_used
& CEPH_CAP_FILE_BUFFER
) &&
3527 (revoking
& used
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
))) {
3529 used
&= ~(CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
);
3533 for (auto &p
: in
->caps
) {
3534 mds_rank_t mds
= p
.first
;
3535 Cap
&cap
= p
.second
;
3537 MetaSession
*session
= &mds_sessions
.at(mds
);
3540 if (in
->auth_cap
&& &cap
!= in
->auth_cap
)
3541 cap_used
&= ~in
->auth_cap
->issued
;
3543 revoking
= cap
.implemented
& ~cap
.issued
;
3545 ldout(cct
, 10) << " cap mds." << mds
3546 << " issued " << ccap_string(cap
.issued
)
3547 << " implemented " << ccap_string(cap
.implemented
)
3548 << " revoking " << ccap_string(revoking
) << dendl
;
3550 if (in
->wanted_max_size
> in
->max_size
&&
3551 in
->wanted_max_size
> in
->requested_max_size
&&
3552 &cap
== in
->auth_cap
)
3555 /* approaching file_max? */
3556 if ((cap
.issued
& CEPH_CAP_FILE_WR
) &&
3557 &cap
== in
->auth_cap
&&
3558 is_max_size_approaching(in
)) {
3559 ldout(cct
, 10) << "size " << in
->size
<< " approaching max_size " << in
->max_size
3560 << ", reported " << in
->reported_size
<< dendl
;
3564 /* completed revocation? */
3565 if (revoking
&& (revoking
& cap_used
) == 0) {
3566 ldout(cct
, 10) << "completed revocation of " << ccap_string(cap
.implemented
& ~cap
.issued
) << dendl
;
3570 /* want more caps from mds? */
3571 if (wanted
& ~(cap
.wanted
| cap
.issued
))
3574 if (!revoking
&& unmounting
&& (cap_used
== 0))
3577 if ((cap
.issued
& ~retain
) == 0 && // and we don't have anything we wouldn't like
3578 !in
->dirty_caps
) // and we have no dirty caps
3581 if (!(flags
& CHECK_CAPS_NODELAY
)) {
3582 ldout(cct
, 10) << "delaying cap release" << dendl
;
3583 cap_delay_requeue(in
);
3588 if (&cap
== in
->auth_cap
) {
3589 if (in
->flags
& I_KICK_FLUSH
) {
3590 ldout(cct
, 20) << " reflushing caps (check_caps) on " << *in
3591 << " to mds." << mds
<< dendl
;
3592 kick_flushing_caps(in
, session
);
3594 if (!in
->cap_snaps
.empty() &&
3595 in
->cap_snaps
.rbegin()->second
.flush_tid
== 0)
3600 ceph_tid_t flush_tid
;
3601 if (in
->auth_cap
== &cap
&& in
->dirty_caps
) {
3602 flushing
= mark_caps_flushing(in
, &flush_tid
);
3608 int msg_flags
= (flags
& CHECK_CAPS_SYNCHRONOUS
) ? MClientCaps::FLAG_SYNC
: 0;
3609 send_cap(in
, session
, &cap
, msg_flags
, cap_used
, wanted
, retain
,
3610 flushing
, flush_tid
);
3615 void Client::queue_cap_snap(Inode
*in
, SnapContext
& old_snapc
)
3617 int used
= get_caps_used(in
);
3618 int dirty
= in
->caps_dirty();
3619 ldout(cct
, 10) << __func__
<< " " << *in
<< " snapc " << old_snapc
<< " used " << ccap_string(used
) << dendl
;
3621 if (in
->cap_snaps
.size() &&
3622 in
->cap_snaps
.rbegin()->second
.writing
) {
3623 ldout(cct
, 10) << __func__
<< " already have pending cap_snap on " << *in
<< dendl
;
3625 } else if (in
->caps_dirty() ||
3626 (used
& CEPH_CAP_FILE_WR
) ||
3627 (dirty
& CEPH_CAP_ANY_WR
)) {
3628 const auto &capsnapem
= in
->cap_snaps
.emplace(std::piecewise_construct
, std::make_tuple(old_snapc
.seq
), std::make_tuple(in
));
3629 ceph_assert(capsnapem
.second
); /* element inserted */
3630 CapSnap
&capsnap
= capsnapem
.first
->second
;
3631 capsnap
.context
= old_snapc
;
3632 capsnap
.issued
= in
->caps_issued();
3633 capsnap
.dirty
= in
->caps_dirty();
3635 capsnap
.dirty_data
= (used
& CEPH_CAP_FILE_BUFFER
);
3637 capsnap
.uid
= in
->uid
;
3638 capsnap
.gid
= in
->gid
;
3639 capsnap
.mode
= in
->mode
;
3640 capsnap
.btime
= in
->btime
;
3641 capsnap
.xattrs
= in
->xattrs
;
3642 capsnap
.xattr_version
= in
->xattr_version
;
3643 capsnap
.cap_dirtier_uid
= in
->cap_dirtier_uid
;
3644 capsnap
.cap_dirtier_gid
= in
->cap_dirtier_gid
;
3646 if (used
& CEPH_CAP_FILE_WR
) {
3647 ldout(cct
, 10) << __func__
<< " WR used on " << *in
<< dendl
;
3648 capsnap
.writing
= 1;
3650 finish_cap_snap(in
, capsnap
, used
);
3653 ldout(cct
, 10) << __func__
<< " not dirty|writing on " << *in
<< dendl
;
3657 void Client::finish_cap_snap(Inode
*in
, CapSnap
&capsnap
, int used
)
3659 ldout(cct
, 10) << __func__
<< " " << *in
<< " capsnap " << (void *)&capsnap
<< " used " << ccap_string(used
) << dendl
;
3660 capsnap
.size
= in
->size
;
3661 capsnap
.mtime
= in
->mtime
;
3662 capsnap
.atime
= in
->atime
;
3663 capsnap
.ctime
= in
->ctime
;
3664 capsnap
.time_warp_seq
= in
->time_warp_seq
;
3665 capsnap
.change_attr
= in
->change_attr
;
3666 capsnap
.dirty
|= in
->caps_dirty();
3668 /* Only reset it if it wasn't set before */
3669 if (capsnap
.cap_dirtier_uid
== -1) {
3670 capsnap
.cap_dirtier_uid
= in
->cap_dirtier_uid
;
3671 capsnap
.cap_dirtier_gid
= in
->cap_dirtier_gid
;
3674 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
3675 capsnap
.inline_data
= in
->inline_data
;
3676 capsnap
.inline_version
= in
->inline_version
;
3679 if (used
& CEPH_CAP_FILE_BUFFER
) {
3680 ldout(cct
, 10) << __func__
<< " " << *in
<< " cap_snap " << &capsnap
<< " used " << used
3681 << " WRBUFFER, delaying" << dendl
;
3683 capsnap
.dirty_data
= 0;
3688 void Client::_flushed_cap_snap(Inode
*in
, snapid_t seq
)
3690 ldout(cct
, 10) << __func__
<< " seq " << seq
<< " on " << *in
<< dendl
;
3691 in
->cap_snaps
.at(seq
).dirty_data
= 0;
3695 void Client::send_flush_snap(Inode
*in
, MetaSession
*session
,
3696 snapid_t follows
, CapSnap
& capsnap
)
3698 auto m
= MClientCaps::create(CEPH_CAP_OP_FLUSHSNAP
,
3699 in
->ino
, in
->snaprealm
->ino
, 0,
3700 in
->auth_cap
->mseq
, cap_epoch_barrier
);
3701 m
->caller_uid
= capsnap
.cap_dirtier_uid
;
3702 m
->caller_gid
= capsnap
.cap_dirtier_gid
;
3704 m
->set_client_tid(capsnap
.flush_tid
);
3705 m
->head
.snap_follows
= follows
;
3707 m
->head
.caps
= capsnap
.issued
;
3708 m
->head
.dirty
= capsnap
.dirty
;
3710 m
->head
.uid
= capsnap
.uid
;
3711 m
->head
.gid
= capsnap
.gid
;
3712 m
->head
.mode
= capsnap
.mode
;
3713 m
->btime
= capsnap
.btime
;
3715 m
->size
= capsnap
.size
;
3717 m
->head
.xattr_version
= capsnap
.xattr_version
;
3718 encode(capsnap
.xattrs
, m
->xattrbl
);
3720 m
->ctime
= capsnap
.ctime
;
3721 m
->btime
= capsnap
.btime
;
3722 m
->mtime
= capsnap
.mtime
;
3723 m
->atime
= capsnap
.atime
;
3724 m
->time_warp_seq
= capsnap
.time_warp_seq
;
3725 m
->change_attr
= capsnap
.change_attr
;
3727 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
3728 m
->inline_version
= in
->inline_version
;
3729 m
->inline_data
= in
->inline_data
;
3732 ceph_assert(!session
->flushing_caps_tids
.empty());
3733 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
3735 session
->con
->send_message2(std::move(m
));
3738 void Client::flush_snaps(Inode
*in
)
3740 ldout(cct
, 10) << "flush_snaps on " << *in
<< dendl
;
3741 ceph_assert(in
->cap_snaps
.size());
3744 ceph_assert(in
->auth_cap
);
3745 MetaSession
*session
= in
->auth_cap
->session
;
3747 for (auto &p
: in
->cap_snaps
) {
3748 CapSnap
&capsnap
= p
.second
;
3749 // only do new flush
3750 if (capsnap
.flush_tid
> 0)
3753 ldout(cct
, 10) << "flush_snaps mds." << session
->mds_num
3754 << " follows " << p
.first
3755 << " size " << capsnap
.size
3756 << " mtime " << capsnap
.mtime
3757 << " dirty_data=" << capsnap
.dirty_data
3758 << " writing=" << capsnap
.writing
3759 << " on " << *in
<< dendl
;
3760 if (capsnap
.dirty_data
|| capsnap
.writing
)
3763 capsnap
.flush_tid
= ++last_flush_tid
;
3764 session
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
3765 in
->flushing_cap_tids
[capsnap
.flush_tid
] = 0;
3766 if (!in
->flushing_cap_item
.is_on_list())
3767 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
3769 send_flush_snap(in
, session
, p
.first
, capsnap
);
3773 void Client::wait_on_list(list
<Cond
*>& ls
)
3776 ls
.push_back(&cond
);
3777 cond
.Wait(client_lock
);
3781 void Client::signal_cond_list(list
<Cond
*>& ls
)
3783 for (list
<Cond
*>::iterator it
= ls
.begin(); it
!= ls
.end(); ++it
)
3787 void Client::wait_on_context_list(list
<Context
*>& ls
)
3792 ls
.push_back(new C_Cond(&cond
, &done
, &r
));
3794 cond
.Wait(client_lock
);
3797 void Client::signal_context_list(list
<Context
*>& ls
)
3799 while (!ls
.empty()) {
3800 ls
.front()->complete(0);
3805 void Client::wake_up_session_caps(MetaSession
*s
, bool reconnect
)
3807 for (const auto &cap
: s
->caps
) {
3808 auto &in
= cap
->inode
;
3810 in
.requested_max_size
= 0;
3811 in
.wanted_max_size
= 0;
3813 if (cap
->gen
< s
->cap_gen
) {
3814 // mds did not re-issue stale cap.
3815 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
3816 // make sure mds knows what we want.
3817 if (in
.caps_file_wanted() & ~cap
->wanted
)
3818 in
.flags
|= I_CAP_DROPPED
;
3821 signal_cond_list(in
.waitfor_caps
);
3826 // flush dirty data (from objectcache)
3828 class C_Client_CacheInvalidate
: public Context
{
3832 int64_t offset
, length
;
3834 C_Client_CacheInvalidate(Client
*c
, Inode
*in
, int64_t off
, int64_t len
) :
3835 client(c
), offset(off
), length(len
) {
3836 if (client
->use_faked_inos())
3837 ino
= vinodeno_t(in
->faked_ino
, CEPH_NOSNAP
);
3841 void finish(int r
) override
{
3842 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
3843 ceph_assert(!client
->client_lock
.is_locked_by_me());
3844 client
->_async_invalidate(ino
, offset
, length
);
3848 void Client::_async_invalidate(vinodeno_t ino
, int64_t off
, int64_t len
)
3852 ldout(cct
, 10) << __func__
<< " " << ino
<< " " << off
<< "~" << len
<< dendl
;
3853 ino_invalidate_cb(callback_handle
, ino
, off
, len
);
3856 void Client::_schedule_invalidate_callback(Inode
*in
, int64_t off
, int64_t len
) {
3858 if (ino_invalidate_cb
)
3859 // we queue the invalidate, which calls the callback and decrements the ref
3860 async_ino_invalidator
.queue(new C_Client_CacheInvalidate(this, in
, off
, len
));
3863 void Client::_invalidate_inode_cache(Inode
*in
)
3865 ldout(cct
, 10) << __func__
<< " " << *in
<< dendl
;
3867 // invalidate our userspace inode cache
3868 if (cct
->_conf
->client_oc
) {
3869 objectcacher
->release_set(&in
->oset
);
3870 if (!objectcacher
->set_is_empty(&in
->oset
))
3871 lderr(cct
) << "failed to invalidate cache for " << *in
<< dendl
;
3874 _schedule_invalidate_callback(in
, 0, 0);
3877 void Client::_invalidate_inode_cache(Inode
*in
, int64_t off
, int64_t len
)
3879 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
3881 // invalidate our userspace inode cache
3882 if (cct
->_conf
->client_oc
) {
3883 vector
<ObjectExtent
> ls
;
3884 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, len
, in
->truncate_size
, ls
);
3885 objectcacher
->discard_writeback(&in
->oset
, ls
, nullptr);
3888 _schedule_invalidate_callback(in
, off
, len
);
3891 bool Client::_release(Inode
*in
)
3893 ldout(cct
, 20) << "_release " << *in
<< dendl
;
3894 if (in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
3895 _invalidate_inode_cache(in
);
3901 bool Client::_flush(Inode
*in
, Context
*onfinish
)
3903 ldout(cct
, 10) << "_flush " << *in
<< dendl
;
3905 if (!in
->oset
.dirty_or_tx
) {
3906 ldout(cct
, 10) << " nothing to flush" << dendl
;
3907 onfinish
->complete(0);
3911 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
3912 ldout(cct
, 8) << __func__
<< ": FULL, purging for ENOSPC" << dendl
;
3913 objectcacher
->purge_set(&in
->oset
);
3915 onfinish
->complete(-ENOSPC
);
3920 return objectcacher
->flush_set(&in
->oset
, onfinish
);
3923 void Client::_flush_range(Inode
*in
, int64_t offset
, uint64_t size
)
3925 ceph_assert(client_lock
.is_locked());
3926 if (!in
->oset
.dirty_or_tx
) {
3927 ldout(cct
, 10) << " nothing to flush" << dendl
;
3931 C_SaferCond
onflush("Client::_flush_range flock");
3932 bool ret
= objectcacher
->file_flush(&in
->oset
, &in
->layout
, in
->snaprealm
->get_snap_context(),
3933 offset
, size
, &onflush
);
3936 client_lock
.Unlock();
3942 void Client::flush_set_callback(ObjectCacher::ObjectSet
*oset
)
3944 // std::lock_guard l(client_lock);
3945 ceph_assert(client_lock
.is_locked()); // will be called via dispatch() -> objecter -> ...
3946 Inode
*in
= static_cast<Inode
*>(oset
->parent
);
3951 void Client::_flushed(Inode
*in
)
3953 ldout(cct
, 10) << "_flushed " << *in
<< dendl
;
3955 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
3960 // checks common to add_update_cap, handle_cap_grant
3961 void Client::check_cap_issue(Inode
*in
, unsigned issued
)
3963 unsigned had
= in
->caps_issued();
3965 if ((issued
& CEPH_CAP_FILE_CACHE
) &&
3966 !(had
& CEPH_CAP_FILE_CACHE
))
3969 if ((issued
& CEPH_CAP_FILE_SHARED
) &&
3970 !(had
& CEPH_CAP_FILE_SHARED
)) {
3974 clear_dir_complete_and_ordered(in
, true);
3978 void Client::add_update_cap(Inode
*in
, MetaSession
*mds_session
, uint64_t cap_id
,
3979 unsigned issued
, unsigned wanted
, unsigned seq
, unsigned mseq
,
3980 inodeno_t realm
, int flags
, const UserPerm
& cap_perms
)
3982 if (!in
->is_any_caps()) {
3983 ceph_assert(in
->snaprealm
== 0);
3984 in
->snaprealm
= get_snap_realm(realm
);
3985 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
3986 ldout(cct
, 15) << __func__
<< " first one, opened snaprealm " << in
->snaprealm
<< dendl
;
3988 ceph_assert(in
->snaprealm
);
3989 if ((flags
& CEPH_CAP_FLAG_AUTH
) &&
3990 realm
!= inodeno_t(-1) && in
->snaprealm
->ino
!= realm
) {
3991 in
->snaprealm_item
.remove_myself();
3992 auto oldrealm
= in
->snaprealm
;
3993 in
->snaprealm
= get_snap_realm(realm
);
3994 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
3995 put_snap_realm(oldrealm
);
3999 mds_rank_t mds
= mds_session
->mds_num
;
4000 const auto &capem
= in
->caps
.emplace(std::piecewise_construct
, std::forward_as_tuple(mds
), std::forward_as_tuple(*in
, mds_session
));
4001 Cap
&cap
= capem
.first
->second
;
4002 if (!capem
.second
) {
4003 if (cap
.gen
< mds_session
->cap_gen
)
4004 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
4007 * auth mds of the inode changed. we received the cap export
4008 * message, but still haven't received the cap import message.
4009 * handle_cap_export() updated the new auth MDS' cap.
4011 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4012 * a message that was send before the cap import message. So
4013 * don't remove caps.
4015 if (ceph_seq_cmp(seq
, cap
.seq
) <= 0) {
4016 if (&cap
!= in
->auth_cap
)
4017 ldout(cct
, 0) << "WARNING: " << "inode " << *in
<< " caps on mds." << mds
<< " != auth_cap." << dendl
;
4019 ceph_assert(cap
.cap_id
== cap_id
);
4022 issued
|= cap
.issued
;
4023 flags
|= CEPH_CAP_FLAG_AUTH
;
4027 check_cap_issue(in
, issued
);
4029 if (flags
& CEPH_CAP_FLAG_AUTH
) {
4030 if (in
->auth_cap
!= &cap
&&
4031 (!in
->auth_cap
|| ceph_seq_cmp(in
->auth_cap
->mseq
, mseq
) < 0)) {
4032 if (in
->auth_cap
&& in
->flushing_cap_item
.is_on_list()) {
4033 ldout(cct
, 10) << __func__
<< " changing auth cap: "
4034 << "add myself to new auth MDS' flushing caps list" << dendl
;
4035 adjust_session_flushing_caps(in
, in
->auth_cap
->session
, mds_session
);
4037 in
->auth_cap
= &cap
;
4041 unsigned old_caps
= cap
.issued
;
4042 cap
.cap_id
= cap_id
;
4043 cap
.issued
= issued
;
4044 cap
.implemented
|= issued
;
4045 if (ceph_seq_cmp(mseq
, cap
.mseq
) > 0)
4046 cap
.wanted
= wanted
;
4048 cap
.wanted
|= wanted
;
4050 cap
.issue_seq
= seq
;
4052 cap
.gen
= mds_session
->cap_gen
;
4053 cap
.latest_perms
= cap_perms
;
4054 ldout(cct
, 10) << __func__
<< " issued " << ccap_string(old_caps
) << " -> " << ccap_string(cap
.issued
)
4055 << " from mds." << mds
4059 if ((issued
& ~old_caps
) && in
->auth_cap
== &cap
) {
4060 // non-auth MDS is revoking the newly grant caps ?
4061 for (auto &p
: in
->caps
) {
4062 if (&p
.second
== &cap
)
4064 if (p
.second
.implemented
& ~p
.second
.issued
& issued
) {
4065 check_caps(in
, CHECK_CAPS_NODELAY
);
4071 if (issued
& ~old_caps
)
4072 signal_cond_list(in
->waitfor_caps
);
4075 void Client::remove_cap(Cap
*cap
, bool queue_release
)
4077 auto &in
= cap
->inode
;
4078 MetaSession
*session
= cap
->session
;
4079 mds_rank_t mds
= cap
->session
->mds_num
;
4081 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " on " << in
<< dendl
;
4083 if (queue_release
) {
4084 session
->enqueue_cap_release(
4092 if (in
.auth_cap
== cap
) {
4093 if (in
.flushing_cap_item
.is_on_list()) {
4094 ldout(cct
, 10) << " removing myself from flushing_cap list" << dendl
;
4095 in
.flushing_cap_item
.remove_myself();
4099 size_t n
= in
.caps
.erase(mds
);
4100 ceph_assert(n
== 1);
4103 if (!in
.is_any_caps()) {
4104 ldout(cct
, 15) << __func__
<< " last one, closing snaprealm " << in
.snaprealm
<< dendl
;
4105 in
.snaprealm_item
.remove_myself();
4106 put_snap_realm(in
.snaprealm
);
4111 void Client::remove_all_caps(Inode
*in
)
4113 while (!in
->caps
.empty())
4114 remove_cap(&in
->caps
.begin()->second
, true);
4117 void Client::remove_session_caps(MetaSession
*s
)
4119 ldout(cct
, 10) << __func__
<< " mds." << s
->mds_num
<< dendl
;
4121 while (s
->caps
.size()) {
4122 Cap
*cap
= *s
->caps
.begin();
4123 InodeRef
in(&cap
->inode
);
4124 bool dirty_caps
= false;
4125 if (in
->auth_cap
== cap
) {
4126 dirty_caps
= in
->dirty_caps
| in
->flushing_caps
;
4127 in
->wanted_max_size
= 0;
4128 in
->requested_max_size
= 0;
4130 if (cap
->wanted
| cap
->issued
)
4131 in
->flags
|= I_CAP_DROPPED
;
4132 remove_cap(cap
, false);
4133 in
->cap_snaps
.clear();
4135 lderr(cct
) << __func__
<< " still has dirty|flushing caps on " << *in
<< dendl
;
4136 if (in
->flushing_caps
) {
4137 num_flushing_caps
--;
4138 in
->flushing_cap_tids
.clear();
4140 in
->flushing_caps
= 0;
4141 in
->mark_caps_clean();
4142 put_inode(in
.get());
4144 signal_cond_list(in
->waitfor_caps
);
4146 s
->flushing_caps_tids
.clear();
4150 int Client::_do_remount(bool retry_on_error
)
4152 uint64_t max_retries
= g_conf().get_val
<uint64_t>("mds_max_retries_on_remount_failure");
4155 int r
= remount_cb(callback_handle
);
4157 retries_on_invalidate
= 0;
4160 client_t whoami
= get_nodeid();
4163 "failed to remount (to trim kernel dentries): "
4164 "errno = " << e
<< " (" << strerror(e
) << ")" << dendl
;
4167 "failed to remount (to trim kernel dentries): "
4168 "return code = " << r
<< dendl
;
4171 (cct
->_conf
.get_val
<bool>("client_die_on_failed_remount") ||
4172 cct
->_conf
.get_val
<bool>("client_die_on_failed_dentry_invalidate")) &&
4173 !(retry_on_error
&& (++retries_on_invalidate
< max_retries
));
4174 if (should_abort
&& !unmounting
) {
4175 lderr(cct
) << "failed to remount for kernel dentry trimming; quitting!" << dendl
;
4182 class C_Client_Remount
: public Context
{
4186 explicit C_Client_Remount(Client
*c
) : client(c
) {}
4187 void finish(int r
) override
{
4188 ceph_assert(r
== 0);
4189 client
->_do_remount(true);
4193 void Client::_invalidate_kernel_dcache()
4197 if (can_invalidate_dentries
) {
4198 if (dentry_invalidate_cb
&& root
->dir
) {
4199 for (ceph::unordered_map
<string
, Dentry
*>::iterator p
= root
->dir
->dentries
.begin();
4200 p
!= root
->dir
->dentries
.end();
4202 if (p
->second
->inode
)
4203 _schedule_invalidate_dentry_callback(p
->second
, false);
4206 } else if (remount_cb
) {
4208 // when remounting a file system, linux kernel trims all unused dentries in the fs
4209 remount_finisher
.queue(new C_Client_Remount(this));
4213 void Client::_trim_negative_child_dentries(InodeRef
& in
)
4219 if (dir
&& dir
->dentries
.size() == dir
->num_null_dentries
) {
4220 for (auto p
= dir
->dentries
.begin(); p
!= dir
->dentries
.end(); ) {
4221 Dentry
*dn
= p
->second
;
4223 ceph_assert(!dn
->inode
);
4224 if (dn
->lru_is_expireable())
4225 unlink(dn
, true, false); // keep dir, drop dentry
4227 if (dir
->dentries
.empty()) {
4232 if (in
->flags
& I_SNAPDIR_OPEN
) {
4233 InodeRef snapdir
= open_snapdir(in
.get());
4234 _trim_negative_child_dentries(snapdir
);
4238 void Client::trim_caps(MetaSession
*s
, uint64_t max
)
4240 mds_rank_t mds
= s
->mds_num
;
4241 size_t caps_size
= s
->caps
.size();
4242 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " max " << max
4243 << " caps " << caps_size
<< dendl
;
4245 uint64_t trimmed
= 0;
4246 auto p
= s
->caps
.begin();
4247 std::set
<Dentry
*> to_trim
; /* this avoids caps other than the one we're
4248 * looking at from getting deleted during traversal. */
4249 while ((caps_size
- trimmed
) > max
&& !p
.end()) {
4251 InodeRef
in(&cap
->inode
);
4253 // Increment p early because it will be invalidated if cap
4254 // is deleted inside remove_cap
4257 if (in
->caps
.size() > 1 && cap
!= in
->auth_cap
) {
4258 int mine
= cap
->issued
| cap
->implemented
;
4259 int oissued
= in
->auth_cap
? in
->auth_cap
->issued
: 0;
4260 // disposable non-auth cap
4261 if (!(get_caps_used(in
.get()) & ~oissued
& mine
)) {
4262 ldout(cct
, 20) << " removing unused, unneeded non-auth cap on " << *in
<< dendl
;
4263 cap
= (remove_cap(cap
, true), nullptr);
4267 ldout(cct
, 20) << " trying to trim dentries for " << *in
<< dendl
;
4268 _trim_negative_child_dentries(in
);
4270 auto q
= in
->dentries
.begin();
4271 while (q
!= in
->dentries
.end()) {
4274 if (dn
->lru_is_expireable()) {
4275 if (can_invalidate_dentries
&&
4276 dn
->dir
->parent_inode
->ino
== MDS_INO_ROOT
) {
4277 // Only issue one of these per DN for inodes in root: handle
4278 // others more efficiently by calling for root-child DNs at
4279 // the end of this function.
4280 _schedule_invalidate_dentry_callback(dn
, true);
4282 ldout(cct
, 20) << " queueing dentry for trimming: " << dn
->name
<< dendl
;
4285 ldout(cct
, 20) << " not expirable: " << dn
->name
<< dendl
;
4289 if (all
&& in
->ino
!= MDS_INO_ROOT
) {
4290 ldout(cct
, 20) << __func__
<< " counting as trimmed: " << *in
<< dendl
;
4295 ldout(cct
, 20) << " trimming queued dentries: " << dendl
;
4296 for (const auto &dn
: to_trim
) {
4301 caps_size
= s
->caps
.size();
4302 if (caps_size
> (size_t)max
)
4303 _invalidate_kernel_dcache();
4306 void Client::force_session_readonly(MetaSession
*s
)
4309 for (xlist
<Cap
*>::iterator p
= s
->caps
.begin(); !p
.end(); ++p
) {
4310 auto &in
= (*p
)->inode
;
4311 if (in
.caps_wanted() & CEPH_CAP_FILE_WR
)
4312 signal_cond_list(in
.waitfor_caps
);
4316 int Client::mark_caps_flushing(Inode
*in
, ceph_tid_t
* ptid
)
4318 MetaSession
*session
= in
->auth_cap
->session
;
4320 int flushing
= in
->dirty_caps
;
4321 ceph_assert(flushing
);
4323 ceph_tid_t flush_tid
= ++last_flush_tid
;
4324 in
->flushing_cap_tids
[flush_tid
] = flushing
;
4326 if (!in
->flushing_caps
) {
4327 ldout(cct
, 10) << __func__
<< " " << ccap_string(flushing
) << " " << *in
<< dendl
;
4328 num_flushing_caps
++;
4330 ldout(cct
, 10) << __func__
<< " (more) " << ccap_string(flushing
) << " " << *in
<< dendl
;
4333 in
->flushing_caps
|= flushing
;
4334 in
->mark_caps_clean();
4336 if (!in
->flushing_cap_item
.is_on_list())
4337 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4338 session
->flushing_caps_tids
.insert(flush_tid
);
4344 void Client::adjust_session_flushing_caps(Inode
*in
, MetaSession
*old_s
, MetaSession
*new_s
)
4346 for (auto &p
: in
->cap_snaps
) {
4347 CapSnap
&capsnap
= p
.second
;
4348 if (capsnap
.flush_tid
> 0) {
4349 old_s
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
4350 new_s
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
4353 for (map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4354 it
!= in
->flushing_cap_tids
.end();
4356 old_s
->flushing_caps_tids
.erase(it
->first
);
4357 new_s
->flushing_caps_tids
.insert(it
->first
);
4359 new_s
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4363 * Flush all caps back to the MDS. Because the callers generally wait on the
4364 * result of this function (syncfs and umount cases), we set
4365 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4367 void Client::flush_caps_sync()
4369 ldout(cct
, 10) << __func__
<< dendl
;
4370 xlist
<Inode
*>::iterator p
= delayed_list
.begin();
4372 unsigned flags
= CHECK_CAPS_NODELAY
;
4376 delayed_list
.pop_front();
4377 if (p
.end() && dirty_list
.empty())
4378 flags
|= CHECK_CAPS_SYNCHRONOUS
;
4379 check_caps(in
, flags
);
4383 p
= dirty_list
.begin();
4385 unsigned flags
= CHECK_CAPS_NODELAY
;
4390 flags
|= CHECK_CAPS_SYNCHRONOUS
;
4391 check_caps(in
, flags
);
4395 void Client::wait_sync_caps(Inode
*in
, ceph_tid_t want
)
4397 while (in
->flushing_caps
) {
4398 map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4399 ceph_assert(it
!= in
->flushing_cap_tids
.end());
4400 if (it
->first
> want
)
4402 ldout(cct
, 10) << __func__
<< " on " << *in
<< " flushing "
4403 << ccap_string(it
->second
) << " want " << want
4404 << " last " << it
->first
<< dendl
;
4405 wait_on_list(in
->waitfor_caps
);
4409 void Client::wait_sync_caps(ceph_tid_t want
)
4412 ldout(cct
, 10) << __func__
<< " want " << want
<< " (last is " << last_flush_tid
<< ", "
4413 << num_flushing_caps
<< " total flushing)" << dendl
;
4414 for (auto &p
: mds_sessions
) {
4415 MetaSession
*s
= &p
.second
;
4416 if (s
->flushing_caps_tids
.empty())
4418 ceph_tid_t oldest_tid
= *s
->flushing_caps_tids
.begin();
4419 if (oldest_tid
<= want
) {
4420 ldout(cct
, 10) << " waiting on mds." << p
.first
<< " tid " << oldest_tid
4421 << " (want " << want
<< ")" << dendl
;
4422 sync_cond
.Wait(client_lock
);
4428 void Client::kick_flushing_caps(Inode
*in
, MetaSession
*session
)
4430 in
->flags
&= ~I_KICK_FLUSH
;
4432 Cap
*cap
= in
->auth_cap
;
4433 ceph_assert(cap
->session
== session
);
4435 ceph_tid_t last_snap_flush
= 0;
4436 for (auto p
= in
->flushing_cap_tids
.rbegin();
4437 p
!= in
->flushing_cap_tids
.rend();
4440 last_snap_flush
= p
->first
;
4445 int wanted
= in
->caps_wanted();
4446 int used
= get_caps_used(in
) | in
->caps_dirty();
4447 auto it
= in
->cap_snaps
.begin();
4448 for (auto& p
: in
->flushing_cap_tids
) {
4450 int msg_flags
= p
.first
< last_snap_flush
? MClientCaps::FLAG_PENDING_CAPSNAP
: 0;
4451 send_cap(in
, session
, cap
, msg_flags
, used
, wanted
, (cap
->issued
| cap
->implemented
),
4454 ceph_assert(it
!= in
->cap_snaps
.end());
4455 ceph_assert(it
->second
.flush_tid
== p
.first
);
4456 send_flush_snap(in
, session
, it
->first
, it
->second
);
4462 void Client::kick_flushing_caps(MetaSession
*session
)
4464 mds_rank_t mds
= session
->mds_num
;
4465 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
4467 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4469 if (in
->flags
& I_KICK_FLUSH
) {
4470 ldout(cct
, 20) << " reflushing caps on " << *in
<< " to mds." << mds
<< dendl
;
4471 kick_flushing_caps(in
, session
);
4476 void Client::early_kick_flushing_caps(MetaSession
*session
)
4478 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4480 Cap
*cap
= in
->auth_cap
;
4483 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4484 // stage. This guarantees that MDS processes the cap flush message before issuing
4485 // the flushing caps to other client.
4486 if ((in
->flushing_caps
& in
->auth_cap
->issued
) == in
->flushing_caps
) {
4487 in
->flags
|= I_KICK_FLUSH
;
4491 ldout(cct
, 20) << " reflushing caps (early_kick) on " << *in
4492 << " to mds." << session
->mds_num
<< dendl
;
4493 // send_reconnect() also will reset these sequence numbers. make sure
4494 // sequence numbers in cap flush message match later reconnect message.
4498 cap
->issued
= cap
->implemented
;
4500 kick_flushing_caps(in
, session
);
4504 void SnapRealm::build_snap_context()
4506 set
<snapid_t
> snaps
;
4507 snapid_t max_seq
= seq
;
4509 // start with prior_parents?
4510 for (unsigned i
=0; i
<prior_parent_snaps
.size(); i
++)
4511 snaps
.insert(prior_parent_snaps
[i
]);
4513 // current parent's snaps
4515 const SnapContext
& psnapc
= pparent
->get_snap_context();
4516 for (unsigned i
=0; i
<psnapc
.snaps
.size(); i
++)
4517 if (psnapc
.snaps
[i
] >= parent_since
)
4518 snaps
.insert(psnapc
.snaps
[i
]);
4519 if (psnapc
.seq
> max_seq
)
4520 max_seq
= psnapc
.seq
;
4524 for (unsigned i
=0; i
<my_snaps
.size(); i
++)
4525 snaps
.insert(my_snaps
[i
]);
4528 cached_snap_context
.seq
= max_seq
;
4529 cached_snap_context
.snaps
.resize(0);
4530 cached_snap_context
.snaps
.reserve(snaps
.size());
4531 for (set
<snapid_t
>::reverse_iterator p
= snaps
.rbegin(); p
!= snaps
.rend(); ++p
)
4532 cached_snap_context
.snaps
.push_back(*p
);
4535 void Client::invalidate_snaprealm_and_children(SnapRealm
*realm
)
4540 while (!q
.empty()) {
4544 ldout(cct
, 10) << __func__
<< " " << *realm
<< dendl
;
4545 realm
->invalidate_cache();
4547 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
4548 p
!= realm
->pchildren
.end();
4554 SnapRealm
*Client::get_snap_realm(inodeno_t r
)
4556 SnapRealm
*realm
= snap_realms
[r
];
4558 snap_realms
[r
] = realm
= new SnapRealm(r
);
4559 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
4564 SnapRealm
*Client::get_snap_realm_maybe(inodeno_t r
)
4566 if (snap_realms
.count(r
) == 0) {
4567 ldout(cct
, 20) << __func__
<< " " << r
<< " fail" << dendl
;
4570 SnapRealm
*realm
= snap_realms
[r
];
4571 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
4576 void Client::put_snap_realm(SnapRealm
*realm
)
4578 ldout(cct
, 20) << __func__
<< " " << realm
->ino
<< " " << realm
4579 << " " << realm
->nref
<< " -> " << (realm
->nref
- 1) << dendl
;
4580 if (--realm
->nref
== 0) {
4581 snap_realms
.erase(realm
->ino
);
4582 if (realm
->pparent
) {
4583 realm
->pparent
->pchildren
.erase(realm
);
4584 put_snap_realm(realm
->pparent
);
4590 bool Client::adjust_realm_parent(SnapRealm
*realm
, inodeno_t parent
)
4592 if (realm
->parent
!= parent
) {
4593 ldout(cct
, 10) << __func__
<< " " << *realm
4594 << " " << realm
->parent
<< " -> " << parent
<< dendl
;
4595 realm
->parent
= parent
;
4596 if (realm
->pparent
) {
4597 realm
->pparent
->pchildren
.erase(realm
);
4598 put_snap_realm(realm
->pparent
);
4600 realm
->pparent
= get_snap_realm(parent
);
4601 realm
->pparent
->pchildren
.insert(realm
);
4607 static bool has_new_snaps(const SnapContext
& old_snapc
,
4608 const SnapContext
& new_snapc
)
4610 return !new_snapc
.snaps
.empty() && new_snapc
.snaps
[0] > old_snapc
.seq
;
4614 void Client::update_snap_trace(const bufferlist
& bl
, SnapRealm
**realm_ret
, bool flush
)
4616 SnapRealm
*first_realm
= NULL
;
4617 ldout(cct
, 10) << __func__
<< " len " << bl
.length() << dendl
;
4619 map
<SnapRealm
*, SnapContext
> dirty_realms
;
4621 auto p
= bl
.cbegin();
4625 SnapRealm
*realm
= get_snap_realm(info
.ino());
4627 bool invalidate
= false;
4629 if (info
.seq() > realm
->seq
) {
4630 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq() << " > " << realm
->seq
4634 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4635 // flush me + children
4638 while (!q
.empty()) {
4639 SnapRealm
*realm
= q
.front();
4642 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
4643 p
!= realm
->pchildren
.end();
4647 if (dirty_realms
.count(realm
) == 0) {
4649 dirty_realms
[realm
] = realm
->get_snap_context();
4655 realm
->seq
= info
.seq();
4656 realm
->created
= info
.created();
4657 realm
->parent_since
= info
.parent_since();
4658 realm
->prior_parent_snaps
= info
.prior_parent_snaps
;
4659 realm
->my_snaps
= info
.my_snaps
;
4663 // _always_ verify parent
4664 if (adjust_realm_parent(realm
, info
.parent()))
4668 invalidate_snaprealm_and_children(realm
);
4669 ldout(cct
, 15) << __func__
<< " " << *realm
<< " self|parent updated" << dendl
;
4670 ldout(cct
, 15) << " snapc " << realm
->get_snap_context() << dendl
;
4672 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq()
4673 << " <= " << realm
->seq
<< " and same parent, SKIPPING" << dendl
;
4677 first_realm
= realm
;
4679 put_snap_realm(realm
);
4682 for (map
<SnapRealm
*, SnapContext
>::iterator q
= dirty_realms
.begin();
4683 q
!= dirty_realms
.end();
4685 SnapRealm
*realm
= q
->first
;
4686 // if there are new snaps ?
4687 if (has_new_snaps(q
->second
, realm
->get_snap_context())) {
4688 ldout(cct
, 10) << " flushing caps on " << *realm
<< dendl
;
4689 xlist
<Inode
*>::iterator r
= realm
->inodes_with_caps
.begin();
4693 queue_cap_snap(in
, q
->second
);
4696 ldout(cct
, 10) << " no new snap on " << *realm
<< dendl
;
4698 put_snap_realm(realm
);
4702 *realm_ret
= first_realm
;
4704 put_snap_realm(first_realm
);
4707 void Client::handle_snap(const MConstRef
<MClientSnap
>& m
)
4709 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
4710 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4711 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4716 got_mds_push(session
);
4718 map
<Inode
*, SnapContext
> to_move
;
4719 SnapRealm
*realm
= 0;
4721 if (m
->head
.op
== CEPH_SNAP_OP_SPLIT
) {
4722 ceph_assert(m
->head
.split
);
4724 auto p
= m
->bl
.cbegin();
4726 ceph_assert(info
.ino() == m
->head
.split
);
4728 // flush, then move, ino's.
4729 realm
= get_snap_realm(info
.ino());
4730 ldout(cct
, 10) << " splitting off " << *realm
<< dendl
;
4731 for (auto& ino
: m
->split_inos
) {
4732 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
4733 if (inode_map
.count(vino
)) {
4734 Inode
*in
= inode_map
[vino
];
4735 if (!in
->snaprealm
|| in
->snaprealm
== realm
)
4737 if (in
->snaprealm
->created
> info
.created()) {
4738 ldout(cct
, 10) << " NOT moving " << *in
<< " from _newer_ realm "
4739 << *in
->snaprealm
<< dendl
;
4742 ldout(cct
, 10) << " moving " << *in
<< " from " << *in
->snaprealm
<< dendl
;
4745 in
->snaprealm_item
.remove_myself();
4746 to_move
[in
] = in
->snaprealm
->get_snap_context();
4747 put_snap_realm(in
->snaprealm
);
4751 // move child snaprealms, too
4752 for (auto& child_realm
: m
->split_realms
) {
4753 ldout(cct
, 10) << "adjusting snaprealm " << child_realm
<< " parent" << dendl
;
4754 SnapRealm
*child
= get_snap_realm_maybe(child_realm
);
4757 adjust_realm_parent(child
, realm
->ino
);
4758 put_snap_realm(child
);
4762 update_snap_trace(m
->bl
, NULL
, m
->head
.op
!= CEPH_SNAP_OP_DESTROY
);
4765 for (auto p
= to_move
.begin(); p
!= to_move
.end(); ++p
) {
4766 Inode
*in
= p
->first
;
4767 in
->snaprealm
= realm
;
4768 realm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4770 // queue for snap writeback
4771 if (has_new_snaps(p
->second
, realm
->get_snap_context()))
4772 queue_cap_snap(in
, p
->second
);
4774 put_snap_realm(realm
);
4778 void Client::handle_quota(const MConstRef
<MClientQuota
>& m
)
4780 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4781 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4786 got_mds_push(session
);
4788 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << mds
<< dendl
;
4790 vinodeno_t
vino(m
->ino
, CEPH_NOSNAP
);
4791 if (inode_map
.count(vino
)) {
4793 in
= inode_map
[vino
];
4796 in
->quota
= m
->quota
;
4797 in
->rstat
= m
->rstat
;
4802 void Client::handle_caps(const MConstRef
<MClientCaps
>& m
)
4804 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4805 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4810 if (m
->osd_epoch_barrier
&& !objecter
->have_map(m
->osd_epoch_barrier
)) {
4811 // Pause RADOS operations until we see the required epoch
4812 objecter
->set_epoch_barrier(m
->osd_epoch_barrier
);
4815 if (m
->osd_epoch_barrier
> cap_epoch_barrier
) {
4816 // Record the barrier so that we will transmit it to MDS when releasing
4817 set_cap_epoch_barrier(m
->osd_epoch_barrier
);
4820 got_mds_push(session
);
4823 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
4824 if (auto it
= inode_map
.find(vino
); it
!= inode_map
.end()) {
4827 if (m
->get_op() == CEPH_CAP_OP_IMPORT
) {
4828 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< " on IMPORT, immediately releasing" << dendl
;
4829 session
->enqueue_cap_release(
4836 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< ", dropping" << dendl
;
4839 // in case the mds is waiting on e.g. a revocation
4840 flush_cap_releases();
4844 switch (m
->get_op()) {
4845 case CEPH_CAP_OP_EXPORT
: return handle_cap_export(session
, in
, m
);
4846 case CEPH_CAP_OP_FLUSHSNAP_ACK
: return handle_cap_flushsnap_ack(session
, in
, m
);
4847 case CEPH_CAP_OP_IMPORT
: /* no return */ handle_cap_import(session
, in
, m
);
4850 if (auto it
= in
->caps
.find(mds
); it
!= in
->caps
.end()) {
4851 Cap
&cap
= in
->caps
.at(mds
);
4853 switch (m
->get_op()) {
4854 case CEPH_CAP_OP_TRUNC
: return handle_cap_trunc(session
, in
, m
);
4855 case CEPH_CAP_OP_IMPORT
:
4856 case CEPH_CAP_OP_REVOKE
:
4857 case CEPH_CAP_OP_GRANT
: return handle_cap_grant(session
, in
, &cap
, m
);
4858 case CEPH_CAP_OP_FLUSH_ACK
: return handle_cap_flush_ack(session
, in
, &cap
, m
);
4861 ldout(cct
, 5) << __func__
<< " don't have " << *in
<< " cap on mds." << mds
<< dendl
;
4866 void Client::handle_cap_import(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
4868 mds_rank_t mds
= session
->mds_num
;
4870 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
4871 << " IMPORT from mds." << mds
<< dendl
;
4873 const mds_rank_t peer_mds
= mds_rank_t(m
->peer
.mds
);
4876 if (auto it
= in
->caps
.find(peer_mds
); m
->peer
.cap_id
&& it
!= in
->caps
.end()) {
4878 cap_perms
= cap
->latest_perms
;
4882 SnapRealm
*realm
= NULL
;
4883 update_snap_trace(m
->snapbl
, &realm
);
4885 add_update_cap(in
, session
, m
->get_cap_id(),
4886 m
->get_caps(), m
->get_wanted(), m
->get_seq(), m
->get_mseq(),
4887 m
->get_realm(), CEPH_CAP_FLAG_AUTH
, cap_perms
);
4889 if (cap
&& cap
->cap_id
== m
->peer
.cap_id
) {
4890 remove_cap(cap
, (m
->peer
.flags
& CEPH_CAP_FLAG_RELEASE
));
4894 put_snap_realm(realm
);
4896 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
4897 // reflush any/all caps (if we are now the auth_cap)
4898 kick_flushing_caps(in
, session
);
4902 void Client::handle_cap_export(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
4904 mds_rank_t mds
= session
->mds_num
;
4906 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
4907 << " EXPORT from mds." << mds
<< dendl
;
4909 auto it
= in
->caps
.find(mds
);
4910 if (it
!= in
->caps
.end()) {
4911 Cap
&cap
= it
->second
;
4912 if (cap
.cap_id
== m
->get_cap_id()) {
4913 if (m
->peer
.cap_id
) {
4914 const auto peer_mds
= mds_rank_t(m
->peer
.mds
);
4915 MetaSession
*tsession
= _get_or_open_mds_session(peer_mds
);
4916 auto it
= in
->caps
.find(peer_mds
);
4917 if (it
!= in
->caps
.end()) {
4918 Cap
&tcap
= it
->second
;
4919 if (tcap
.cap_id
== m
->peer
.cap_id
&&
4920 ceph_seq_cmp(tcap
.seq
, m
->peer
.seq
) < 0) {
4921 tcap
.cap_id
= m
->peer
.cap_id
;
4922 tcap
.seq
= m
->peer
.seq
- 1;
4923 tcap
.issue_seq
= tcap
.seq
;
4924 tcap
.issued
|= cap
.issued
;
4925 tcap
.implemented
|= cap
.issued
;
4926 if (&cap
== in
->auth_cap
)
4927 in
->auth_cap
= &tcap
;
4928 if (in
->auth_cap
== &tcap
&& in
->flushing_cap_item
.is_on_list())
4929 adjust_session_flushing_caps(in
, session
, tsession
);
4932 add_update_cap(in
, tsession
, m
->peer
.cap_id
, cap
.issued
, 0,
4933 m
->peer
.seq
- 1, m
->peer
.mseq
, (uint64_t)-1,
4934 &cap
== in
->auth_cap
? CEPH_CAP_FLAG_AUTH
: 0,
4938 if (cap
.wanted
| cap
.issued
)
4939 in
->flags
|= I_CAP_DROPPED
;
4942 remove_cap(&cap
, false);
4947 void Client::handle_cap_trunc(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
4949 mds_rank_t mds
= session
->mds_num
;
4950 ceph_assert(in
->caps
.count(mds
));
4952 ldout(cct
, 10) << __func__
<< " on ino " << *in
4953 << " size " << in
->size
<< " -> " << m
->get_size()
4957 in
->caps_issued(&issued
);
4958 issued
|= in
->caps_dirty();
4959 update_inode_file_size(in
, issued
, m
->get_size(),
4960 m
->get_truncate_seq(), m
->get_truncate_size());
4963 void Client::handle_cap_flush_ack(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
4965 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
4966 int dirty
= m
->get_dirty();
4970 auto it
= in
->flushing_cap_tids
.begin();
4971 if (it
->first
< flush_ack_tid
) {
4972 ldout(cct
, 0) << __func__
<< " mds." << session
->mds_num
4973 << " got unexpected flush ack tid " << flush_ack_tid
4974 << " expected is " << it
->first
<< dendl
;
4976 for (; it
!= in
->flushing_cap_tids
.end(); ) {
4982 if (it
->first
== flush_ack_tid
)
4983 cleaned
= it
->second
;
4984 if (it
->first
<= flush_ack_tid
) {
4985 session
->flushing_caps_tids
.erase(it
->first
);
4986 in
->flushing_cap_tids
.erase(it
++);
4990 cleaned
&= ~it
->second
;
4996 ldout(cct
, 5) << __func__
<< " mds." << session
->mds_num
4997 << " cleaned " << ccap_string(cleaned
) << " on " << *in
4998 << " with " << ccap_string(dirty
) << dendl
;
5001 signal_cond_list(in
->waitfor_caps
);
5002 if (session
->flushing_caps_tids
.empty() ||
5003 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5008 in
->cap_dirtier_uid
= -1;
5009 in
->cap_dirtier_gid
= -1;
5013 ldout(cct
, 10) << " tid " << m
->get_client_tid() << " != any cap bit tids" << dendl
;
5015 if (in
->flushing_caps
) {
5016 ldout(cct
, 5) << " flushing_caps " << ccap_string(in
->flushing_caps
)
5017 << " -> " << ccap_string(in
->flushing_caps
& ~cleaned
) << dendl
;
5018 in
->flushing_caps
&= ~cleaned
;
5019 if (in
->flushing_caps
== 0) {
5020 ldout(cct
, 10) << " " << *in
<< " !flushing" << dendl
;
5021 num_flushing_caps
--;
5022 if (in
->flushing_cap_tids
.empty())
5023 in
->flushing_cap_item
.remove_myself();
5025 if (!in
->caps_dirty())
5032 void Client::handle_cap_flushsnap_ack(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5034 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
5035 mds_rank_t mds
= session
->mds_num
;
5036 ceph_assert(in
->caps
.count(mds
));
5037 snapid_t follows
= m
->get_snap_follows();
5039 if (auto it
= in
->cap_snaps
.find(follows
); it
!= in
->cap_snaps
.end()) {
5040 auto& capsnap
= it
->second
;
5041 if (flush_ack_tid
!= capsnap
.flush_tid
) {
5042 ldout(cct
, 10) << " tid " << flush_ack_tid
<< " != " << capsnap
.flush_tid
<< dendl
;
5044 InodeRef
tmp_ref(in
);
5045 ldout(cct
, 5) << __func__
<< " mds." << mds
<< " flushed snap follows " << follows
5046 << " on " << *in
<< dendl
;
5047 session
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
5048 in
->flushing_cap_tids
.erase(capsnap
.flush_tid
);
5049 if (in
->flushing_caps
== 0 && in
->flushing_cap_tids
.empty())
5050 in
->flushing_cap_item
.remove_myself();
5051 in
->cap_snaps
.erase(it
);
5053 signal_cond_list(in
->waitfor_caps
);
5054 if (session
->flushing_caps_tids
.empty() ||
5055 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5059 ldout(cct
, 5) << __func__
<< " DUP(?) mds." << mds
<< " flushed snap follows " << follows
5060 << " on " << *in
<< dendl
;
5061 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
5065 class C_Client_DentryInvalidate
: public Context
{
5072 C_Client_DentryInvalidate(Client
*c
, Dentry
*dn
, bool del
) :
5073 client(c
), name(dn
->name
) {
5074 if (client
->use_faked_inos()) {
5075 dirino
.ino
= dn
->dir
->parent_inode
->faked_ino
;
5077 ino
.ino
= dn
->inode
->faked_ino
;
5079 dirino
= dn
->dir
->parent_inode
->vino();
5081 ino
= dn
->inode
->vino();
5084 ino
.ino
= inodeno_t();
5086 void finish(int r
) override
{
5087 // _async_dentry_invalidate is responsible for its own locking
5088 ceph_assert(!client
->client_lock
.is_locked_by_me());
5089 client
->_async_dentry_invalidate(dirino
, ino
, name
);
5093 void Client::_async_dentry_invalidate(vinodeno_t dirino
, vinodeno_t ino
, string
& name
)
5097 ldout(cct
, 10) << __func__
<< " '" << name
<< "' ino " << ino
5098 << " in dir " << dirino
<< dendl
;
5099 dentry_invalidate_cb(callback_handle
, dirino
, ino
, name
);
5102 void Client::_schedule_invalidate_dentry_callback(Dentry
*dn
, bool del
)
5104 if (dentry_invalidate_cb
&& dn
->inode
->ll_ref
> 0)
5105 async_dentry_invalidator
.queue(new C_Client_DentryInvalidate(this, dn
, del
));
5108 void Client::_try_to_trim_inode(Inode
*in
, bool sched_inval
)
5110 int ref
= in
->get_num_ref();
5111 ldout(cct
, 5) << __func__
<< " in " << *in
<<dendl
;
5113 if (in
->dir
&& !in
->dir
->dentries
.empty()) {
5114 for (auto p
= in
->dir
->dentries
.begin();
5115 p
!= in
->dir
->dentries
.end(); ) {
5116 Dentry
*dn
= p
->second
;
5118 /* rmsnap removes whole subtree, need trim inodes recursively.
5119 * we don't need to invalidate dentries recursively. because
5120 * invalidating a directory dentry effectively invalidate
5122 if (in
->snapid
!= CEPH_NOSNAP
&& dn
->inode
&& dn
->inode
->is_dir())
5123 _try_to_trim_inode(dn
->inode
.get(), false);
5125 if (dn
->lru_is_expireable())
5126 unlink(dn
, true, false); // keep dir, drop dentry
5128 if (in
->dir
->dentries
.empty()) {
5134 if (ref
> 0 && (in
->flags
& I_SNAPDIR_OPEN
)) {
5135 InodeRef snapdir
= open_snapdir(in
);
5136 _try_to_trim_inode(snapdir
.get(), false);
5141 auto q
= in
->dentries
.begin();
5142 while (q
!= in
->dentries
.end()) {
5145 if( in
->ll_ref
> 0 && sched_inval
) {
5146 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5147 // so in->dentries doesn't always reflect the state of kernel's dcache.
5148 _schedule_invalidate_dentry_callback(dn
, true);
5150 unlink(dn
, true, true);
5155 void Client::handle_cap_grant(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
5157 mds_rank_t mds
= session
->mds_num
;
5158 int used
= get_caps_used(in
);
5159 int wanted
= in
->caps_wanted();
5161 const unsigned new_caps
= m
->get_caps();
5162 const bool was_stale
= session
->cap_gen
> cap
->gen
;
5163 ldout(cct
, 5) << __func__
<< " on in " << m
->get_ino()
5164 << " mds." << mds
<< " seq " << m
->get_seq()
5165 << " caps now " << ccap_string(new_caps
)
5166 << " was " << ccap_string(cap
->issued
)
5167 << (was_stale
? " (stale)" : "") << dendl
;
5170 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
5171 cap
->seq
= m
->get_seq();
5172 cap
->gen
= session
->cap_gen
;
5174 check_cap_issue(in
, new_caps
);
5178 in
->caps_issued(&issued
);
5179 issued
|= in
->caps_dirty();
5181 if ((new_caps
& CEPH_CAP_AUTH_SHARED
) &&
5182 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
5183 in
->mode
= m
->head
.mode
;
5184 in
->uid
= m
->head
.uid
;
5185 in
->gid
= m
->head
.gid
;
5186 in
->btime
= m
->btime
;
5188 bool deleted_inode
= false;
5189 if ((new_caps
& CEPH_CAP_LINK_SHARED
) &&
5190 !(issued
& CEPH_CAP_LINK_EXCL
)) {
5191 in
->nlink
= m
->head
.nlink
;
5192 if (in
->nlink
== 0 &&
5193 (new_caps
& (CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
)))
5194 deleted_inode
= true;
5196 if (!(issued
& CEPH_CAP_XATTR_EXCL
) &&
5197 m
->xattrbl
.length() &&
5198 m
->head
.xattr_version
> in
->xattr_version
) {
5199 auto p
= m
->xattrbl
.cbegin();
5200 decode(in
->xattrs
, p
);
5201 in
->xattr_version
= m
->head
.xattr_version
;
5204 if ((new_caps
& CEPH_CAP_FILE_SHARED
) && m
->dirstat_is_valid()) {
5205 in
->dirstat
.nfiles
= m
->get_nfiles();
5206 in
->dirstat
.nsubdirs
= m
->get_nsubdirs();
5209 if (new_caps
& CEPH_CAP_ANY_RD
) {
5210 update_inode_file_time(in
, issued
, m
->get_time_warp_seq(),
5211 m
->get_ctime(), m
->get_mtime(), m
->get_atime());
5214 if (new_caps
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
)) {
5215 in
->layout
= m
->get_layout();
5216 update_inode_file_size(in
, issued
, m
->get_size(),
5217 m
->get_truncate_seq(), m
->get_truncate_size());
5220 if (m
->inline_version
> in
->inline_version
) {
5221 in
->inline_data
= m
->inline_data
;
5222 in
->inline_version
= m
->inline_version
;
5225 /* always take a newer change attr */
5226 if (m
->get_change_attr() > in
->change_attr
)
5227 in
->change_attr
= m
->get_change_attr();
5230 if (cap
== in
->auth_cap
&&
5231 (new_caps
& CEPH_CAP_ANY_FILE_WR
) &&
5232 (m
->get_max_size() != in
->max_size
)) {
5233 ldout(cct
, 10) << "max_size " << in
->max_size
<< " -> " << m
->get_max_size() << dendl
;
5234 in
->max_size
= m
->get_max_size();
5235 if (in
->max_size
> in
->wanted_max_size
) {
5236 in
->wanted_max_size
= 0;
5237 in
->requested_max_size
= 0;
5242 if ((was_stale
|| m
->get_op() == CEPH_CAP_OP_IMPORT
) &&
5243 (wanted
& ~(cap
->wanted
| new_caps
))) {
5244 // If mds is importing cap, prior cap messages that update 'wanted'
5245 // may get dropped by mds (migrate seq mismatch).
5247 // We don't send cap message to update 'wanted' if what we want are
5248 // already issued. If mds revokes caps, cap message that releases caps
5249 // also tells mds what we want. But if caps got revoked by mds forcedly
5250 // (session stale). We may haven't told mds what we want.
5256 auto revoked
= cap
->issued
& ~new_caps
;
5258 ldout(cct
, 10) << " revocation of " << ccap_string(revoked
) << dendl
;
5259 cap
->issued
= new_caps
;
5260 cap
->implemented
|= new_caps
;
5262 // recall delegations if we're losing caps necessary for them
5263 if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_RD
))
5264 in
->recall_deleg(false);
5265 else if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_WR
))
5266 in
->recall_deleg(true);
5268 used
= adjust_caps_used_for_lazyio(used
, cap
->issued
, cap
->implemented
);
5269 if ((used
& revoked
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
)) &&
5270 !_flush(in
, new C_Client_FlushComplete(this, in
))) {
5271 // waitin' for flush
5272 } else if (used
& revoked
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
)) {
5276 cap
->wanted
= 0; // don't let check_caps skip sending a response to MDS
5279 } else if (cap
->issued
== new_caps
) {
5280 ldout(cct
, 10) << " caps unchanged at " << ccap_string(cap
->issued
) << dendl
;
5282 ldout(cct
, 10) << " grant, new caps are " << ccap_string(new_caps
& ~cap
->issued
) << dendl
;
5283 cap
->issued
= new_caps
;
5284 cap
->implemented
|= new_caps
;
5286 if (cap
== in
->auth_cap
) {
5287 // non-auth MDS is revoking the newly grant caps ?
5288 for (const auto &p
: in
->caps
) {
5289 if (&p
.second
== cap
)
5291 if (p
.second
.implemented
& ~p
.second
.issued
& new_caps
) {
5304 signal_cond_list(in
->waitfor_caps
);
5306 // may drop inode's last ref
5308 _try_to_trim_inode(in
, true);
5311 int Client::inode_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
5313 if (perms
.uid() == 0)
5316 if (perms
.uid() != in
->uid
&& (in
->mode
& S_IRWXG
)) {
5317 int ret
= _posix_acl_permission(in
, perms
, want
);
5322 // check permissions before doing anything else
5323 if (!in
->check_mode(perms
, want
))
5328 int Client::xattr_permission(Inode
*in
, const char *name
, unsigned want
,
5329 const UserPerm
& perms
)
5331 int r
= _getattr_for_perm(in
, perms
);
5336 if (strncmp(name
, "system.", 7) == 0) {
5337 if ((want
& MAY_WRITE
) && (perms
.uid() != 0 && perms
.uid() != in
->uid
))
5340 r
= inode_permission(in
, perms
, want
);
5343 ldout(cct
, 5) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5347 ostream
& operator<<(ostream
&out
, const UserPerm
& perm
) {
5348 out
<< "UserPerm(uid: " << perm
.uid() << ", gid: " << perm
.gid() << ")";
5352 int Client::may_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
5353 const UserPerm
& perms
)
5355 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5356 int r
= _getattr_for_perm(in
, perms
);
5360 if (mask
& CEPH_SETATTR_SIZE
) {
5361 r
= inode_permission(in
, perms
, MAY_WRITE
);
5367 if (mask
& CEPH_SETATTR_UID
) {
5368 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
|| stx
->stx_uid
!= in
->uid
))
5371 if (mask
& CEPH_SETATTR_GID
) {
5372 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
||
5373 (!perms
.gid_in_groups(stx
->stx_gid
) && stx
->stx_gid
!= in
->gid
)))
5377 if (mask
& CEPH_SETATTR_MODE
) {
5378 if (perms
.uid() != 0 && perms
.uid() != in
->uid
)
5381 gid_t i_gid
= (mask
& CEPH_SETATTR_GID
) ? stx
->stx_gid
: in
->gid
;
5382 if (perms
.uid() != 0 && !perms
.gid_in_groups(i_gid
))
5383 stx
->stx_mode
&= ~S_ISGID
;
5386 if (mask
& (CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
|
5387 CEPH_SETATTR_MTIME
| CEPH_SETATTR_ATIME
)) {
5388 if (perms
.uid() != 0 && perms
.uid() != in
->uid
) {
5389 int check_mask
= CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
;
5390 if (!(mask
& CEPH_SETATTR_MTIME_NOW
))
5391 check_mask
|= CEPH_SETATTR_MTIME
;
5392 if (!(mask
& CEPH_SETATTR_ATIME_NOW
))
5393 check_mask
|= CEPH_SETATTR_ATIME
;
5394 if (check_mask
& mask
) {
5397 r
= inode_permission(in
, perms
, MAY_WRITE
);
5405 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5409 int Client::may_open(Inode
*in
, int flags
, const UserPerm
& perms
)
5411 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5414 if ((flags
& O_ACCMODE
) == O_WRONLY
)
5416 else if ((flags
& O_ACCMODE
) == O_RDWR
)
5417 want
= MAY_READ
| MAY_WRITE
;
5418 else if ((flags
& O_ACCMODE
) == O_RDONLY
)
5420 if (flags
& O_TRUNC
)
5424 switch (in
->mode
& S_IFMT
) {
5429 if (want
& MAY_WRITE
) {
5436 r
= _getattr_for_perm(in
, perms
);
5440 r
= inode_permission(in
, perms
, want
);
5442 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5446 int Client::may_lookup(Inode
*dir
, const UserPerm
& perms
)
5448 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5449 int r
= _getattr_for_perm(dir
, perms
);
5453 r
= inode_permission(dir
, perms
, MAY_EXEC
);
5455 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5459 int Client::may_create(Inode
*dir
, const UserPerm
& perms
)
5461 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5462 int r
= _getattr_for_perm(dir
, perms
);
5466 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5468 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5472 int Client::may_delete(Inode
*dir
, const char *name
, const UserPerm
& perms
)
5474 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << "; name " << name
<< "; " << perms
<< dendl
;
5475 int r
= _getattr_for_perm(dir
, perms
);
5479 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5483 /* 'name == NULL' means rmsnap */
5484 if (perms
.uid() != 0 && name
&& (dir
->mode
& S_ISVTX
)) {
5486 r
= _lookup(dir
, name
, CEPH_CAP_AUTH_SHARED
, &otherin
, perms
);
5489 if (dir
->uid
!= perms
.uid() && otherin
->uid
!= perms
.uid())
5493 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5497 int Client::may_hardlink(Inode
*in
, const UserPerm
& perms
)
5499 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5500 int r
= _getattr_for_perm(in
, perms
);
5504 if (perms
.uid() == 0 || perms
.uid() == in
->uid
) {
5510 if (!S_ISREG(in
->mode
))
5513 if (in
->mode
& S_ISUID
)
5516 if ((in
->mode
& (S_ISGID
| S_IXGRP
)) == (S_ISGID
| S_IXGRP
))
5519 r
= inode_permission(in
, perms
, MAY_READ
| MAY_WRITE
);
5521 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5525 int Client::_getattr_for_perm(Inode
*in
, const UserPerm
& perms
)
5527 int mask
= CEPH_STAT_CAP_MODE
;
5529 if (acl_type
!= NO_ACL
) {
5530 mask
|= CEPH_STAT_CAP_XATTR
;
5531 force
= in
->xattr_version
== 0;
5533 return _getattr(in
, mask
, perms
, force
);
5536 vinodeno_t
Client::_get_vino(Inode
*in
)
5538 /* The caller must hold the client lock */
5539 return vinodeno_t(in
->ino
, in
->snapid
);
5543 * Resolve an MDS spec to a list of MDS daemon GIDs.
5545 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5546 * It may be '*' in which case it matches all GIDs.
5548 * If no error is returned, the `targets` vector will be populated with at least
5551 int Client::resolve_mds(
5552 const std::string
&mds_spec
,
5553 std::vector
<mds_gid_t
> *targets
)
5556 ceph_assert(targets
!= nullptr);
5559 std::stringstream ss
;
5560 int role_r
= fsmap
->parse_role(mds_spec
, &role
, ss
);
5562 // We got a role, resolve it to a GID
5563 ldout(cct
, 10) << __func__
<< ": resolved '" << mds_spec
<< "' to role '"
5564 << role
<< "'" << dendl
;
5566 fsmap
->get_filesystem(role
.fscid
)->mds_map
.get_info(role
.rank
).global_id
);
5570 std::string strtol_err
;
5571 long long rank_or_gid
= strict_strtoll(mds_spec
.c_str(), 10, &strtol_err
);
5572 if (strtol_err
.empty()) {
5573 // It is a possible GID
5574 const mds_gid_t mds_gid
= mds_gid_t(rank_or_gid
);
5575 if (fsmap
->gid_exists(mds_gid
)) {
5576 ldout(cct
, 10) << __func__
<< ": validated GID " << mds_gid
<< dendl
;
5577 targets
->push_back(mds_gid
);
5579 lderr(cct
) << __func__
<< ": GID " << mds_gid
<< " not in MDS map"
5583 } else if (mds_spec
== "*") {
5584 // It is a wildcard: use all MDSs
5585 const auto mds_info
= fsmap
->get_mds_info();
5587 if (mds_info
.empty()) {
5588 lderr(cct
) << __func__
<< ": * passed but no MDS daemons found" << dendl
;
5592 for (const auto i
: mds_info
) {
5593 targets
->push_back(i
.first
);
5596 // It did not parse as an integer, it is not a wildcard, it must be a name
5597 const mds_gid_t mds_gid
= fsmap
->find_mds_gid_by_name(mds_spec
);
5599 lderr(cct
) << "MDS ID '" << mds_spec
<< "' not found" << dendl
;
5601 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
5605 ldout(cct
, 10) << __func__
<< ": resolved ID '" << mds_spec
5606 << "' to GID " << mds_gid
<< dendl
;
5607 targets
->push_back(mds_gid
);
5616 * Authenticate with mon and establish global ID
5618 int Client::authenticate()
5620 ceph_assert(client_lock
.is_locked_by_me());
5622 if (monclient
->is_authenticated()) {
5626 client_lock
.Unlock();
5627 int r
= monclient
->authenticate(cct
->_conf
->client_mount_timeout
);
5633 whoami
= monclient
->get_global_id();
5634 messenger
->set_myname(entity_name_t::CLIENT(whoami
.v
));
5639 int Client::fetch_fsmap(bool user
)
5642 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5643 // rather than MDSMap because no one MDSMap contains all the daemons, and
5644 // a `tell` can address any daemon.
5645 version_t fsmap_latest
;
5648 monclient
->get_version("fsmap", &fsmap_latest
, NULL
, &cond
);
5649 client_lock
.Unlock();
5652 } while (r
== -EAGAIN
);
5655 lderr(cct
) << "Failed to learn FSMap version: " << cpp_strerror(r
) << dendl
;
5659 ldout(cct
, 10) << __func__
<< " learned FSMap version " << fsmap_latest
<< dendl
;
5662 if (!fsmap_user
|| fsmap_user
->get_epoch() < fsmap_latest
) {
5663 monclient
->sub_want("fsmap.user", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
5664 monclient
->renew_subs();
5665 wait_on_list(waiting_for_fsmap
);
5667 ceph_assert(fsmap_user
);
5668 ceph_assert(fsmap_user
->get_epoch() >= fsmap_latest
);
5670 if (!fsmap
|| fsmap
->get_epoch() < fsmap_latest
) {
5671 monclient
->sub_want("fsmap", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
5672 monclient
->renew_subs();
5673 wait_on_list(waiting_for_fsmap
);
5676 ceph_assert(fsmap
->get_epoch() >= fsmap_latest
);
5678 ldout(cct
, 10) << __func__
<< " finished waiting for FSMap version "
5679 << fsmap_latest
<< dendl
;
5685 * @mds_spec one of ID, rank, GID, "*"
5688 int Client::mds_command(
5689 const std::string
&mds_spec
,
5690 const vector
<string
>& cmd
,
5691 const bufferlist
& inbl
,
5696 std::lock_guard
lock(client_lock
);
5707 r
= fetch_fsmap(false);
5712 // Look up MDS target(s) of the command
5713 std::vector
<mds_gid_t
> targets
;
5714 r
= resolve_mds(mds_spec
, &targets
);
5719 // If daemons are laggy, we won't send them commands. If all
5720 // are laggy then we fail.
5721 std::vector
<mds_gid_t
> non_laggy
;
5722 for (const auto gid
: targets
) {
5723 const auto info
= fsmap
->get_info_gid(gid
);
5724 if (!info
.laggy()) {
5725 non_laggy
.push_back(gid
);
5728 if (non_laggy
.size() == 0) {
5729 *outs
= "All targeted MDS daemons are laggy";
5733 if (metadata
.empty()) {
5734 // We are called on an unmounted client, so metadata
5735 // won't be initialized yet.
5736 populate_metadata("");
5739 // Send commands to targets
5740 C_GatherBuilder
gather(cct
, onfinish
);
5741 for (const auto target_gid
: non_laggy
) {
5742 const auto info
= fsmap
->get_info_gid(target_gid
);
5744 // Open a connection to the target MDS
5745 ConnectionRef conn
= messenger
->connect_to_mds(info
.get_addrs());
5747 // Generate MDSCommandOp state
5748 auto &op
= command_table
.start_command();
5750 op
.on_finish
= gather
.new_sub();
5755 op
.mds_gid
= target_gid
;
5758 ldout(cct
, 4) << __func__
<< ": new command op to " << target_gid
5759 << " tid=" << op
.tid
<< cmd
<< dendl
;
5761 // Construct and send MCommand
5762 auto m
= op
.get_message(monclient
->get_fsid());
5763 conn
->send_message2(std::move(m
));
5770 void Client::handle_command_reply(const MConstRef
<MCommandReply
>& m
)
5772 ceph_tid_t
const tid
= m
->get_tid();
5774 ldout(cct
, 10) << __func__
<< ": tid=" << m
->get_tid() << dendl
;
5776 if (!command_table
.exists(tid
)) {
5777 ldout(cct
, 1) << __func__
<< ": unknown tid " << tid
<< ", dropping" << dendl
;
5781 auto &op
= command_table
.get_command(tid
);
5783 *op
.outbl
= m
->get_data();
5790 op
.on_finish
->complete(m
->r
);
5793 command_table
.erase(tid
);
5796 // -------------------
5799 int Client::subscribe_mdsmap(const std::string
&fs_name
)
5801 int r
= authenticate();
5803 lderr(cct
) << "authentication failed: " << cpp_strerror(r
) << dendl
;
5807 std::string resolved_fs_name
;
5808 if (fs_name
.empty()) {
5809 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_mds_namespace");
5811 resolved_fs_name
= fs_name
;
5814 std::string want
= "mdsmap";
5815 if (!resolved_fs_name
.empty()) {
5816 r
= fetch_fsmap(true);
5819 fscid
= fsmap_user
->get_fs_cid(resolved_fs_name
);
5820 if (fscid
== FS_CLUSTER_ID_NONE
) {
5824 std::ostringstream oss
;
5825 oss
<< want
<< "." << fscid
;
5828 ldout(cct
, 10) << "Subscribing to map '" << want
<< "'" << dendl
;
5830 monclient
->sub_want(want
, 0, 0);
5831 monclient
->renew_subs();
5836 int Client::mount(const std::string
&mount_root
, const UserPerm
& perms
,
5837 bool require_mds
, const std::string
&fs_name
)
5839 std::lock_guard
lock(client_lock
);
5842 ldout(cct
, 5) << "already mounted" << dendl
;
5848 int r
= subscribe_mdsmap(fs_name
);
5850 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
5854 tick(); // start tick
5858 auto availability
= mdsmap
->is_cluster_available();
5859 if (availability
== MDSMap::STUCK_UNAVAILABLE
) {
5861 ldout(cct
, 10) << "mds cluster unavailable: epoch=" << mdsmap
->get_epoch() << dendl
;
5862 return CEPH_FUSE_NO_MDS_UP
;
5863 } else if (availability
== MDSMap::AVAILABLE
) {
5864 // Continue to mount
5866 } else if (availability
== MDSMap::TRANSIENT_UNAVAILABLE
) {
5867 // Else, wait. MDSMonitor will update the map to bring
5868 // us to a conclusion eventually.
5869 wait_on_list(waiting_for_mdsmap
);
5871 // Unexpected value!
5877 populate_metadata(mount_root
.empty() ? "/" : mount_root
);
5879 filepath
fp(CEPH_INO_ROOT
);
5880 if (!mount_root
.empty()) {
5881 fp
= filepath(mount_root
.c_str());
5884 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
5885 req
->set_filepath(fp
);
5886 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
5887 int res
= make_request(req
, perms
);
5889 if (res
== -EACCES
&& root
) {
5890 ldout(cct
, 1) << __func__
<< " EACCES on parent of mount point; quotas may not work" << dendl
;
5908 if (!cct
->_conf
->client_trace
.empty()) {
5909 traceout
.open(cct
->_conf
->client_trace
.c_str());
5910 if (traceout
.is_open()) {
5911 ldout(cct
, 1) << "opened trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
5913 ldout(cct
, 1) << "FAILED to open trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
5918 ldout(cct, 3) << "op: // client trace data structs" << dendl;
5919 ldout(cct, 3) << "op: struct stat st;" << dendl;
5920 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
5921 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
5922 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
5923 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
5924 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
5925 ldout(cct, 3) << "op: int fd;" << dendl;
5932 void Client::_close_sessions()
5934 while (!mds_sessions
.empty()) {
5935 // send session closes!
5936 for (auto &p
: mds_sessions
) {
5937 if (p
.second
.state
!= MetaSession::STATE_CLOSING
) {
5938 _close_mds_session(&p
.second
);
5942 // wait for sessions to close
5943 ldout(cct
, 2) << "waiting for " << mds_sessions
.size() << " mds sessions to close" << dendl
;
5944 mount_cond
.Wait(client_lock
);
5948 void Client::flush_mdlog_sync()
5950 if (mds_requests
.empty())
5952 for (auto &p
: mds_sessions
) {
5953 flush_mdlog(&p
.second
);
5957 void Client::flush_mdlog(MetaSession
*session
)
5959 // Only send this to Luminous or newer MDS daemons, older daemons
5960 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5961 const uint64_t features
= session
->con
->get_features();
5962 if (HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
5963 auto m
= MClientSession::create(CEPH_SESSION_REQUEST_FLUSH_MDLOG
);
5964 session
->con
->send_message2(std::move(m
));
5969 void Client::_abort_mds_sessions(int err
)
5971 for (auto p
= mds_requests
.begin(); p
!= mds_requests
.end(); ) {
5972 auto req
= p
->second
;
5974 // unsafe requests will be removed during close session below.
5975 if (req
->got_unsafe
)
5979 if (req
->caller_cond
) {
5981 req
->caller_cond
->Signal();
5985 // Process aborts on any requests that were on this waitlist.
5986 // Any requests that were on a waiting_for_open session waitlist
5987 // will get kicked during close session below.
5988 signal_cond_list(waiting_for_mdsmap
);
5990 // Force-close all sessions
5991 while(!mds_sessions
.empty()) {
5992 auto& session
= mds_sessions
.begin()->second
;
5993 _closed_mds_session(&session
);
5997 void Client::_unmount(bool abort
)
6002 if (abort
|| blacklisted
) {
6003 ldout(cct
, 2) << "unmounting (" << (abort
? "abort)" : "blacklisted)") << dendl
;
6005 ldout(cct
, 2) << "unmounting" << dendl
;
6012 // Abort all mds sessions
6013 _abort_mds_sessions(-ENOTCONN
);
6015 objecter
->op_cancel_writes(-ENOTCONN
);
6017 // flush the mdlog for pending requests, if any
6021 while (!mds_requests
.empty()) {
6022 ldout(cct
, 10) << "waiting on " << mds_requests
.size() << " requests" << dendl
;
6023 mount_cond
.Wait(client_lock
);
6027 timer
.cancel_event(tick_event
);
6032 // clean up any unclosed files
6033 while (!fd_map
.empty()) {
6034 Fh
*fh
= fd_map
.begin()->second
;
6035 fd_map
.erase(fd_map
.begin());
6036 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *fh
->inode
<< dendl
;
6040 while (!ll_unclosed_fh_set
.empty()) {
6041 set
<Fh
*>::iterator it
= ll_unclosed_fh_set
.begin();
6043 ll_unclosed_fh_set
.erase(fh
);
6044 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *(fh
->inode
) << dendl
;
6048 while (!opened_dirs
.empty()) {
6049 dir_result_t
*dirp
= *opened_dirs
.begin();
6050 ldout(cct
, 0) << " destroyed lost open dir " << dirp
<< " on " << *dirp
->inode
<< dendl
;
6056 while (unsafe_sync_write
> 0) {
6057 ldout(cct
, 0) << unsafe_sync_write
<< " unsafe_sync_writes, waiting" << dendl
;
6058 mount_cond
.Wait(client_lock
);
6061 if (cct
->_conf
->client_oc
) {
6062 // flush/release all buffered data
6063 std::list
<InodeRef
> anchor
;
6064 for (auto& p
: inode_map
) {
6065 Inode
*in
= p
.second
;
6067 ldout(cct
, 0) << "null inode_map entry ino " << p
.first
<< dendl
;
6071 // prevent inode from getting freed
6072 anchor
.emplace_back(in
);
6074 if (abort
|| blacklisted
) {
6075 objectcacher
->purge_set(&in
->oset
);
6076 } else if (!in
->caps
.empty()) {
6078 _flush(in
, new C_Client_FlushComplete(this, in
));
6083 if (abort
|| blacklisted
) {
6084 for (auto p
= dirty_list
.begin(); !p
.end(); ) {
6087 if (in
->dirty_caps
) {
6088 ldout(cct
, 0) << " drop dirty caps on " << *in
<< dendl
;
6089 in
->mark_caps_clean();
6095 wait_sync_caps(last_flush_tid
);
6101 while (lru
.lru_get_size() > 0 ||
6102 !inode_map
.empty()) {
6103 ldout(cct
, 2) << "cache still has " << lru
.lru_get_size()
6104 << "+" << inode_map
.size() << " items"
6105 << ", waiting (for caps to release?)"
6107 utime_t until
= ceph_clock_now() + utime_t(5, 0);
6108 int r
= mount_cond
.WaitUntil(client_lock
, until
);
6109 if (r
== ETIMEDOUT
) {
6113 ceph_assert(lru
.lru_get_size() == 0);
6114 ceph_assert(inode_map
.empty());
6117 if (!cct
->_conf
->client_trace
.empty()) {
6118 ldout(cct
, 1) << "closing trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6126 ldout(cct
, 2) << "unmounted." << dendl
;
6129 void Client::unmount()
6131 std::lock_guard
lock(client_lock
);
6135 void Client::abort_conn()
6137 std::lock_guard
lock(client_lock
);
6141 void Client::flush_cap_releases()
6143 // send any cap releases
6144 for (auto &p
: mds_sessions
) {
6145 auto &session
= p
.second
;
6146 if (session
.release
&& mdsmap
->is_clientreplay_or_active_or_stopping(
6148 if (cct
->_conf
->client_inject_release_failure
) {
6149 ldout(cct
, 20) << __func__
<< " injecting failure to send cap release message" << dendl
;
6151 session
.con
->send_message2(std::move(session
.release
));
6153 session
.release
.reset();
6160 if (cct
->_conf
->client_debug_inject_tick_delay
> 0) {
6161 sleep(cct
->_conf
->client_debug_inject_tick_delay
);
6162 ceph_assert(0 == cct
->_conf
.set_val("client_debug_inject_tick_delay", "0"));
6163 cct
->_conf
.apply_changes(nullptr);
6166 ldout(cct
, 21) << "tick" << dendl
;
6167 tick_event
= timer
.add_event_after(
6168 cct
->_conf
->client_tick_interval
,
6169 new FunctionContext([this](int) {
6170 // Called back via Timer, which takes client_lock for us
6171 ceph_assert(client_lock
.is_locked_by_me());
6174 utime_t now
= ceph_clock_now();
6176 if (!mounted
&& !mds_requests
.empty()) {
6177 MetaRequest
*req
= mds_requests
.begin()->second
;
6178 if (req
->op_stamp
+ cct
->_conf
->client_mount_timeout
< now
) {
6179 req
->abort(-ETIMEDOUT
);
6180 if (req
->caller_cond
) {
6182 req
->caller_cond
->Signal();
6184 signal_cond_list(waiting_for_mdsmap
);
6185 for (auto &p
: mds_sessions
) {
6186 signal_context_list(p
.second
.waiting_for_open
);
6191 if (mdsmap
->get_epoch()) {
6193 utime_t el
= now
- last_cap_renew
;
6194 if (el
> mdsmap
->get_session_timeout() / 3.0)
6197 flush_cap_releases();
6201 xlist
<Inode
*>::iterator p
= delayed_list
.begin();
6205 if (in
->hold_caps_until
> now
)
6207 delayed_list
.pop_front();
6208 check_caps(in
, CHECK_CAPS_NODELAY
);
6214 void Client::renew_caps()
6216 ldout(cct
, 10) << "renew_caps()" << dendl
;
6217 last_cap_renew
= ceph_clock_now();
6219 for (auto &p
: mds_sessions
) {
6220 ldout(cct
, 15) << "renew_caps requesting from mds." << p
.first
<< dendl
;
6221 if (mdsmap
->get_state(p
.first
) >= MDSMap::STATE_REJOIN
)
6222 renew_caps(&p
.second
);
6226 void Client::renew_caps(MetaSession
*session
)
6228 ldout(cct
, 10) << "renew_caps mds." << session
->mds_num
<< dendl
;
6229 session
->last_cap_renew_request
= ceph_clock_now();
6230 uint64_t seq
= ++session
->cap_renew_seq
;
6231 session
->con
->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_RENEWCAPS
, seq
));
6235 // ===============================================================
6236 // high level (POSIXy) interface
6238 int Client::_do_lookup(Inode
*dir
, const string
& name
, int mask
,
6239 InodeRef
*target
, const UserPerm
& perms
)
6241 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_LOOKUPSNAP
: CEPH_MDS_OP_LOOKUP
;
6242 MetaRequest
*req
= new MetaRequest(op
);
6244 dir
->make_nosnap_relative_path(path
);
6245 path
.push_dentry(name
);
6246 req
->set_filepath(path
);
6247 req
->set_inode(dir
);
6248 if (cct
->_conf
->client_debug_getattr_caps
&& op
== CEPH_MDS_OP_LOOKUP
)
6249 mask
|= DEBUG_GETATTR_CAPS
;
6250 req
->head
.args
.getattr
.mask
= mask
;
6252 ldout(cct
, 10) << __func__
<< " on " << path
<< dendl
;
6254 int r
= make_request(req
, perms
, target
);
6255 ldout(cct
, 10) << __func__
<< " res is " << r
<< dendl
;
6259 int Client::_lookup(Inode
*dir
, const string
& dname
, int mask
, InodeRef
*target
,
6260 const UserPerm
& perms
)
6265 if (dname
== "..") {
6266 if (dir
->dentries
.empty()) {
6267 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
6268 filepath
path(dir
->ino
);
6269 req
->set_filepath(path
);
6272 int r
= make_request(req
, perms
, &tmptarget
, NULL
, rand() % mdsmap
->get_num_in_mds());
6275 Inode
*tempino
= tmptarget
.get();
6278 ldout(cct
, 8) << __func__
<< " found target " << (*target
)->ino
<< dendl
;
6284 *target
= dir
->get_first_parent()->dir
->parent_inode
; //dirs can't be hard-linked
6293 if (!dir
->is_dir()) {
6298 if (dname
.length() > NAME_MAX
) {
6303 if (dname
== cct
->_conf
->client_snapdir
&&
6304 dir
->snapid
== CEPH_NOSNAP
) {
6305 *target
= open_snapdir(dir
);
6310 dir
->dir
->dentries
.count(dname
)) {
6311 dn
= dir
->dir
->dentries
[dname
];
6313 ldout(cct
, 20) << __func__
<< " have dn " << dname
<< " mds." << dn
->lease_mds
<< " ttl " << dn
->lease_ttl
6314 << " seq " << dn
->lease_seq
6317 if (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)) {
6318 // is dn lease valid?
6319 utime_t now
= ceph_clock_now();
6320 if (dn
->lease_mds
>= 0 &&
6321 dn
->lease_ttl
> now
&&
6322 mds_sessions
.count(dn
->lease_mds
)) {
6323 MetaSession
&s
= mds_sessions
.at(dn
->lease_mds
);
6324 if (s
.cap_ttl
> now
&&
6325 s
.cap_gen
== dn
->lease_gen
) {
6326 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6327 // make trim_caps() behave.
6328 dir
->try_touch_cap(dn
->lease_mds
);
6331 ldout(cct
, 20) << " bad lease, cap_ttl " << s
.cap_ttl
<< ", cap_gen " << s
.cap_gen
6332 << " vs lease_gen " << dn
->lease_gen
<< dendl
;
6335 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
6336 if (dn
->cap_shared_gen
== dir
->shared_gen
&&
6337 (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)))
6339 if (!dn
->inode
&& (dir
->flags
& I_COMPLETE
)) {
6340 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for "
6341 << *dir
<< " dn '" << dname
<< "'" << dendl
;
6346 ldout(cct
, 20) << " no cap on " << dn
->inode
->vino() << dendl
;
6349 // can we conclude ENOENT locally?
6350 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true) &&
6351 (dir
->flags
& I_COMPLETE
)) {
6352 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for " << *dir
<< " dn '" << dname
<< "'" << dendl
;
6357 r
= _do_lookup(dir
, dname
, mask
, target
, perms
);
6362 *target
= dn
->inode
;
6370 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << r
<< dendl
;
6372 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << **target
<< dendl
;
6376 int Client::get_or_create(Inode
*dir
, const char* name
,
6377 Dentry
**pdn
, bool expect_null
)
6380 ldout(cct
, 20) << __func__
<< " " << *dir
<< " name " << name
<< dendl
;
6382 if (dir
->dir
->dentries
.count(name
)) {
6383 Dentry
*dn
= dir
->dir
->dentries
[name
];
6385 // is dn lease valid?
6386 utime_t now
= ceph_clock_now();
6388 dn
->lease_mds
>= 0 &&
6389 dn
->lease_ttl
> now
&&
6390 mds_sessions
.count(dn
->lease_mds
)) {
6391 MetaSession
&s
= mds_sessions
.at(dn
->lease_mds
);
6392 if (s
.cap_ttl
> now
&&
6393 s
.cap_gen
== dn
->lease_gen
) {
6400 // otherwise link up a new one
6401 *pdn
= link(dir
->dir
, name
, NULL
, NULL
);
6408 int Client::path_walk(const filepath
& origpath
, InodeRef
*end
,
6409 const UserPerm
& perms
, bool followsym
, int mask
)
6411 filepath path
= origpath
;
6413 if (origpath
.absolute())
6419 ldout(cct
, 10) << __func__
<< " " << path
<< dendl
;
6424 while (i
< path
.depth() && cur
) {
6426 const string
&dname
= path
[i
];
6427 ldout(cct
, 10) << " " << i
<< " " << *cur
<< " " << dname
<< dendl
;
6428 ldout(cct
, 20) << " (path is " << path
<< ")" << dendl
;
6430 if (cct
->_conf
->client_permissions
) {
6431 int r
= may_lookup(cur
.get(), perms
);
6434 caps
= CEPH_CAP_AUTH_SHARED
;
6437 /* Get extra requested caps on the last component */
6438 if (i
== (path
.depth() - 1))
6440 int r
= _lookup(cur
.get(), dname
, caps
, &next
, perms
);
6443 // only follow trailing symlink if followsym. always follow
6444 // 'directory' symlinks.
6445 if (next
&& next
->is_symlink()) {
6447 ldout(cct
, 20) << " symlink count " << symlinks
<< ", value is '" << next
->symlink
<< "'" << dendl
;
6448 if (symlinks
> MAXSYMLINKS
) {
6452 if (i
< path
.depth() - 1) {
6454 // replace consumed components of path with symlink dir target
6455 filepath
resolved(next
->symlink
.c_str());
6456 resolved
.append(path
.postfixpath(i
+ 1));
6459 if (next
->symlink
[0] == '/') {
6463 } else if (followsym
) {
6464 if (next
->symlink
[0] == '/') {
6465 path
= next
->symlink
.c_str();
6470 filepath
more(next
->symlink
.c_str());
6471 // we need to remove the symlink component from off of the path
6472 // before adding the target that the symlink points to. remain
6473 // at the same position in the path.
6493 int Client::link(const char *relexisting
, const char *relpath
, const UserPerm
& perm
)
6495 std::lock_guard
lock(client_lock
);
6496 tout(cct
) << "link" << std::endl
;
6497 tout(cct
) << relexisting
<< std::endl
;
6498 tout(cct
) << relpath
<< std::endl
;
6503 filepath
existing(relexisting
);
6506 int r
= path_walk(existing
, &in
, perm
, true);
6509 if (std::string(relpath
) == "/") {
6513 filepath
path(relpath
);
6514 string name
= path
.last_dentry();
6517 r
= path_walk(path
, &dir
, perm
, true);
6520 if (cct
->_conf
->client_permissions
) {
6521 if (S_ISDIR(in
->mode
)) {
6525 r
= may_hardlink(in
.get(), perm
);
6528 r
= may_create(dir
.get(), perm
);
6532 r
= _link(in
.get(), dir
.get(), name
.c_str(), perm
);
6536 int Client::unlink(const char *relpath
, const UserPerm
& perm
)
6538 std::lock_guard
lock(client_lock
);
6539 tout(cct
) << __func__
<< std::endl
;
6540 tout(cct
) << relpath
<< std::endl
;
6545 if (std::string(relpath
) == "/")
6548 filepath
path(relpath
);
6549 string name
= path
.last_dentry();
6552 int r
= path_walk(path
, &dir
, perm
);
6555 if (cct
->_conf
->client_permissions
) {
6556 r
= may_delete(dir
.get(), name
.c_str(), perm
);
6560 return _unlink(dir
.get(), name
.c_str(), perm
);
6563 int Client::rename(const char *relfrom
, const char *relto
, const UserPerm
& perm
)
6565 std::lock_guard
lock(client_lock
);
6566 tout(cct
) << __func__
<< std::endl
;
6567 tout(cct
) << relfrom
<< std::endl
;
6568 tout(cct
) << relto
<< std::endl
;
6573 if (std::string(relfrom
) == "/" || std::string(relto
) == "/")
6576 filepath
from(relfrom
);
6578 string fromname
= from
.last_dentry();
6580 string toname
= to
.last_dentry();
6583 InodeRef fromdir
, todir
;
6584 int r
= path_walk(from
, &fromdir
, perm
);
6587 r
= path_walk(to
, &todir
, perm
);
6591 if (cct
->_conf
->client_permissions
) {
6592 int r
= may_delete(fromdir
.get(), fromname
.c_str(), perm
);
6595 r
= may_delete(todir
.get(), toname
.c_str(), perm
);
6596 if (r
< 0 && r
!= -ENOENT
)
6599 r
= _rename(fromdir
.get(), fromname
.c_str(), todir
.get(), toname
.c_str(), perm
);
6606 int Client::mkdir(const char *relpath
, mode_t mode
, const UserPerm
& perm
)
6608 std::lock_guard
lock(client_lock
);
6609 tout(cct
) << __func__
<< std::endl
;
6610 tout(cct
) << relpath
<< std::endl
;
6611 tout(cct
) << mode
<< std::endl
;
6612 ldout(cct
, 10) << __func__
<< ": " << relpath
<< dendl
;
6617 if (std::string(relpath
) == "/")
6620 filepath
path(relpath
);
6621 string name
= path
.last_dentry();
6624 int r
= path_walk(path
, &dir
, perm
);
6627 if (cct
->_conf
->client_permissions
) {
6628 r
= may_create(dir
.get(), perm
);
6632 return _mkdir(dir
.get(), name
.c_str(), mode
, perm
);
6635 int Client::mkdirs(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
6637 std::lock_guard
lock(client_lock
);
6638 ldout(cct
, 10) << "Client::mkdirs " << relpath
<< dendl
;
6639 tout(cct
) << __func__
<< std::endl
;
6640 tout(cct
) << relpath
<< std::endl
;
6641 tout(cct
) << mode
<< std::endl
;
6646 //get through existing parts of path
6647 filepath
path(relpath
);
6649 int r
= 0, caps
= 0;
6652 for (i
=0; i
<path
.depth(); ++i
) {
6653 if (cct
->_conf
->client_permissions
) {
6654 r
= may_lookup(cur
.get(), perms
);
6657 caps
= CEPH_CAP_AUTH_SHARED
;
6659 r
= _lookup(cur
.get(), path
[i
].c_str(), caps
, &next
, perms
);
6664 if (r
!=-ENOENT
) return r
;
6665 ldout(cct
, 20) << __func__
<< " got through " << i
<< " directories on path " << relpath
<< dendl
;
6666 //make new directory at each level
6667 for (; i
<path
.depth(); ++i
) {
6668 if (cct
->_conf
->client_permissions
) {
6669 r
= may_create(cur
.get(), perms
);
6674 r
= _mkdir(cur
.get(), path
[i
].c_str(), mode
, perms
, &next
);
6676 //check proper creation/existence
6677 if(-EEXIST
== r
&& i
< path
.depth() - 1) {
6678 r
= _lookup(cur
.get(), path
[i
].c_str(), CEPH_CAP_AUTH_SHARED
, &next
, perms
);
6682 //move to new dir and continue
6684 ldout(cct
, 20) << __func__
<< ": successfully created directory "
6685 << filepath(cur
->ino
).get_path() << dendl
;
6690 int Client::rmdir(const char *relpath
, const UserPerm
& perms
)
6692 std::lock_guard
lock(client_lock
);
6693 tout(cct
) << __func__
<< std::endl
;
6694 tout(cct
) << relpath
<< std::endl
;
6699 if (std::string(relpath
) == "/")
6702 filepath
path(relpath
);
6703 string name
= path
.last_dentry();
6706 int r
= path_walk(path
, &dir
, perms
);
6709 if (cct
->_conf
->client_permissions
) {
6710 int r
= may_delete(dir
.get(), name
.c_str(), perms
);
6714 return _rmdir(dir
.get(), name
.c_str(), perms
);
6717 int Client::mknod(const char *relpath
, mode_t mode
, const UserPerm
& perms
, dev_t rdev
)
6719 std::lock_guard
lock(client_lock
);
6720 tout(cct
) << __func__
<< std::endl
;
6721 tout(cct
) << relpath
<< std::endl
;
6722 tout(cct
) << mode
<< std::endl
;
6723 tout(cct
) << rdev
<< std::endl
;
6728 if (std::string(relpath
) == "/")
6731 filepath
path(relpath
);
6732 string name
= path
.last_dentry();
6735 int r
= path_walk(path
, &dir
, perms
);
6738 if (cct
->_conf
->client_permissions
) {
6739 int r
= may_create(dir
.get(), perms
);
6743 return _mknod(dir
.get(), name
.c_str(), mode
, rdev
, perms
);
6748 int Client::symlink(const char *target
, const char *relpath
, const UserPerm
& perms
)
6750 std::lock_guard
lock(client_lock
);
6751 tout(cct
) << __func__
<< std::endl
;
6752 tout(cct
) << target
<< std::endl
;
6753 tout(cct
) << relpath
<< std::endl
;
6758 if (std::string(relpath
) == "/")
6761 filepath
path(relpath
);
6762 string name
= path
.last_dentry();
6765 int r
= path_walk(path
, &dir
, perms
);
6768 if (cct
->_conf
->client_permissions
) {
6769 int r
= may_create(dir
.get(), perms
);
6773 return _symlink(dir
.get(), name
.c_str(), target
, perms
);
6776 int Client::readlink(const char *relpath
, char *buf
, loff_t size
, const UserPerm
& perms
)
6778 std::lock_guard
lock(client_lock
);
6779 tout(cct
) << __func__
<< std::endl
;
6780 tout(cct
) << relpath
<< std::endl
;
6785 filepath
path(relpath
);
6787 int r
= path_walk(path
, &in
, perms
, false);
6791 return _readlink(in
.get(), buf
, size
);
6794 int Client::_readlink(Inode
*in
, char *buf
, size_t size
)
6796 if (!in
->is_symlink())
6799 // copy into buf (at most size bytes)
6800 int r
= in
->symlink
.length();
6803 memcpy(buf
, in
->symlink
.c_str(), r
);
6810 int Client::_getattr(Inode
*in
, int mask
, const UserPerm
& perms
, bool force
)
6812 bool yes
= in
->caps_issued_mask(mask
, true);
6814 ldout(cct
, 10) << __func__
<< " mask " << ccap_string(mask
) << " issued=" << yes
<< dendl
;
6818 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
6820 in
->make_nosnap_relative_path(path
);
6821 req
->set_filepath(path
);
6823 req
->head
.args
.getattr
.mask
= mask
;
6825 int res
= make_request(req
, perms
);
6826 ldout(cct
, 10) << __func__
<< " result=" << res
<< dendl
;
6830 int Client::_do_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
6831 const UserPerm
& perms
, InodeRef
*inp
)
6833 int issued
= in
->caps_issued();
6835 ldout(cct
, 10) << __func__
<< " mask " << mask
<< " issued " <<
6836 ccap_string(issued
) << dendl
;
6838 if (in
->snapid
!= CEPH_NOSNAP
) {
6841 if ((mask
& CEPH_SETATTR_SIZE
) &&
6842 (unsigned long)stx
->stx_size
> in
->size
&&
6843 is_quota_bytes_exceeded(in
, (unsigned long)stx
->stx_size
- in
->size
,
6848 // make the change locally?
6849 if ((in
->cap_dirtier_uid
>= 0 && perms
.uid() != in
->cap_dirtier_uid
) ||
6850 (in
->cap_dirtier_gid
>= 0 && perms
.gid() != in
->cap_dirtier_gid
)) {
6851 ldout(cct
, 10) << __func__
<< " caller " << perms
.uid() << ":" << perms
.gid()
6852 << " != cap dirtier " << in
->cap_dirtier_uid
<< ":"
6853 << in
->cap_dirtier_gid
<< ", forcing sync setattr"
6856 * This works because we implicitly flush the caps as part of the
6857 * request, so the cap update check will happen with the writeback
6858 * cap context, and then the setattr check will happen with the
6861 * In reality this pattern is likely pretty rare (different users
6862 * setattr'ing the same file). If that turns out not to be the
6863 * case later, we can build a more complex pipelined cap writeback
6867 mask
|= CEPH_SETATTR_CTIME
;
6872 // caller just needs us to bump the ctime
6873 in
->ctime
= ceph_clock_now();
6874 in
->cap_dirtier_uid
= perms
.uid();
6875 in
->cap_dirtier_gid
= perms
.gid();
6876 if (issued
& CEPH_CAP_AUTH_EXCL
)
6877 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6878 else if (issued
& CEPH_CAP_FILE_EXCL
)
6879 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
6880 else if (issued
& CEPH_CAP_XATTR_EXCL
)
6881 in
->mark_caps_dirty(CEPH_CAP_XATTR_EXCL
);
6883 mask
|= CEPH_SETATTR_CTIME
;
6886 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
6887 bool kill_sguid
= mask
& (CEPH_SETATTR_SIZE
|CEPH_SETATTR_KILL_SGUID
);
6889 mask
&= ~CEPH_SETATTR_KILL_SGUID
;
6891 if (mask
& CEPH_SETATTR_UID
) {
6892 in
->ctime
= ceph_clock_now();
6893 in
->cap_dirtier_uid
= perms
.uid();
6894 in
->cap_dirtier_gid
= perms
.gid();
6895 in
->uid
= stx
->stx_uid
;
6896 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6897 mask
&= ~CEPH_SETATTR_UID
;
6899 ldout(cct
,10) << "changing uid to " << stx
->stx_uid
<< dendl
;
6901 if (mask
& CEPH_SETATTR_GID
) {
6902 in
->ctime
= ceph_clock_now();
6903 in
->cap_dirtier_uid
= perms
.uid();
6904 in
->cap_dirtier_gid
= perms
.gid();
6905 in
->gid
= stx
->stx_gid
;
6906 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6907 mask
&= ~CEPH_SETATTR_GID
;
6909 ldout(cct
,10) << "changing gid to " << stx
->stx_gid
<< dendl
;
6912 if (mask
& CEPH_SETATTR_MODE
) {
6913 in
->ctime
= ceph_clock_now();
6914 in
->cap_dirtier_uid
= perms
.uid();
6915 in
->cap_dirtier_gid
= perms
.gid();
6916 in
->mode
= (in
->mode
& ~07777) | (stx
->stx_mode
& 07777);
6917 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6918 mask
&= ~CEPH_SETATTR_MODE
;
6919 ldout(cct
,10) << "changing mode to " << stx
->stx_mode
<< dendl
;
6920 } else if (kill_sguid
&& S_ISREG(in
->mode
) && (in
->mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
6921 /* Must squash the any setuid/setgid bits with an ownership change */
6922 in
->mode
&= ~(S_ISUID
|S_ISGID
);
6923 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6926 if (mask
& CEPH_SETATTR_BTIME
) {
6927 in
->ctime
= ceph_clock_now();
6928 in
->cap_dirtier_uid
= perms
.uid();
6929 in
->cap_dirtier_gid
= perms
.gid();
6930 in
->btime
= utime_t(stx
->stx_btime
);
6931 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6932 mask
&= ~CEPH_SETATTR_BTIME
;
6933 ldout(cct
,10) << "changing btime to " << in
->btime
<< dendl
;
6935 } else if (mask
& CEPH_SETATTR_SIZE
) {
6936 /* If we don't have Ax, then we must ask the server to clear them on truncate */
6937 mask
|= CEPH_SETATTR_KILL_SGUID
;
6940 if (in
->caps_issued_mask(CEPH_CAP_FILE_EXCL
)) {
6941 if (mask
& (CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
)) {
6942 if (mask
& CEPH_SETATTR_MTIME
)
6943 in
->mtime
= utime_t(stx
->stx_mtime
);
6944 if (mask
& CEPH_SETATTR_ATIME
)
6945 in
->atime
= utime_t(stx
->stx_atime
);
6946 in
->ctime
= ceph_clock_now();
6947 in
->cap_dirtier_uid
= perms
.uid();
6948 in
->cap_dirtier_gid
= perms
.gid();
6949 in
->time_warp_seq
++;
6950 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
6951 mask
&= ~(CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
);
6960 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETATTR
);
6964 in
->make_nosnap_relative_path(path
);
6965 req
->set_filepath(path
);
6968 if (mask
& CEPH_SETATTR_KILL_SGUID
) {
6969 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
6971 if (mask
& CEPH_SETATTR_MODE
) {
6972 req
->head
.args
.setattr
.mode
= stx
->stx_mode
;
6973 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
6974 ldout(cct
,10) << "changing mode to " << stx
->stx_mode
<< dendl
;
6976 if (mask
& CEPH_SETATTR_UID
) {
6977 req
->head
.args
.setattr
.uid
= stx
->stx_uid
;
6978 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
6979 ldout(cct
,10) << "changing uid to " << stx
->stx_uid
<< dendl
;
6981 if (mask
& CEPH_SETATTR_GID
) {
6982 req
->head
.args
.setattr
.gid
= stx
->stx_gid
;
6983 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
6984 ldout(cct
,10) << "changing gid to " << stx
->stx_gid
<< dendl
;
6986 if (mask
& CEPH_SETATTR_BTIME
) {
6987 req
->head
.args
.setattr
.btime
= utime_t(stx
->stx_btime
);
6988 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
6990 if (mask
& CEPH_SETATTR_MTIME
) {
6991 req
->head
.args
.setattr
.mtime
= utime_t(stx
->stx_mtime
);
6992 req
->inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
6995 if (mask
& CEPH_SETATTR_ATIME
) {
6996 req
->head
.args
.setattr
.atime
= utime_t(stx
->stx_atime
);
6997 req
->inode_drop
|= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_RD
|
7000 if (mask
& CEPH_SETATTR_SIZE
) {
7001 if ((unsigned long)stx
->stx_size
< mdsmap
->get_max_filesize()) {
7002 req
->head
.args
.setattr
.size
= stx
->stx_size
;
7003 ldout(cct
,10) << "changing size to " << stx
->stx_size
<< dendl
;
7006 ldout(cct
,10) << "unable to set size to " << stx
->stx_size
<< ". Too large!" << dendl
;
7009 req
->inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
7012 req
->head
.args
.setattr
.mask
= mask
;
7014 req
->regetattr_mask
= mask
;
7016 int res
= make_request(req
, perms
, inp
);
7017 ldout(cct
, 10) << "_setattr result=" << res
<< dendl
;
7021 /* Note that we only care about attrs that setattr cares about */
7022 void Client::stat_to_statx(struct stat
*st
, struct ceph_statx
*stx
)
7024 stx
->stx_size
= st
->st_size
;
7025 stx
->stx_mode
= st
->st_mode
;
7026 stx
->stx_uid
= st
->st_uid
;
7027 stx
->stx_gid
= st
->st_gid
;
7029 stx
->stx_mtime
= st
->st_mtimespec
;
7030 stx
->stx_atime
= st
->st_atimespec
;
7032 stx
->stx_mtime
= st
->st_mtim
;
7033 stx
->stx_atime
= st
->st_atim
;
7037 int Client::__setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
7038 const UserPerm
& perms
, InodeRef
*inp
)
7040 int ret
= _do_setattr(in
, stx
, mask
, perms
, inp
);
7043 if (mask
& CEPH_SETATTR_MODE
)
7044 ret
= _posix_acl_chmod(in
, stx
->stx_mode
, perms
);
7048 int Client::_setattrx(InodeRef
&in
, struct ceph_statx
*stx
, int mask
,
7049 const UserPerm
& perms
)
7051 mask
&= (CEPH_SETATTR_MODE
| CEPH_SETATTR_UID
|
7052 CEPH_SETATTR_GID
| CEPH_SETATTR_MTIME
|
7053 CEPH_SETATTR_ATIME
| CEPH_SETATTR_SIZE
|
7054 CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
);
7055 if (cct
->_conf
->client_permissions
) {
7056 int r
= may_setattr(in
.get(), stx
, mask
, perms
);
7060 return __setattrx(in
.get(), stx
, mask
, perms
);
7063 int Client::_setattr(InodeRef
&in
, struct stat
*attr
, int mask
,
7064 const UserPerm
& perms
)
7066 struct ceph_statx stx
;
7068 stat_to_statx(attr
, &stx
);
7069 mask
&= ~CEPH_SETATTR_BTIME
;
7071 if ((mask
& CEPH_SETATTR_UID
) && attr
->st_uid
== static_cast<uid_t
>(-1)) {
7072 mask
&= ~CEPH_SETATTR_UID
;
7074 if ((mask
& CEPH_SETATTR_GID
) && attr
->st_gid
== static_cast<uid_t
>(-1)) {
7075 mask
&= ~CEPH_SETATTR_GID
;
7078 return _setattrx(in
, &stx
, mask
, perms
);
7081 int Client::setattr(const char *relpath
, struct stat
*attr
, int mask
,
7082 const UserPerm
& perms
)
7084 std::lock_guard
lock(client_lock
);
7085 tout(cct
) << __func__
<< std::endl
;
7086 tout(cct
) << relpath
<< std::endl
;
7087 tout(cct
) << mask
<< std::endl
;
7092 filepath
path(relpath
);
7094 int r
= path_walk(path
, &in
, perms
);
7097 return _setattr(in
, attr
, mask
, perms
);
7100 int Client::setattrx(const char *relpath
, struct ceph_statx
*stx
, int mask
,
7101 const UserPerm
& perms
, int flags
)
7103 std::lock_guard
lock(client_lock
);
7104 tout(cct
) << __func__
<< std::endl
;
7105 tout(cct
) << relpath
<< std::endl
;
7106 tout(cct
) << mask
<< std::endl
;
7111 filepath
path(relpath
);
7113 int r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
));
7116 return _setattrx(in
, stx
, mask
, perms
);
7119 int Client::fsetattr(int fd
, struct stat
*attr
, int mask
, const UserPerm
& perms
)
7121 std::lock_guard
lock(client_lock
);
7122 tout(cct
) << __func__
<< std::endl
;
7123 tout(cct
) << fd
<< std::endl
;
7124 tout(cct
) << mask
<< std::endl
;
7129 Fh
*f
= get_filehandle(fd
);
7132 #if defined(__linux__) && defined(O_PATH)
7133 if (f
->flags
& O_PATH
)
7136 return _setattr(f
->inode
, attr
, mask
, perms
);
7139 int Client::fsetattrx(int fd
, struct ceph_statx
*stx
, int mask
, const UserPerm
& perms
)
7141 std::lock_guard
lock(client_lock
);
7142 tout(cct
) << __func__
<< std::endl
;
7143 tout(cct
) << fd
<< std::endl
;
7144 tout(cct
) << mask
<< std::endl
;
7149 Fh
*f
= get_filehandle(fd
);
7152 #if defined(__linux__) && defined(O_PATH)
7153 if (f
->flags
& O_PATH
)
7156 return _setattrx(f
->inode
, stx
, mask
, perms
);
7159 int Client::stat(const char *relpath
, struct stat
*stbuf
, const UserPerm
& perms
,
7160 frag_info_t
*dirstat
, int mask
)
7162 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7163 std::lock_guard
lock(client_lock
);
7164 tout(cct
) << "stat" << std::endl
;
7165 tout(cct
) << relpath
<< std::endl
;
7170 filepath
path(relpath
);
7172 int r
= path_walk(path
, &in
, perms
, true, mask
);
7175 r
= _getattr(in
, mask
, perms
);
7177 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7180 fill_stat(in
, stbuf
, dirstat
);
7181 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7185 unsigned Client::statx_to_mask(unsigned int flags
, unsigned int want
)
7189 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7190 if (flags
& AT_NO_ATTR_SYNC
)
7193 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7194 mask
|= CEPH_CAP_PIN
;
7195 if (want
& (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
7196 mask
|= CEPH_CAP_AUTH_SHARED
;
7197 if (want
& (CEPH_STATX_NLINK
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
7198 mask
|= CEPH_CAP_LINK_SHARED
;
7199 if (want
& (CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|CEPH_STATX_CTIME
|CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
|CEPH_STATX_VERSION
))
7200 mask
|= CEPH_CAP_FILE_SHARED
;
7201 if (want
& (CEPH_STATX_VERSION
|CEPH_STATX_CTIME
))
7202 mask
|= CEPH_CAP_XATTR_SHARED
;
7207 int Client::statx(const char *relpath
, struct ceph_statx
*stx
,
7208 const UserPerm
& perms
,
7209 unsigned int want
, unsigned int flags
)
7211 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " want " << want
<< ")" << dendl
;
7212 std::lock_guard
lock(client_lock
);
7213 tout(cct
) << "statx" << std::endl
;
7214 tout(cct
) << relpath
<< std::endl
;
7219 filepath
path(relpath
);
7222 unsigned mask
= statx_to_mask(flags
, want
);
7224 int r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
);
7228 r
= _getattr(in
, mask
, perms
);
7230 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7234 fill_statx(in
, mask
, stx
);
7235 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << stx
->stx_mask
<< ")" << dendl
;
7239 int Client::lstat(const char *relpath
, struct stat
*stbuf
,
7240 const UserPerm
& perms
, frag_info_t
*dirstat
, int mask
)
7242 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7243 std::lock_guard
lock(client_lock
);
7244 tout(cct
) << __func__
<< std::endl
;
7245 tout(cct
) << relpath
<< std::endl
;
7250 filepath
path(relpath
);
7252 // don't follow symlinks
7253 int r
= path_walk(path
, &in
, perms
, false, mask
);
7256 r
= _getattr(in
, mask
, perms
);
7258 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7261 fill_stat(in
, stbuf
, dirstat
);
7262 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7266 int Client::fill_stat(Inode
*in
, struct stat
*st
, frag_info_t
*dirstat
, nest_info_t
*rstat
)
7268 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
7269 << " mode 0" << oct
<< in
->mode
<< dec
7270 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
7271 memset(st
, 0, sizeof(struct stat
));
7272 if (use_faked_inos())
7273 st
->st_ino
= in
->faked_ino
;
7275 st
->st_ino
= in
->ino
;
7276 st
->st_dev
= in
->snapid
;
7277 st
->st_mode
= in
->mode
;
7278 st
->st_rdev
= in
->rdev
;
7280 switch (in
->nlink
) {
7282 st
->st_nlink
= 0; /* dir is unlinked */
7285 st
->st_nlink
= 1 /* parent dentry */
7287 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
7293 st
->st_nlink
= in
->nlink
;
7295 st
->st_uid
= in
->uid
;
7296 st
->st_gid
= in
->gid
;
7297 if (in
->ctime
> in
->mtime
) {
7298 stat_set_ctime_sec(st
, in
->ctime
.sec());
7299 stat_set_ctime_nsec(st
, in
->ctime
.nsec());
7301 stat_set_ctime_sec(st
, in
->mtime
.sec());
7302 stat_set_ctime_nsec(st
, in
->mtime
.nsec());
7304 stat_set_atime_sec(st
, in
->atime
.sec());
7305 stat_set_atime_nsec(st
, in
->atime
.nsec());
7306 stat_set_mtime_sec(st
, in
->mtime
.sec());
7307 stat_set_mtime_nsec(st
, in
->mtime
.nsec());
7309 if (cct
->_conf
->client_dirsize_rbytes
)
7310 st
->st_size
= in
->rstat
.rbytes
;
7312 st
->st_size
= in
->dirstat
.size();
7315 st
->st_size
= in
->size
;
7316 st
->st_blocks
= (in
->size
+ 511) >> 9;
7318 st
->st_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
7321 *dirstat
= in
->dirstat
;
7325 return in
->caps_issued();
7328 void Client::fill_statx(Inode
*in
, unsigned int mask
, struct ceph_statx
*stx
)
7330 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
7331 << " mode 0" << oct
<< in
->mode
<< dec
7332 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
7333 memset(stx
, 0, sizeof(struct ceph_statx
));
7336 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7337 * so that all bits are set.
7342 /* These are always considered to be available */
7343 stx
->stx_dev
= in
->snapid
;
7344 stx
->stx_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
7346 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7347 stx
->stx_mode
= S_IFMT
& in
->mode
;
7348 stx
->stx_ino
= use_faked_inos() ? in
->faked_ino
: (ino_t
)in
->ino
;
7349 stx
->stx_rdev
= in
->rdev
;
7350 stx
->stx_mask
|= (CEPH_STATX_INO
|CEPH_STATX_RDEV
);
7352 if (mask
& CEPH_CAP_AUTH_SHARED
) {
7353 stx
->stx_uid
= in
->uid
;
7354 stx
->stx_gid
= in
->gid
;
7355 stx
->stx_mode
= in
->mode
;
7356 in
->btime
.to_timespec(&stx
->stx_btime
);
7357 stx
->stx_mask
|= (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
);
7360 if (mask
& CEPH_CAP_LINK_SHARED
) {
7362 switch (in
->nlink
) {
7364 stx
->stx_nlink
= 0; /* dir is unlinked */
7367 stx
->stx_nlink
= 1 /* parent dentry */
7369 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
7375 stx
->stx_nlink
= in
->nlink
;
7377 stx
->stx_mask
|= CEPH_STATX_NLINK
;
7380 if (mask
& CEPH_CAP_FILE_SHARED
) {
7382 in
->atime
.to_timespec(&stx
->stx_atime
);
7383 in
->mtime
.to_timespec(&stx
->stx_mtime
);
7386 if (cct
->_conf
->client_dirsize_rbytes
)
7387 stx
->stx_size
= in
->rstat
.rbytes
;
7389 stx
->stx_size
= in
->dirstat
.size();
7390 stx
->stx_blocks
= 1;
7392 stx
->stx_size
= in
->size
;
7393 stx
->stx_blocks
= (in
->size
+ 511) >> 9;
7395 stx
->stx_mask
|= (CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|
7396 CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
);
7399 /* Change time and change_attr both require all shared caps to view */
7400 if ((mask
& CEPH_STAT_CAP_INODE_ALL
) == CEPH_STAT_CAP_INODE_ALL
) {
7401 stx
->stx_version
= in
->change_attr
;
7402 if (in
->ctime
> in
->mtime
)
7403 in
->ctime
.to_timespec(&stx
->stx_ctime
);
7405 in
->mtime
.to_timespec(&stx
->stx_ctime
);
7406 stx
->stx_mask
|= (CEPH_STATX_CTIME
|CEPH_STATX_VERSION
);
7411 void Client::touch_dn(Dentry
*dn
)
7416 int Client::chmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
7418 std::lock_guard
lock(client_lock
);
7419 tout(cct
) << __func__
<< std::endl
;
7420 tout(cct
) << relpath
<< std::endl
;
7421 tout(cct
) << mode
<< std::endl
;
7426 filepath
path(relpath
);
7428 int r
= path_walk(path
, &in
, perms
);
7432 attr
.st_mode
= mode
;
7433 return _setattr(in
, &attr
, CEPH_SETATTR_MODE
, perms
);
7436 int Client::fchmod(int fd
, mode_t mode
, const UserPerm
& perms
)
7438 std::lock_guard
lock(client_lock
);
7439 tout(cct
) << __func__
<< std::endl
;
7440 tout(cct
) << fd
<< std::endl
;
7441 tout(cct
) << mode
<< std::endl
;
7446 Fh
*f
= get_filehandle(fd
);
7449 #if defined(__linux__) && defined(O_PATH)
7450 if (f
->flags
& O_PATH
)
7454 attr
.st_mode
= mode
;
7455 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MODE
, perms
);
7458 int Client::lchmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
7460 std::lock_guard
lock(client_lock
);
7461 tout(cct
) << __func__
<< std::endl
;
7462 tout(cct
) << relpath
<< std::endl
;
7463 tout(cct
) << mode
<< std::endl
;
7468 filepath
path(relpath
);
7470 // don't follow symlinks
7471 int r
= path_walk(path
, &in
, perms
, false);
7475 attr
.st_mode
= mode
;
7476 return _setattr(in
, &attr
, CEPH_SETATTR_MODE
, perms
);
7479 int Client::chown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
7480 const UserPerm
& perms
)
7482 std::lock_guard
lock(client_lock
);
7483 tout(cct
) << __func__
<< std::endl
;
7484 tout(cct
) << relpath
<< std::endl
;
7485 tout(cct
) << new_uid
<< std::endl
;
7486 tout(cct
) << new_gid
<< std::endl
;
7491 filepath
path(relpath
);
7493 int r
= path_walk(path
, &in
, perms
);
7497 attr
.st_uid
= new_uid
;
7498 attr
.st_gid
= new_gid
;
7499 return _setattr(in
, &attr
, CEPH_SETATTR_UID
|CEPH_SETATTR_GID
, perms
);
7502 int Client::fchown(int fd
, uid_t new_uid
, gid_t new_gid
, const UserPerm
& perms
)
7504 std::lock_guard
lock(client_lock
);
7505 tout(cct
) << __func__
<< std::endl
;
7506 tout(cct
) << fd
<< std::endl
;
7507 tout(cct
) << new_uid
<< std::endl
;
7508 tout(cct
) << new_gid
<< std::endl
;
7513 Fh
*f
= get_filehandle(fd
);
7516 #if defined(__linux__) && defined(O_PATH)
7517 if (f
->flags
& O_PATH
)
7521 attr
.st_uid
= new_uid
;
7522 attr
.st_gid
= new_gid
;
7524 if (new_uid
!= static_cast<uid_t
>(-1)) mask
|= CEPH_SETATTR_UID
;
7525 if (new_gid
!= static_cast<gid_t
>(-1)) mask
|= CEPH_SETATTR_GID
;
7526 return _setattr(f
->inode
, &attr
, mask
, perms
);
7529 int Client::lchown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
7530 const UserPerm
& perms
)
7532 std::lock_guard
lock(client_lock
);
7533 tout(cct
) << __func__
<< std::endl
;
7534 tout(cct
) << relpath
<< std::endl
;
7535 tout(cct
) << new_uid
<< std::endl
;
7536 tout(cct
) << new_gid
<< std::endl
;
7541 filepath
path(relpath
);
7543 // don't follow symlinks
7544 int r
= path_walk(path
, &in
, perms
, false);
7548 attr
.st_uid
= new_uid
;
7549 attr
.st_gid
= new_gid
;
7551 if (new_uid
!= static_cast<uid_t
>(-1)) mask
|= CEPH_SETATTR_UID
;
7552 if (new_gid
!= static_cast<gid_t
>(-1)) mask
|= CEPH_SETATTR_GID
;
7553 return _setattr(in
, &attr
, mask
, perms
);
7556 static void attr_set_atime_and_mtime(struct stat
*attr
,
7557 const utime_t
&atime
,
7558 const utime_t
&mtime
)
7560 stat_set_atime_sec(attr
, atime
.tv
.tv_sec
);
7561 stat_set_atime_nsec(attr
, atime
.tv
.tv_nsec
);
7562 stat_set_mtime_sec(attr
, mtime
.tv
.tv_sec
);
7563 stat_set_mtime_nsec(attr
, mtime
.tv
.tv_nsec
);
7566 // for [l]utime() invoke the timeval variant as the timespec
7567 // variant are not yet implemented. for futime[s](), invoke
7568 // the timespec variant.
7569 int Client::utime(const char *relpath
, struct utimbuf
*buf
,
7570 const UserPerm
& perms
)
7572 struct timeval tv
[2];
7573 tv
[0].tv_sec
= buf
->actime
;
7575 tv
[1].tv_sec
= buf
->modtime
;
7578 return utimes(relpath
, tv
, perms
);
7581 int Client::lutime(const char *relpath
, struct utimbuf
*buf
,
7582 const UserPerm
& perms
)
7584 struct timeval tv
[2];
7585 tv
[0].tv_sec
= buf
->actime
;
7587 tv
[1].tv_sec
= buf
->modtime
;
7590 return lutimes(relpath
, tv
, perms
);
7593 int Client::futime(int fd
, struct utimbuf
*buf
, const UserPerm
& perms
)
7595 struct timespec ts
[2];
7596 ts
[0].tv_sec
= buf
->actime
;
7598 ts
[1].tv_sec
= buf
->modtime
;
7601 return futimens(fd
, ts
, perms
);
7604 int Client::utimes(const char *relpath
, struct timeval times
[2],
7605 const UserPerm
& perms
)
7607 std::lock_guard
lock(client_lock
);
7608 tout(cct
) << __func__
<< std::endl
;
7609 tout(cct
) << relpath
<< std::endl
;
7610 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
7612 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
7618 filepath
path(relpath
);
7620 int r
= path_walk(path
, &in
, perms
);
7624 utime_t
atime(times
[0]);
7625 utime_t
mtime(times
[1]);
7627 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
7628 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
7631 int Client::lutimes(const char *relpath
, struct timeval times
[2],
7632 const UserPerm
& perms
)
7634 std::lock_guard
lock(client_lock
);
7635 tout(cct
) << __func__
<< std::endl
;
7636 tout(cct
) << relpath
<< std::endl
;
7637 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
7639 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
7645 filepath
path(relpath
);
7647 int r
= path_walk(path
, &in
, perms
, false);
7651 utime_t
atime(times
[0]);
7652 utime_t
mtime(times
[1]);
7654 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
7655 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
7658 int Client::futimes(int fd
, struct timeval times
[2], const UserPerm
& perms
)
7660 struct timespec ts
[2];
7661 ts
[0].tv_sec
= times
[0].tv_sec
;
7662 ts
[0].tv_nsec
= times
[0].tv_usec
* 1000;
7663 ts
[1].tv_sec
= times
[1].tv_sec
;
7664 ts
[1].tv_nsec
= times
[1].tv_usec
* 1000;
7666 return futimens(fd
, ts
, perms
);
7669 int Client::futimens(int fd
, struct timespec times
[2], const UserPerm
& perms
)
7671 std::lock_guard
lock(client_lock
);
7672 tout(cct
) << __func__
<< std::endl
;
7673 tout(cct
) << fd
<< std::endl
;
7674 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_nsec
7676 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_nsec
7682 Fh
*f
= get_filehandle(fd
);
7685 #if defined(__linux__) && defined(O_PATH)
7686 if (f
->flags
& O_PATH
)
7690 utime_t
atime(times
[0]);
7691 utime_t
mtime(times
[1]);
7693 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
7694 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
7697 int Client::flock(int fd
, int operation
, uint64_t owner
)
7699 std::lock_guard
lock(client_lock
);
7700 tout(cct
) << __func__
<< std::endl
;
7701 tout(cct
) << fd
<< std::endl
;
7702 tout(cct
) << operation
<< std::endl
;
7703 tout(cct
) << owner
<< std::endl
;
7708 Fh
*f
= get_filehandle(fd
);
7712 return _flock(f
, operation
, owner
);
7715 int Client::opendir(const char *relpath
, dir_result_t
**dirpp
, const UserPerm
& perms
)
7717 std::lock_guard
lock(client_lock
);
7718 tout(cct
) << __func__
<< std::endl
;
7719 tout(cct
) << relpath
<< std::endl
;
7724 filepath
path(relpath
);
7726 int r
= path_walk(path
, &in
, perms
, true);
7729 if (cct
->_conf
->client_permissions
) {
7730 int r
= may_open(in
.get(), O_RDONLY
, perms
);
7734 r
= _opendir(in
.get(), dirpp
, perms
);
7735 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7737 tout(cct
) << (unsigned long)*dirpp
<< std::endl
;
7741 int Client::_opendir(Inode
*in
, dir_result_t
**dirpp
, const UserPerm
& perms
)
7745 *dirpp
= new dir_result_t(in
, perms
);
7746 opened_dirs
.insert(*dirpp
);
7747 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ") = " << 0 << " (" << *dirpp
<< ")" << dendl
;
7752 int Client::closedir(dir_result_t
*dir
)
7754 std::lock_guard
lock(client_lock
);
7755 tout(cct
) << __func__
<< std::endl
;
7756 tout(cct
) << (unsigned long)dir
<< std::endl
;
7758 ldout(cct
, 3) << __func__
<< "(" << dir
<< ") = 0" << dendl
;
7763 void Client::_closedir(dir_result_t
*dirp
)
7765 ldout(cct
, 10) << __func__
<< "(" << dirp
<< ")" << dendl
;
7767 ldout(cct
, 10) << __func__
<< " detaching inode " << dirp
->inode
<< dendl
;
7768 dirp
->inode
.reset();
7770 _readdir_drop_dirp_buffer(dirp
);
7771 opened_dirs
.erase(dirp
);
7775 void Client::rewinddir(dir_result_t
*dirp
)
7777 std::lock_guard
lock(client_lock
);
7778 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ")" << dendl
;
7783 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
7784 _readdir_drop_dirp_buffer(d
);
7788 loff_t
Client::telldir(dir_result_t
*dirp
)
7790 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
7791 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ") = " << d
->offset
<< dendl
;
7795 void Client::seekdir(dir_result_t
*dirp
, loff_t offset
)
7797 std::lock_guard
lock(client_lock
);
7799 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ", " << offset
<< ")" << dendl
;
7804 if (offset
== dirp
->offset
)
7807 if (offset
> dirp
->offset
)
7808 dirp
->release_count
= 0; // bump if we do a forward seek
7810 dirp
->ordered_count
= 0; // disable filling readdir cache
7812 if (dirp
->hash_order()) {
7813 if (dirp
->offset
> offset
) {
7814 _readdir_drop_dirp_buffer(dirp
);
7819 dirp
->buffer_frag
!= frag_t(dir_result_t::fpos_high(offset
)) ||
7820 dirp
->offset_low() > dir_result_t::fpos_low(offset
)) {
7821 _readdir_drop_dirp_buffer(dirp
);
7826 dirp
->offset
= offset
;
7831 // ino_t d_ino; /* inode number */
7832 // off_t d_off; /* offset to the next dirent */
7833 // unsigned short d_reclen; /* length of this record */
7834 // unsigned char d_type; /* type of file */
7835 // char d_name[256]; /* filename */
7837 void Client::fill_dirent(struct dirent
*de
, const char *name
, int type
, uint64_t ino
, loff_t next_off
)
7839 strncpy(de
->d_name
, name
, 255);
7840 de
->d_name
[255] = '\0';
7843 #if !defined(__APPLE__) && !defined(__FreeBSD__)
7844 de
->d_off
= next_off
;
7847 de
->d_type
= IFTODT(type
);
7848 ldout(cct
, 10) << __func__
<< " '" << de
->d_name
<< "' -> " << inodeno_t(de
->d_ino
)
7849 << " type " << (int)de
->d_type
<< " w/ next_off " << hex
<< next_off
<< dec
<< dendl
;
7853 void Client::_readdir_next_frag(dir_result_t
*dirp
)
7855 frag_t fg
= dirp
->buffer_frag
;
7857 if (fg
.is_rightmost()) {
7858 ldout(cct
, 10) << __func__
<< " advance from " << fg
<< " to END" << dendl
;
7865 ldout(cct
, 10) << __func__
<< " advance from " << dirp
->buffer_frag
<< " to " << fg
<< dendl
;
7867 if (dirp
->hash_order()) {
7869 int64_t new_offset
= dir_result_t::make_fpos(fg
.value(), 2, true);
7870 if (dirp
->offset
< new_offset
) // don't decrease offset
7871 dirp
->offset
= new_offset
;
7873 dirp
->last_name
.clear();
7874 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
7875 _readdir_rechoose_frag(dirp
);
7879 void Client::_readdir_rechoose_frag(dir_result_t
*dirp
)
7881 ceph_assert(dirp
->inode
);
7883 if (dirp
->hash_order())
7886 frag_t cur
= frag_t(dirp
->offset_high());
7887 frag_t fg
= dirp
->inode
->dirfragtree
[cur
.value()];
7889 ldout(cct
, 10) << __func__
<< " frag " << cur
<< " maps to " << fg
<< dendl
;
7890 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
7891 dirp
->last_name
.clear();
7892 dirp
->next_offset
= 2;
7896 void Client::_readdir_drop_dirp_buffer(dir_result_t
*dirp
)
7898 ldout(cct
, 10) << __func__
<< " " << dirp
<< dendl
;
7899 dirp
->buffer
.clear();
7902 int Client::_readdir_get_frag(dir_result_t
*dirp
)
7905 ceph_assert(dirp
->inode
);
7907 // get the current frag.
7909 if (dirp
->hash_order())
7910 fg
= dirp
->inode
->dirfragtree
[dirp
->offset_high()];
7912 fg
= frag_t(dirp
->offset_high());
7914 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " fg " << fg
7915 << " offset " << hex
<< dirp
->offset
<< dec
<< dendl
;
7917 int op
= CEPH_MDS_OP_READDIR
;
7918 if (dirp
->inode
&& dirp
->inode
->snapid
== CEPH_SNAPDIR
)
7919 op
= CEPH_MDS_OP_LSSNAP
;
7921 InodeRef
& diri
= dirp
->inode
;
7923 MetaRequest
*req
= new MetaRequest(op
);
7925 diri
->make_nosnap_relative_path(path
);
7926 req
->set_filepath(path
);
7927 req
->set_inode(diri
.get());
7928 req
->head
.args
.readdir
.frag
= fg
;
7929 req
->head
.args
.readdir
.flags
= CEPH_READDIR_REPLY_BITFLAGS
;
7930 if (dirp
->last_name
.length()) {
7931 req
->path2
.set_path(dirp
->last_name
);
7932 } else if (dirp
->hash_order()) {
7933 req
->head
.args
.readdir
.offset_hash
= dirp
->offset_high();
7938 int res
= make_request(req
, dirp
->perms
, NULL
, NULL
, -1, &dirbl
);
7940 if (res
== -EAGAIN
) {
7941 ldout(cct
, 10) << __func__
<< " got EAGAIN, retrying" << dendl
;
7942 _readdir_rechoose_frag(dirp
);
7943 return _readdir_get_frag(dirp
);
7947 ldout(cct
, 10) << __func__
<< " " << dirp
<< " got frag " << dirp
->buffer_frag
7948 << " size " << dirp
->buffer
.size() << dendl
;
7950 ldout(cct
, 10) << __func__
<< " got error " << res
<< ", setting end flag" << dendl
;
7957 struct dentry_off_lt
{
7958 bool operator()(const Dentry
* dn
, int64_t off
) const {
7959 return dir_result_t::fpos_cmp(dn
->offset
, off
) < 0;
7963 int Client::_readdir_cache_cb(dir_result_t
*dirp
, add_dirent_cb_t cb
, void *p
,
7964 int caps
, bool getref
)
7966 ceph_assert(client_lock
.is_locked());
7967 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
7968 << " last_name " << dirp
->last_name
<< " offset " << hex
<< dirp
->offset
<< dec
7970 Dir
*dir
= dirp
->inode
->dir
;
7973 ldout(cct
, 10) << " dir is empty" << dendl
;
7978 vector
<Dentry
*>::iterator pd
= std::lower_bound(dir
->readdir_cache
.begin(),
7979 dir
->readdir_cache
.end(),
7980 dirp
->offset
, dentry_off_lt());
7984 if (!dirp
->inode
->is_complete_and_ordered())
7986 if (pd
== dir
->readdir_cache
.end())
7989 if (dn
->inode
== NULL
) {
7990 ldout(cct
, 15) << " skipping null '" << dn
->name
<< "'" << dendl
;
7994 if (dn
->cap_shared_gen
!= dir
->parent_inode
->shared_gen
) {
7995 ldout(cct
, 15) << " skipping mismatch shared gen '" << dn
->name
<< "'" << dendl
;
8000 int idx
= pd
- dir
->readdir_cache
.begin();
8001 int r
= _getattr(dn
->inode
, caps
, dirp
->perms
);
8005 // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
8006 pd
= dir
->readdir_cache
.begin() + idx
;
8007 if (pd
>= dir
->readdir_cache
.end() || *pd
!= dn
)
8010 struct ceph_statx stx
;
8012 fill_statx(dn
->inode
, caps
, &stx
);
8014 uint64_t next_off
= dn
->offset
+ 1;
8015 fill_dirent(&de
, dn
->name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
8017 if (pd
== dir
->readdir_cache
.end())
8018 next_off
= dir_result_t::END
;
8022 in
= dn
->inode
.get();
8026 dn_name
= dn
->name
; // fill in name while we have lock
8028 client_lock
.Unlock();
8029 r
= cb(p
, &de
, &stx
, next_off
, in
); // _next_ offset
8031 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< dn
->offset
<< dec
8032 << " = " << r
<< dendl
;
8037 dirp
->offset
= next_off
;
8039 dirp
->next_offset
= 2;
8041 dirp
->next_offset
= dirp
->offset_low();
8042 dirp
->last_name
= dn_name
; // we successfully returned this one; update!
8043 dirp
->release_count
= 0; // last_name no longer match cache index
8048 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " at end" << dendl
;
8053 int Client::readdir_r_cb(dir_result_t
*d
, add_dirent_cb_t cb
, void *p
,
8054 unsigned want
, unsigned flags
, bool getref
)
8056 int caps
= statx_to_mask(flags
, want
);
8058 std::lock_guard
lock(client_lock
);
8063 dir_result_t
*dirp
= static_cast<dir_result_t
*>(d
);
8065 ldout(cct
, 10) << __func__
<< " " << *dirp
->inode
<< " offset " << hex
<< dirp
->offset
8066 << dec
<< " at_end=" << dirp
->at_end()
8067 << " hash_order=" << dirp
->hash_order() << dendl
;
8070 struct ceph_statx stx
;
8071 memset(&de
, 0, sizeof(de
));
8072 memset(&stx
, 0, sizeof(stx
));
8074 InodeRef
& diri
= dirp
->inode
;
8079 if (dirp
->offset
== 0) {
8080 ldout(cct
, 15) << " including ." << dendl
;
8081 ceph_assert(diri
->dentries
.size() < 2); // can't have multiple hard-links to a dir
8082 uint64_t next_off
= 1;
8085 r
= _getattr(diri
, caps
, dirp
->perms
);
8089 fill_statx(diri
, caps
, &stx
);
8090 fill_dirent(&de
, ".", S_IFDIR
, stx
.stx_ino
, next_off
);
8092 Inode
*inode
= NULL
;
8098 client_lock
.Unlock();
8099 r
= cb(p
, &de
, &stx
, next_off
, inode
);
8104 dirp
->offset
= next_off
;
8108 if (dirp
->offset
== 1) {
8109 ldout(cct
, 15) << " including .." << dendl
;
8110 uint64_t next_off
= 2;
8112 if (diri
->dentries
.empty())
8115 in
= diri
->get_first_parent()->dir
->parent_inode
;
8118 r
= _getattr(in
, caps
, dirp
->perms
);
8122 fill_statx(in
, caps
, &stx
);
8123 fill_dirent(&de
, "..", S_IFDIR
, stx
.stx_ino
, next_off
);
8125 Inode
*inode
= NULL
;
8131 client_lock
.Unlock();
8132 r
= cb(p
, &de
, &stx
, next_off
, inode
);
8137 dirp
->offset
= next_off
;
8142 // can we read from our cache?
8143 ldout(cct
, 10) << "offset " << hex
<< dirp
->offset
<< dec
8144 << " snapid " << dirp
->inode
->snapid
<< " (complete && ordered) "
8145 << dirp
->inode
->is_complete_and_ordered()
8146 << " issued " << ccap_string(dirp
->inode
->caps_issued())
8148 if (dirp
->inode
->snapid
!= CEPH_SNAPDIR
&&
8149 dirp
->inode
->is_complete_and_ordered() &&
8150 dirp
->inode
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
8151 int err
= _readdir_cache_cb(dirp
, cb
, p
, caps
, getref
);
8160 bool check_caps
= true;
8161 if (!dirp
->is_cached()) {
8162 int r
= _readdir_get_frag(dirp
);
8165 // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
8166 // different than the requested one. (our dirfragtree was outdated)
8169 frag_t fg
= dirp
->buffer_frag
;
8171 ldout(cct
, 10) << "frag " << fg
<< " buffer size " << dirp
->buffer
.size()
8172 << " offset " << hex
<< dirp
->offset
<< dendl
;
8174 for (auto it
= std::lower_bound(dirp
->buffer
.begin(), dirp
->buffer
.end(),
8175 dirp
->offset
, dir_result_t::dentry_off_lt());
8176 it
!= dirp
->buffer
.end();
8178 dir_result_t::dentry
&entry
= *it
;
8180 uint64_t next_off
= entry
.offset
+ 1;
8184 r
= _getattr(entry
.inode
, caps
, dirp
->perms
);
8189 fill_statx(entry
.inode
, caps
, &stx
);
8190 fill_dirent(&de
, entry
.name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
8192 Inode
*inode
= NULL
;
8194 inode
= entry
.inode
.get();
8198 client_lock
.Unlock();
8199 r
= cb(p
, &de
, &stx
, next_off
, inode
); // _next_ offset
8202 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< next_off
- 1 << dec
8203 << " = " << r
<< dendl
;
8207 dirp
->offset
= next_off
;
8212 if (dirp
->next_offset
> 2) {
8213 ldout(cct
, 10) << " fetching next chunk of this frag" << dendl
;
8214 _readdir_drop_dirp_buffer(dirp
);
8218 if (!fg
.is_rightmost()) {
8220 _readdir_next_frag(dirp
);
8224 if (diri
->shared_gen
== dirp
->start_shared_gen
&&
8225 diri
->dir_release_count
== dirp
->release_count
) {
8226 if (diri
->dir_ordered_count
== dirp
->ordered_count
) {
8227 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
8229 ceph_assert(diri
->dir
->readdir_cache
.size() >= dirp
->cache_index
);
8230 diri
->dir
->readdir_cache
.resize(dirp
->cache_index
);
8232 diri
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
8234 ldout(cct
, 10) << " marking I_COMPLETE on " << *diri
<< dendl
;
8235 diri
->flags
|= I_COMPLETE
;
8247 int Client::readdir_r(dir_result_t
*d
, struct dirent
*de
)
8249 return readdirplus_r(d
, de
, 0, 0, 0, NULL
);
8256 * 1 if we got a dirent
8257 * 0 for end of directory
8261 struct single_readdir
{
8263 struct ceph_statx
*stx
;
8268 static int _readdir_single_dirent_cb(void *p
, struct dirent
*de
,
8269 struct ceph_statx
*stx
, off_t off
,
8272 single_readdir
*c
= static_cast<single_readdir
*>(p
);
8275 return -1; // already filled this dirent
8285 struct dirent
*Client::readdir(dir_result_t
*d
)
8288 static struct dirent de
;
8295 // our callback fills the dirent and sets sr.full=true on first
8296 // call, and returns -1 the second time around.
8297 ret
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
);
8299 errno
= -ret
; // this sucks.
8300 return (dirent
*) NULL
;
8305 return (dirent
*) NULL
;
8308 int Client::readdirplus_r(dir_result_t
*d
, struct dirent
*de
,
8309 struct ceph_statx
*stx
, unsigned want
,
8310 unsigned flags
, Inode
**out
)
8318 // our callback fills the dirent and sets sr.full=true on first
8319 // call, and returns -1 the second time around.
8320 int r
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
, want
, flags
, out
);
8332 struct getdents_result
{
8339 static int _readdir_getdent_cb(void *p
, struct dirent
*de
,
8340 struct ceph_statx
*stx
, off_t off
, Inode
*in
)
8342 struct getdents_result
*c
= static_cast<getdents_result
*>(p
);
8348 dlen
= strlen(de
->d_name
) + 1;
8350 if (c
->pos
+ dlen
> c
->buflen
)
8351 return -1; // doesn't fit
8354 memcpy(c
->buf
+ c
->pos
, de
, sizeof(*de
));
8356 memcpy(c
->buf
+ c
->pos
, de
->d_name
, dlen
);
8362 int Client::_getdents(dir_result_t
*dir
, char *buf
, int buflen
, bool fullent
)
8367 gr
.fullent
= fullent
;
8370 int r
= readdir_r_cb(dir
, _readdir_getdent_cb
, (void *)&gr
);
8372 if (r
< 0) { // some error
8373 if (r
== -1) { // buffer ran out of space
8374 if (gr
.pos
) { // but we got some entries already!
8376 } // or we need a larger buffer
8378 } else { // actual error, return it
8387 struct getdir_result
{
8388 list
<string
> *contents
;
8392 static int _getdir_cb(void *p
, struct dirent
*de
, struct ceph_statx
*stx
, off_t off
, Inode
*in
)
8394 getdir_result
*r
= static_cast<getdir_result
*>(p
);
8396 r
->contents
->push_back(de
->d_name
);
8401 int Client::getdir(const char *relpath
, list
<string
>& contents
,
8402 const UserPerm
& perms
)
8404 ldout(cct
, 3) << "getdir(" << relpath
<< ")" << dendl
;
8406 std::lock_guard
lock(client_lock
);
8407 tout(cct
) << "getdir" << std::endl
;
8408 tout(cct
) << relpath
<< std::endl
;
8412 int r
= opendir(relpath
, &d
, perms
);
8417 gr
.contents
= &contents
;
8419 r
= readdir_r_cb(d
, _getdir_cb
, (void *)&gr
);
8429 /****** file i/o **********/
8430 int Client::open(const char *relpath
, int flags
, const UserPerm
& perms
,
8431 mode_t mode
, int stripe_unit
, int stripe_count
,
8432 int object_size
, const char *data_pool
)
8434 ldout(cct
, 3) << "open enter(" << relpath
<< ", " << ceph_flags_sys2wire(flags
) << "," << mode
<< ")" << dendl
;
8435 std::lock_guard
lock(client_lock
);
8436 tout(cct
) << "open" << std::endl
;
8437 tout(cct
) << relpath
<< std::endl
;
8438 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
8445 #if defined(__linux__) && defined(O_PATH)
8446 /* When the O_PATH is being specified, others flags than O_DIRECTORY
8447 * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
8448 * in kernel (fs/open.c). */
8450 flags
&= O_DIRECTORY
| O_NOFOLLOW
| O_PATH
;
8453 filepath
path(relpath
);
8455 bool created
= false;
8456 /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
8457 bool followsym
= !((flags
& O_NOFOLLOW
) || ((flags
& O_CREAT
) && (flags
& O_EXCL
)));
8458 int r
= path_walk(path
, &in
, perms
, followsym
, ceph_caps_for_mode(mode
));
8460 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
8463 #if defined(__linux__) && defined(O_PATH)
8464 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
) && !(flags
& O_PATH
))
8466 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
))
8470 if (r
== -ENOENT
&& (flags
& O_CREAT
)) {
8471 filepath dirpath
= path
;
8472 string dname
= dirpath
.last_dentry();
8473 dirpath
.pop_dentry();
8475 r
= path_walk(dirpath
, &dir
, perms
, true,
8476 cct
->_conf
->client_permissions
? CEPH_CAP_AUTH_SHARED
: 0);
8479 if (cct
->_conf
->client_permissions
) {
8480 r
= may_create(dir
.get(), perms
);
8484 r
= _create(dir
.get(), dname
.c_str(), flags
, mode
, &in
, &fh
, stripe_unit
,
8485 stripe_count
, object_size
, data_pool
, &created
, perms
);
8491 // posix says we can only check permissions of existing files
8492 if (cct
->_conf
->client_permissions
) {
8493 r
= may_open(in
.get(), flags
, perms
);
8500 r
= _open(in
.get(), flags
, mode
, &fh
, perms
);
8502 // allocate a integer file descriptor
8505 ceph_assert(fd_map
.count(r
) == 0);
8510 tout(cct
) << r
<< std::endl
;
8511 ldout(cct
, 3) << "open exit(" << path
<< ", " << ceph_flags_sys2wire(flags
) << ") = " << r
<< dendl
;
8515 int Client::open(const char *relpath
, int flags
, const UserPerm
& perms
, mode_t mode
)
8517 /* Use default file striping parameters */
8518 return open(relpath
, flags
, perms
, mode
, 0, 0, 0, NULL
);
8521 int Client::lookup_hash(inodeno_t ino
, inodeno_t dirino
, const char *name
,
8522 const UserPerm
& perms
)
8524 std::lock_guard
lock(client_lock
);
8525 ldout(cct
, 3) << __func__
<< " enter(" << ino
<< ", #" << dirino
<< "/" << name
<< ")" << dendl
;
8530 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPHASH
);
8532 req
->set_filepath(path
);
8534 uint32_t h
= ceph_str_hash(CEPH_STR_HASH_RJENKINS
, name
, strlen(name
));
8536 sprintf(f
, "%u", h
);
8537 filepath
path2(dirino
);
8538 path2
.push_dentry(string(f
));
8539 req
->set_filepath2(path2
);
8541 int r
= make_request(req
, perms
, NULL
, NULL
,
8542 rand() % mdsmap
->get_num_in_mds());
8543 ldout(cct
, 3) << __func__
<< " exit(" << ino
<< ", #" << dirino
<< "/" << name
<< ") = " << r
<< dendl
;
8549 * Load inode into local cache.
8551 * If inode pointer is non-NULL, and take a reference on
8552 * the resulting Inode object in one operation, so that caller
8553 * can safely assume inode will still be there after return.
8555 int Client::_lookup_ino(inodeno_t ino
, const UserPerm
& perms
, Inode
**inode
)
8557 ldout(cct
, 8) << __func__
<< " enter(" << ino
<< ")" << dendl
;
8562 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPINO
);
8564 req
->set_filepath(path
);
8566 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
8567 if (r
== 0 && inode
!= NULL
) {
8568 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
8569 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
8570 ceph_assert(p
!= inode_map
.end());
8574 ldout(cct
, 8) << __func__
<< " exit(" << ino
<< ") = " << r
<< dendl
;
8578 int Client::lookup_ino(inodeno_t ino
, const UserPerm
& perms
, Inode
**inode
)
8580 std::lock_guard
lock(client_lock
);
8581 return _lookup_ino(ino
, perms
, inode
);
8585 * Find the parent inode of `ino` and insert it into
8586 * our cache. Conditionally also set `parent` to a referenced
8587 * Inode* if caller provides non-NULL value.
8589 int Client::_lookup_parent(Inode
*ino
, const UserPerm
& perms
, Inode
**parent
)
8591 ldout(cct
, 8) << __func__
<< " enter(" << ino
->ino
<< ")" << dendl
;
8593 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
8594 filepath
path(ino
->ino
);
8595 req
->set_filepath(path
);
8598 int r
= make_request(req
, perms
, &target
, NULL
, rand() % mdsmap
->get_num_in_mds());
8599 // Give caller a reference to the parent ino if they provided a pointer.
8600 if (parent
!= NULL
) {
8602 *parent
= target
.get();
8604 ldout(cct
, 8) << __func__
<< " found parent " << (*parent
)->ino
<< dendl
;
8609 ldout(cct
, 8) << __func__
<< " exit(" << ino
->ino
<< ") = " << r
<< dendl
;
8614 * Populate the parent dentry for `ino`, provided it is
8615 * a child of `parent`.
8617 int Client::_lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
8619 ceph_assert(parent
->is_dir());
8620 ldout(cct
, 3) << __func__
<< " enter(" << ino
->ino
<< ")" << dendl
;
8625 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
8626 req
->set_filepath2(filepath(parent
->ino
));
8627 req
->set_filepath(filepath(ino
->ino
));
8628 req
->set_inode(ino
);
8630 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
8631 ldout(cct
, 3) << __func__
<< " exit(" << ino
->ino
<< ") = " << r
<< dendl
;
8635 int Client::lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
8637 std::lock_guard
lock(client_lock
);
8638 return _lookup_name(ino
, parent
, perms
);
8641 Fh
*Client::_create_fh(Inode
*in
, int flags
, int cmode
, const UserPerm
& perms
)
8644 Fh
*f
= new Fh(in
, flags
, cmode
, perms
);
8646 ldout(cct
, 10) << __func__
<< " " << in
->ino
<< " mode " << cmode
<< dendl
;
8648 if (in
->snapid
!= CEPH_NOSNAP
) {
8649 in
->snap_cap_refs
++;
8650 ldout(cct
, 5) << "open success, fh is " << f
<< " combined IMMUTABLE SNAP caps "
8651 << ccap_string(in
->caps_issued()) << dendl
;
8654 const auto& conf
= cct
->_conf
;
8655 f
->readahead
.set_trigger_requests(1);
8656 f
->readahead
.set_min_readahead_size(conf
->client_readahead_min
);
8657 uint64_t max_readahead
= Readahead::NO_LIMIT
;
8658 if (conf
->client_readahead_max_bytes
) {
8659 max_readahead
= std::min(max_readahead
, (uint64_t)conf
->client_readahead_max_bytes
);
8661 if (conf
->client_readahead_max_periods
) {
8662 max_readahead
= std::min(max_readahead
, in
->layout
.get_period()*(uint64_t)conf
->client_readahead_max_periods
);
8664 f
->readahead
.set_max_readahead_size(max_readahead
);
8665 vector
<uint64_t> alignments
;
8666 alignments
.push_back(in
->layout
.get_period());
8667 alignments
.push_back(in
->layout
.stripe_unit
);
8668 f
->readahead
.set_alignments(alignments
);
8673 int Client::_release_fh(Fh
*f
)
8675 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
8676 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
8677 Inode
*in
= f
->inode
.get();
8678 ldout(cct
, 8) << __func__
<< " " << f
<< " mode " << f
->mode
<< " on " << *in
<< dendl
;
8682 if (in
->snapid
== CEPH_NOSNAP
) {
8683 if (in
->put_open_ref(f
->mode
)) {
8684 _flush(in
, new C_Client_FlushComplete(this, in
));
8688 ceph_assert(in
->snap_cap_refs
> 0);
8689 in
->snap_cap_refs
--;
8692 _release_filelocks(f
);
8694 // Finally, read any async err (i.e. from flushes)
8695 int err
= f
->take_async_err();
8697 ldout(cct
, 1) << __func__
<< " " << f
<< " on inode " << *in
<< " caught async_err = "
8698 << cpp_strerror(err
) << dendl
;
8700 ldout(cct
, 10) << __func__
<< " " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
8708 void Client::_put_fh(Fh
*f
)
8710 int left
= f
->put();
8716 int Client::_open(Inode
*in
, int flags
, mode_t mode
, Fh
**fhp
,
8717 const UserPerm
& perms
)
8719 if (in
->snapid
!= CEPH_NOSNAP
&&
8720 (flags
& (O_WRONLY
| O_RDWR
| O_CREAT
| O_TRUNC
| O_APPEND
))) {
8724 // use normalized flags to generate cmode
8725 int cflags
= ceph_flags_sys2wire(flags
);
8726 if (cct
->_conf
.get_val
<bool>("client_force_lazyio"))
8727 cflags
|= CEPH_O_LAZY
;
8729 int cmode
= ceph_flags_to_mode(cflags
);
8730 int want
= ceph_caps_for_mode(cmode
);
8733 in
->get_open_ref(cmode
); // make note of pending open, since it effects _wanted_ caps.
8735 if ((flags
& O_TRUNC
) == 0 && in
->caps_issued_mask(want
)) {
8737 check_caps(in
, CHECK_CAPS_NODELAY
);
8740 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
8742 in
->make_nosnap_relative_path(path
);
8743 req
->set_filepath(path
);
8744 req
->head
.args
.open
.flags
= cflags
& ~CEPH_O_CREAT
;
8745 req
->head
.args
.open
.mode
= mode
;
8746 req
->head
.args
.open
.pool
= -1;
8747 if (cct
->_conf
->client_debug_getattr_caps
)
8748 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
8750 req
->head
.args
.open
.mask
= 0;
8751 req
->head
.args
.open
.old_size
= in
->size
; // for O_TRUNC
8753 result
= make_request(req
, perms
);
8756 * NFS expects that delegations will be broken on a conflicting open,
8757 * not just when there is actual conflicting access to the file. SMB leases
8758 * and oplocks also have similar semantics.
8760 * Ensure that clients that have delegations enabled will wait on minimal
8761 * caps during open, just to ensure that other clients holding delegations
8762 * return theirs first.
8764 if (deleg_timeout
&& result
== 0) {
8767 if (cmode
& CEPH_FILE_MODE_WR
)
8768 need
|= CEPH_CAP_FILE_WR
;
8769 if (cmode
& CEPH_FILE_MODE_RD
)
8770 need
|= CEPH_CAP_FILE_RD
;
8772 result
= get_caps(in
, need
, want
, &have
, -1);
8774 ldout(cct
, 8) << "Unable to get caps after open of inode " << *in
<<
8775 " . Denying open: " <<
8776 cpp_strerror(result
) << dendl
;
8777 in
->put_open_ref(cmode
);
8779 put_cap_ref(in
, need
);
8787 *fhp
= _create_fh(in
, flags
, cmode
, perms
);
8789 in
->put_open_ref(cmode
);
8797 int Client::_renew_caps(Inode
*in
)
8799 int wanted
= in
->caps_file_wanted();
8800 if (in
->is_any_caps() &&
8801 ((wanted
& CEPH_CAP_ANY_WR
) == 0 || in
->auth_cap
)) {
8802 check_caps(in
, CHECK_CAPS_NODELAY
);
8807 if ((wanted
& CEPH_CAP_FILE_RD
) && (wanted
& CEPH_CAP_FILE_WR
))
8809 else if (wanted
& CEPH_CAP_FILE_RD
)
8811 else if (wanted
& CEPH_CAP_FILE_WR
)
8814 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
8816 in
->make_nosnap_relative_path(path
);
8817 req
->set_filepath(path
);
8818 req
->head
.args
.open
.flags
= flags
;
8819 req
->head
.args
.open
.pool
= -1;
8820 if (cct
->_conf
->client_debug_getattr_caps
)
8821 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
8823 req
->head
.args
.open
.mask
= 0;
8826 // duplicate in case Cap goes away; not sure if that race is a concern?
8827 const UserPerm
*pperm
= in
->get_best_perms();
8831 int ret
= make_request(req
, perms
);
8835 int Client::close(int fd
)
8837 ldout(cct
, 3) << "close enter(" << fd
<< ")" << dendl
;
8838 std::lock_guard
lock(client_lock
);
8839 tout(cct
) << "close" << std::endl
;
8840 tout(cct
) << fd
<< std::endl
;
8845 Fh
*fh
= get_filehandle(fd
);
8848 int err
= _release_fh(fh
);
8851 ldout(cct
, 3) << "close exit(" << fd
<< ")" << dendl
;
8859 loff_t
Client::lseek(int fd
, loff_t offset
, int whence
)
8861 std::lock_guard
lock(client_lock
);
8862 tout(cct
) << "lseek" << std::endl
;
8863 tout(cct
) << fd
<< std::endl
;
8864 tout(cct
) << offset
<< std::endl
;
8865 tout(cct
) << whence
<< std::endl
;
8870 Fh
*f
= get_filehandle(fd
);
8873 #if defined(__linux__) && defined(O_PATH)
8874 if (f
->flags
& O_PATH
)
8877 return _lseek(f
, offset
, whence
);
8880 loff_t
Client::_lseek(Fh
*f
, loff_t offset
, int whence
)
8882 Inode
*in
= f
->inode
.get();
8886 if (whence
== SEEK_END
|| whence
== SEEK_DATA
|| whence
== SEEK_HOLE
) {
8887 r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
8899 pos
= f
->pos
+ offset
;
8903 pos
= in
->size
+ offset
;
8907 if (offset
< 0 || offset
>= in
->size
) {
8915 if (offset
< 0 || offset
>= in
->size
) {
8924 ldout(cct
, 1) << __func__
<< ": invalid whence value " << whence
<< dendl
;
8934 ldout(cct
, 8) << "_lseek(" << f
<< ", " << offset
<< ", " << whence
<< ") = " << f
->pos
<< dendl
;
8939 void Client::lock_fh_pos(Fh
*f
)
8941 ldout(cct
, 10) << __func__
<< " " << f
<< dendl
;
8943 if (f
->pos_locked
|| !f
->pos_waiters
.empty()) {
8945 f
->pos_waiters
.push_back(&cond
);
8946 ldout(cct
, 10) << __func__
<< " BLOCKING on " << f
<< dendl
;
8947 while (f
->pos_locked
|| f
->pos_waiters
.front() != &cond
)
8948 cond
.Wait(client_lock
);
8949 ldout(cct
, 10) << __func__
<< " UNBLOCKING on " << f
<< dendl
;
8950 ceph_assert(f
->pos_waiters
.front() == &cond
);
8951 f
->pos_waiters
.pop_front();
8954 f
->pos_locked
= true;
8957 void Client::unlock_fh_pos(Fh
*f
)
8959 ldout(cct
, 10) << __func__
<< " " << f
<< dendl
;
8960 f
->pos_locked
= false;
8963 int Client::uninline_data(Inode
*in
, Context
*onfinish
)
8965 if (!in
->inline_data
.length()) {
8966 onfinish
->complete(0);
8971 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (long long unsigned)in
->ino
);
8972 object_t oid
= oid_buf
;
8974 ObjectOperation create_ops
;
8975 create_ops
.create(false);
8977 objecter
->mutate(oid
,
8978 OSDMap::file_to_object_locator(in
->layout
),
8980 in
->snaprealm
->get_snap_context(),
8981 ceph::real_clock::now(),
8985 bufferlist inline_version_bl
;
8986 encode(in
->inline_version
, inline_version_bl
);
8988 ObjectOperation uninline_ops
;
8989 uninline_ops
.cmpxattr("inline_version",
8990 CEPH_OSD_CMPXATTR_OP_GT
,
8991 CEPH_OSD_CMPXATTR_MODE_U64
,
8993 bufferlist inline_data
= in
->inline_data
;
8994 uninline_ops
.write(0, inline_data
, in
->truncate_size
, in
->truncate_seq
);
8995 uninline_ops
.setxattr("inline_version", stringify(in
->inline_version
));
8997 objecter
->mutate(oid
,
8998 OSDMap::file_to_object_locator(in
->layout
),
9000 in
->snaprealm
->get_snap_context(),
9001 ceph::real_clock::now(),
9010 // blocking osd interface
9012 int Client::read(int fd
, char *buf
, loff_t size
, loff_t offset
)
9014 std::lock_guard
lock(client_lock
);
9015 tout(cct
) << "read" << std::endl
;
9016 tout(cct
) << fd
<< std::endl
;
9017 tout(cct
) << size
<< std::endl
;
9018 tout(cct
) << offset
<< std::endl
;
9023 Fh
*f
= get_filehandle(fd
);
9026 #if defined(__linux__) && defined(O_PATH)
9027 if (f
->flags
& O_PATH
)
9031 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9032 size
= std::min(size
, (loff_t
)INT_MAX
);
9033 int r
= _read(f
, offset
, size
, &bl
);
9034 ldout(cct
, 3) << "read(" << fd
<< ", " << (void*)buf
<< ", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
9036 bl
.copy(0, bl
.length(), buf
);
9042 int Client::preadv(int fd
, const struct iovec
*iov
, int iovcnt
, loff_t offset
)
9046 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, false);
9049 int64_t Client::_read(Fh
*f
, int64_t offset
, uint64_t size
, bufferlist
*bl
)
9052 bool movepos
= false;
9053 std::unique_ptr
<C_SaferCond
> onuninline
;
9055 const auto& conf
= cct
->_conf
;
9056 Inode
*in
= f
->inode
.get();
9058 utime_t start
= ceph_clock_now();
9060 if ((f
->mode
& CEPH_FILE_MODE_RD
) == 0)
9062 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9069 loff_t start_pos
= offset
;
9071 if (in
->inline_version
== 0) {
9072 r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
9076 ceph_assert(in
->inline_version
> 0);
9080 if (f
->mode
& CEPH_FILE_MODE_LAZY
)
9081 want
= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
;
9083 want
= CEPH_CAP_FILE_CACHE
;
9084 r
= get_caps(in
, CEPH_CAP_FILE_RD
, want
, &have
, -1);
9088 if (f
->flags
& O_DIRECT
)
9089 have
&= ~(CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
);
9091 if (in
->inline_version
< CEPH_INLINE_NONE
) {
9092 if (!(have
& CEPH_CAP_FILE_CACHE
)) {
9093 onuninline
.reset(new C_SaferCond("Client::_read_uninline_data flock"));
9094 uninline_data(in
, onuninline
.get());
9096 uint32_t len
= in
->inline_data
.length();
9097 uint64_t endoff
= offset
+ size
;
9098 if (endoff
> in
->size
)
9102 if (endoff
<= len
) {
9103 bl
->substr_of(in
->inline_data
, offset
, endoff
- offset
);
9105 bl
->substr_of(in
->inline_data
, offset
, len
- offset
);
9106 bl
->append_zero(endoff
- len
);
9108 r
= endoff
- offset
;
9109 } else if ((uint64_t)offset
< endoff
) {
9110 bl
->append_zero(endoff
- offset
);
9111 r
= endoff
- offset
;
9119 if (!conf
->client_debug_force_sync_read
&&
9121 (have
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
))) {
9123 if (f
->flags
& O_RSYNC
) {
9124 _flush_range(in
, offset
, size
);
9126 r
= _read_async(f
, offset
, size
, bl
);
9130 if (f
->flags
& O_DIRECT
)
9131 _flush_range(in
, offset
, size
);
9133 bool checkeof
= false;
9134 r
= _read_sync(f
, offset
, size
, bl
, &checkeof
);
9141 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
9144 r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
9149 if ((uint64_t)offset
< in
->size
)
9155 ceph_assert(r
>= 0);
9158 f
->pos
= start_pos
+ r
;
9161 lat
= ceph_clock_now();
9163 logger
->tinc(l_c_read
, lat
);
9169 client_lock
.Unlock();
9170 int ret
= onuninline
->wait();
9172 if (ret
>= 0 || ret
== -ECANCELED
) {
9173 in
->inline_data
.clear();
9174 in
->inline_version
= CEPH_INLINE_NONE
;
9175 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9181 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
9189 Client::C_Readahead::C_Readahead(Client
*c
, Fh
*f
) :
9192 f
->readahead
.inc_pending();
9195 Client::C_Readahead::~C_Readahead() {
9196 f
->readahead
.dec_pending();
9200 void Client::C_Readahead::finish(int r
) {
9201 lgeneric_subdout(client
->cct
, client
, 20) << "client." << client
->get_nodeid() << " " << "C_Readahead on " << f
->inode
<< dendl
;
9202 client
->put_cap_ref(f
->inode
.get(), CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
9205 int Client::_read_async(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
)
9207 const auto& conf
= cct
->_conf
;
9208 Inode
*in
= f
->inode
.get();
9210 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
9212 // trim read based on file size?
9213 if (off
>= in
->size
)
9217 if (off
+ len
> in
->size
) {
9218 len
= in
->size
- off
;
9221 ldout(cct
, 10) << " min_bytes=" << f
->readahead
.get_min_readahead_size()
9222 << " max_bytes=" << f
->readahead
.get_max_readahead_size()
9223 << " max_periods=" << conf
->client_readahead_max_periods
<< dendl
;
9225 // read (and possibly block)
9227 C_SaferCond
onfinish("Client::_read_async flock");
9228 r
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
9229 off
, len
, bl
, 0, &onfinish
);
9231 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
9232 client_lock
.Unlock();
9233 r
= onfinish
.wait();
9235 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
9238 if(f
->readahead
.get_min_readahead_size() > 0) {
9239 pair
<uint64_t, uint64_t> readahead_extent
= f
->readahead
.update(off
, len
, in
->size
);
9240 if (readahead_extent
.second
> 0) {
9241 ldout(cct
, 20) << "readahead " << readahead_extent
.first
<< "~" << readahead_extent
.second
9242 << " (caller wants " << off
<< "~" << len
<< ")" << dendl
;
9243 Context
*onfinish2
= new C_Readahead(this, f
);
9244 int r2
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
9245 readahead_extent
.first
, readahead_extent
.second
,
9246 NULL
, 0, onfinish2
);
9248 ldout(cct
, 20) << "readahead initiated, c " << onfinish2
<< dendl
;
9249 get_cap_ref(in
, CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
9251 ldout(cct
, 20) << "readahead was no-op, already cached" << dendl
;
9260 int Client::_read_sync(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
,
9263 Inode
*in
= f
->inode
.get();
9268 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
9270 Mutex
flock("Client::_read_sync flock");
9273 C_SaferCond
onfinish("Client::_read_sync flock");
9277 filer
->read_trunc(in
->ino
, &in
->layout
, in
->snapid
,
9279 in
->truncate_size
, in
->truncate_seq
,
9281 client_lock
.Unlock();
9282 int r
= onfinish
.wait();
9285 // if we get ENOENT from OSD, assume 0 bytes returned
9296 bl
->claim_append(tbl
);
9299 if (r
>= 0 && r
< wanted
) {
9300 if (pos
< in
->size
) {
9301 // zero up to known EOF
9302 int64_t some
= in
->size
- pos
;
9305 auto z
= buffer::ptr_node::create(some
);
9307 bl
->push_back(std::move(z
));
9324 * we keep count of uncommitted sync writes on the inode, so that
9327 void Client::_sync_write_commit(Inode
*in
)
9329 ceph_assert(unsafe_sync_write
> 0);
9330 unsafe_sync_write
--;
9332 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9334 ldout(cct
, 15) << __func__
<< " unsafe_sync_write = " << unsafe_sync_write
<< dendl
;
9335 if (unsafe_sync_write
== 0 && unmounting
) {
9336 ldout(cct
, 10) << __func__
<< " -- no more unsafe writes, unmount can proceed" << dendl
;
9337 mount_cond
.Signal();
9341 int Client::write(int fd
, const char *buf
, loff_t size
, loff_t offset
)
9343 std::lock_guard
lock(client_lock
);
9344 tout(cct
) << "write" << std::endl
;
9345 tout(cct
) << fd
<< std::endl
;
9346 tout(cct
) << size
<< std::endl
;
9347 tout(cct
) << offset
<< std::endl
;
9352 Fh
*fh
= get_filehandle(fd
);
9355 #if defined(__linux__) && defined(O_PATH)
9356 if (fh
->flags
& O_PATH
)
9359 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9360 size
= std::min(size
, (loff_t
)INT_MAX
);
9361 int r
= _write(fh
, offset
, size
, buf
, NULL
, false);
9362 ldout(cct
, 3) << "write(" << fd
<< ", \"...\", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
9366 int Client::pwritev(int fd
, const struct iovec
*iov
, int iovcnt
, int64_t offset
)
9370 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, true);
9373 int64_t Client::_preadv_pwritev_locked(Fh
*fh
, const struct iovec
*iov
,
9374 unsigned iovcnt
, int64_t offset
, bool write
,
9377 #if defined(__linux__) && defined(O_PATH)
9378 if (fh
->flags
& O_PATH
)
9381 loff_t totallen
= 0;
9382 for (unsigned i
= 0; i
< iovcnt
; i
++) {
9383 totallen
+= iov
[i
].iov_len
;
9387 * Some of the API functions take 64-bit size values, but only return
9388 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
9389 * we don't do I/Os larger than the values we can return.
9392 totallen
= std::min(totallen
, (loff_t
)INT_MAX
);
9395 int64_t w
= _write(fh
, offset
, totallen
, NULL
, iov
, iovcnt
);
9396 ldout(cct
, 3) << "pwritev(" << fh
<< ", \"...\", " << totallen
<< ", " << offset
<< ") = " << w
<< dendl
;
9400 int64_t r
= _read(fh
, offset
, totallen
, &bl
);
9401 ldout(cct
, 3) << "preadv(" << fh
<< ", " << offset
<< ") = " << r
<< dendl
;
9406 for (unsigned j
= 0, resid
= r
; j
< iovcnt
&& resid
> 0; j
++) {
9408 * This piece of code aims to handle the case that bufferlist does not have enough data
9409 * to fill in the iov
9411 if (resid
< iov
[j
].iov_len
) {
9412 bl
.copy(bufoff
, resid
, (char *)iov
[j
].iov_base
);
9415 bl
.copy(bufoff
, iov
[j
].iov_len
, (char *)iov
[j
].iov_base
);
9417 resid
-= iov
[j
].iov_len
;
9418 bufoff
+= iov
[j
].iov_len
;
9424 int Client::_preadv_pwritev(int fd
, const struct iovec
*iov
, unsigned iovcnt
, int64_t offset
, bool write
)
9426 std::lock_guard
lock(client_lock
);
9427 tout(cct
) << fd
<< std::endl
;
9428 tout(cct
) << offset
<< std::endl
;
9433 Fh
*fh
= get_filehandle(fd
);
9436 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, offset
, write
, true);
9439 int64_t Client::_write(Fh
*f
, int64_t offset
, uint64_t size
, const char *buf
,
9440 const struct iovec
*iov
, int iovcnt
)
9444 if ((uint64_t)(offset
+size
) > mdsmap
->get_max_filesize()) //too large!
9447 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
9448 Inode
*in
= f
->inode
.get();
9450 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
9454 ceph_assert(in
->snapid
== CEPH_NOSNAP
);
9456 // was Fh opened as writeable?
9457 if ((f
->mode
& CEPH_FILE_MODE_WR
) == 0)
9460 // use/adjust fd pos?
9464 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
9465 * change out from under us.
9467 if (f
->flags
& O_APPEND
) {
9468 int r
= _lseek(f
, 0, SEEK_END
);
9480 uint64_t endoff
= offset
+ size
;
9481 if (endoff
> in
->size
&& is_quota_bytes_exceeded(in
, endoff
- in
->size
,
9486 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9488 ldout(cct
, 10) << "cur file size is " << in
->size
<< dendl
;
9491 utime_t start
= ceph_clock_now();
9493 if (in
->inline_version
== 0) {
9494 int r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
9497 ceph_assert(in
->inline_version
> 0);
9500 // copy into fresh buffer (since our write may be resub, async)
9504 bl
.append(buf
, size
);
9506 for (int i
= 0; i
< iovcnt
; i
++) {
9507 if (iov
[i
].iov_len
> 0) {
9508 bl
.append((const char *)iov
[i
].iov_base
, iov
[i
].iov_len
);
9514 uint64_t totalwritten
;
9516 if (f
->mode
& CEPH_FILE_MODE_LAZY
)
9517 want
= CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
;
9519 want
= CEPH_CAP_FILE_BUFFER
;
9520 int r
= get_caps(in
, CEPH_CAP_FILE_WR
|CEPH_CAP_AUTH_SHARED
, want
, &have
, endoff
);
9524 /* clear the setuid/setgid bits, if any */
9525 if (unlikely(in
->mode
& (S_ISUID
|S_ISGID
)) && size
> 0) {
9526 struct ceph_statx stx
= { 0 };
9528 put_cap_ref(in
, CEPH_CAP_AUTH_SHARED
);
9529 r
= __setattrx(in
, &stx
, CEPH_SETATTR_KILL_SGUID
, f
->actor_perms
);
9533 put_cap_ref(in
, CEPH_CAP_AUTH_SHARED
);
9536 if (f
->flags
& O_DIRECT
)
9537 have
&= ~(CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
);
9539 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
9541 std::unique_ptr
<C_SaferCond
> onuninline
= nullptr;
9543 if (in
->inline_version
< CEPH_INLINE_NONE
) {
9544 if (endoff
> cct
->_conf
->client_max_inline_size
||
9545 endoff
> CEPH_INLINE_MAX_SIZE
||
9546 !(have
& CEPH_CAP_FILE_BUFFER
)) {
9547 onuninline
.reset(new C_SaferCond("Client::_write_uninline_data flock"));
9548 uninline_data(in
, onuninline
.get());
9550 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9552 uint32_t len
= in
->inline_data
.length();
9555 in
->inline_data
.copy(endoff
, len
- endoff
, bl
);
9558 in
->inline_data
.splice(offset
, len
- offset
);
9559 else if (offset
> len
)
9560 in
->inline_data
.append_zero(offset
- len
);
9562 in
->inline_data
.append(bl
);
9563 in
->inline_version
++;
9565 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9571 if (cct
->_conf
->client_oc
&&
9572 (have
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
))) {
9573 // do buffered write
9574 if (!in
->oset
.dirty_or_tx
)
9575 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
9577 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9579 // async, caching, non-blocking.
9580 r
= objectcacher
->file_write(&in
->oset
, &in
->layout
,
9581 in
->snaprealm
->get_snap_context(),
9582 offset
, size
, bl
, ceph::real_clock::now(),
9584 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9589 // flush cached write if O_SYNC is set on file fh
9590 // O_DSYNC == O_SYNC on linux < 2.6.33
9591 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
9592 if ((f
->flags
& O_SYNC
) || (f
->flags
& O_DSYNC
)) {
9593 _flush_range(in
, offset
, size
);
9596 if (f
->flags
& O_DIRECT
)
9597 _flush_range(in
, offset
, size
);
9599 // simple, non-atomic sync write
9600 C_SaferCond
onfinish("Client::_write flock");
9601 unsafe_sync_write
++;
9602 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
); // released by onsafe callback
9604 filer
->write_trunc(in
->ino
, &in
->layout
, in
->snaprealm
->get_snap_context(),
9605 offset
, size
, bl
, ceph::real_clock::now(), 0,
9606 in
->truncate_size
, in
->truncate_seq
,
9608 client_lock
.Unlock();
9611 _sync_write_commit(in
);
9614 // if we get here, write was successful, update client metadata
9617 lat
= ceph_clock_now();
9619 logger
->tinc(l_c_wrlat
, lat
);
9626 totalwritten
= size
;
9627 r
= (int64_t)totalwritten
;
9630 if (totalwritten
+ offset
> in
->size
) {
9631 in
->size
= totalwritten
+ offset
;
9632 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9634 if (is_quota_bytes_approaching(in
, f
->actor_perms
)) {
9635 check_caps(in
, CHECK_CAPS_NODELAY
);
9636 } else if (is_max_size_approaching(in
)) {
9640 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", extending file size" << dendl
;
9642 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", leaving file size at " << in
->size
<< dendl
;
9646 in
->mtime
= in
->ctime
= ceph_clock_now();
9648 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9652 if (nullptr != onuninline
) {
9653 client_lock
.Unlock();
9654 int uninline_ret
= onuninline
->wait();
9657 if (uninline_ret
>= 0 || uninline_ret
== -ECANCELED
) {
9658 in
->inline_data
.clear();
9659 in
->inline_version
= CEPH_INLINE_NONE
;
9660 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9666 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
9670 int Client::_flush(Fh
*f
)
9672 Inode
*in
= f
->inode
.get();
9673 int err
= f
->take_async_err();
9675 ldout(cct
, 1) << __func__
<< ": " << f
<< " on inode " << *in
<< " caught async_err = "
9676 << cpp_strerror(err
) << dendl
;
9678 ldout(cct
, 10) << __func__
<< ": " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
9684 int Client::truncate(const char *relpath
, loff_t length
, const UserPerm
& perms
)
9686 struct ceph_statx stx
;
9687 stx
.stx_size
= length
;
9688 return setattrx(relpath
, &stx
, CEPH_SETATTR_SIZE
, perms
);
9691 int Client::ftruncate(int fd
, loff_t length
, const UserPerm
& perms
)
9693 std::lock_guard
lock(client_lock
);
9694 tout(cct
) << __func__
<< std::endl
;
9695 tout(cct
) << fd
<< std::endl
;
9696 tout(cct
) << length
<< std::endl
;
9701 Fh
*f
= get_filehandle(fd
);
9704 #if defined(__linux__) && defined(O_PATH)
9705 if (f
->flags
& O_PATH
)
9709 attr
.st_size
= length
;
9710 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_SIZE
, perms
);
9713 int Client::fsync(int fd
, bool syncdataonly
)
9715 std::lock_guard
lock(client_lock
);
9716 tout(cct
) << "fsync" << std::endl
;
9717 tout(cct
) << fd
<< std::endl
;
9718 tout(cct
) << syncdataonly
<< std::endl
;
9723 Fh
*f
= get_filehandle(fd
);
9726 #if defined(__linux__) && defined(O_PATH)
9727 if (f
->flags
& O_PATH
)
9730 int r
= _fsync(f
, syncdataonly
);
9732 // The IOs in this fsync were okay, but maybe something happened
9733 // in the background that we shoudl be reporting?
9734 r
= f
->take_async_err();
9735 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
9736 << ") = 0, async_err = " << r
<< dendl
;
9738 // Assume that an error we encountered during fsync, even reported
9739 // synchronously, would also have applied the error to the Fh, and we
9740 // should clear it here to avoid returning the same error again on next
9742 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
<< ") = "
9744 f
->take_async_err();
9749 int Client::_fsync(Inode
*in
, bool syncdataonly
)
9752 std::unique_ptr
<C_SaferCond
> object_cacher_completion
= nullptr;
9753 ceph_tid_t flush_tid
= 0;
9756 utime_t start
= ceph_clock_now();
9758 ldout(cct
, 8) << "_fsync on " << *in
<< " " << (syncdataonly
? "(dataonly)":"(data+metadata)") << dendl
;
9760 if (cct
->_conf
->client_oc
) {
9761 object_cacher_completion
.reset(new C_SaferCond("Client::_fsync::lock"));
9762 tmp_ref
= in
; // take a reference; C_SaferCond doesn't and _flush won't either
9763 _flush(in
, object_cacher_completion
.get());
9764 ldout(cct
, 15) << "using return-valued form of _fsync" << dendl
;
9767 if (!syncdataonly
&& in
->dirty_caps
) {
9768 check_caps(in
, CHECK_CAPS_NODELAY
|CHECK_CAPS_SYNCHRONOUS
);
9769 if (in
->flushing_caps
)
9770 flush_tid
= last_flush_tid
;
9771 } else ldout(cct
, 10) << "no metadata needs to commit" << dendl
;
9773 if (!syncdataonly
&& !in
->unsafe_ops
.empty()) {
9776 MetaRequest
*req
= in
->unsafe_ops
.back();
9777 ldout(cct
, 15) << "waiting on unsafe requests, last tid " << req
->get_tid() << dendl
;
9780 wait_on_list(req
->waitfor_safe
);
9784 if (nullptr != object_cacher_completion
) { // wait on a real reply instead of guessing
9785 client_lock
.Unlock();
9786 ldout(cct
, 15) << "waiting on data to flush" << dendl
;
9787 r
= object_cacher_completion
->wait();
9789 ldout(cct
, 15) << "got " << r
<< " from flush writeback" << dendl
;
9791 // FIXME: this can starve
9792 while (in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] > 0) {
9793 ldout(cct
, 10) << "ino " << in
->ino
<< " has " << in
->cap_refs
[CEPH_CAP_FILE_BUFFER
]
9794 << " uncommitted, waiting" << dendl
;
9795 wait_on_list(in
->waitfor_commit
);
9801 wait_sync_caps(in
, flush_tid
);
9803 ldout(cct
, 10) << "ino " << in
->ino
<< " has no uncommitted writes" << dendl
;
9805 ldout(cct
, 8) << "ino " << in
->ino
<< " failed to commit to disk! "
9806 << cpp_strerror(-r
) << dendl
;
9809 lat
= ceph_clock_now();
9811 logger
->tinc(l_c_fsync
, lat
);
9816 int Client::_fsync(Fh
*f
, bool syncdataonly
)
9818 ldout(cct
, 8) << "_fsync(" << f
<< ", " << (syncdataonly
? "dataonly)":"data+metadata)") << dendl
;
9819 return _fsync(f
->inode
.get(), syncdataonly
);
9822 int Client::fstat(int fd
, struct stat
*stbuf
, const UserPerm
& perms
, int mask
)
9824 std::lock_guard
lock(client_lock
);
9825 tout(cct
) << "fstat mask " << hex
<< mask
<< dec
<< std::endl
;
9826 tout(cct
) << fd
<< std::endl
;
9831 Fh
*f
= get_filehandle(fd
);
9834 int r
= _getattr(f
->inode
, mask
, perms
);
9837 fill_stat(f
->inode
, stbuf
, NULL
);
9838 ldout(cct
, 5) << "fstat(" << fd
<< ", " << stbuf
<< ") = " << r
<< dendl
;
9842 int Client::fstatx(int fd
, struct ceph_statx
*stx
, const UserPerm
& perms
,
9843 unsigned int want
, unsigned int flags
)
9845 std::lock_guard
lock(client_lock
);
9846 tout(cct
) << "fstatx flags " << hex
<< flags
<< " want " << want
<< dec
<< std::endl
;
9847 tout(cct
) << fd
<< std::endl
;
9852 Fh
*f
= get_filehandle(fd
);
9856 unsigned mask
= statx_to_mask(flags
, want
);
9859 if (mask
&& !f
->inode
->caps_issued_mask(mask
, true)) {
9860 r
= _getattr(f
->inode
, mask
, perms
);
9862 ldout(cct
, 3) << "fstatx exit on error!" << dendl
;
9867 fill_statx(f
->inode
, mask
, stx
);
9868 ldout(cct
, 3) << "fstatx(" << fd
<< ", " << stx
<< ") = " << r
<< dendl
;
9872 // not written yet, but i want to link!
9874 int Client::chdir(const char *relpath
, std::string
&new_cwd
,
9875 const UserPerm
& perms
)
9877 std::lock_guard
lock(client_lock
);
9878 tout(cct
) << "chdir" << std::endl
;
9879 tout(cct
) << relpath
<< std::endl
;
9884 filepath
path(relpath
);
9886 int r
= path_walk(path
, &in
, perms
);
9890 if (!(in
.get()->is_dir()))
9895 ldout(cct
, 3) << "chdir(" << relpath
<< ") cwd now " << cwd
->ino
<< dendl
;
9897 _getcwd(new_cwd
, perms
);
9901 void Client::_getcwd(string
& dir
, const UserPerm
& perms
)
9904 ldout(cct
, 10) << __func__
<< " " << *cwd
<< dendl
;
9906 Inode
*in
= cwd
.get();
9907 while (in
!= root
) {
9908 ceph_assert(in
->dentries
.size() < 2); // dirs can't be hard-linked
9910 // A cwd or ancester is unlinked
9911 if (in
->dentries
.empty()) {
9915 Dentry
*dn
= in
->get_first_parent();
9920 ldout(cct
, 10) << __func__
<< " looking up parent for " << *in
<< dendl
;
9921 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
9922 filepath
path(in
->ino
);
9923 req
->set_filepath(path
);
9925 int res
= make_request(req
, perms
);
9934 path
.push_front_dentry(dn
->name
);
9935 in
= dn
->dir
->parent_inode
;
9938 dir
+= path
.get_path();
9941 void Client::getcwd(string
& dir
, const UserPerm
& perms
)
9943 std::lock_guard
l(client_lock
);
9945 _getcwd(dir
, perms
);
9948 int Client::statfs(const char *path
, struct statvfs
*stbuf
,
9949 const UserPerm
& perms
)
9951 std::lock_guard
l(client_lock
);
9952 tout(cct
) << __func__
<< std::endl
;
9953 unsigned long int total_files_on_fs
;
9961 const vector
<int64_t> &data_pools
= mdsmap
->get_data_pools();
9962 if (data_pools
.size() == 1) {
9963 objecter
->get_fs_stats(stats
, data_pools
[0], &cond
);
9965 objecter
->get_fs_stats(stats
, boost::optional
<int64_t>(), &cond
);
9968 client_lock
.Unlock();
9969 int rval
= cond
.wait();
9971 total_files_on_fs
= root
->rstat
.rfiles
+ root
->rstat
.rsubdirs
;
9975 ldout(cct
, 1) << "underlying call to statfs returned error: "
9976 << cpp_strerror(rval
)
9981 memset(stbuf
, 0, sizeof(*stbuf
));
9984 * we're going to set a block size of 4MB so we can represent larger
9985 * FSes without overflowing. Additionally convert the space
9986 * measurements from KB to bytes while making them in terms of
9987 * blocks. We use 4MB only because it is big enough, and because it
9988 * actually *is* the (ceph) default block size.
9990 const int CEPH_BLOCK_SHIFT
= 22;
9991 stbuf
->f_frsize
= 1 << CEPH_BLOCK_SHIFT
;
9992 stbuf
->f_bsize
= 1 << CEPH_BLOCK_SHIFT
;
9993 stbuf
->f_files
= total_files_on_fs
;
9995 stbuf
->f_favail
= -1;
9996 stbuf
->f_fsid
= -1; // ??
9997 stbuf
->f_flag
= 0; // ??
9998 stbuf
->f_namemax
= NAME_MAX
;
10000 // Usually quota_root will == root_ancestor, but if the mount root has no
10001 // quota but we can see a parent of it that does have a quota, we'll
10002 // respect that one instead.
10003 ceph_assert(root
!= nullptr);
10004 Inode
*quota_root
= root
->quota
.is_enable() ? root
: get_quota_root(root
, perms
);
10006 // get_quota_root should always give us something
10007 // because client quotas are always enabled
10008 ceph_assert(quota_root
!= nullptr);
10010 if (quota_root
&& cct
->_conf
->client_quota_df
&& quota_root
->quota
.max_bytes
) {
10012 // Skip the getattr if any sessions are stale, as we don't want to
10013 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10015 if (!_any_stale_sessions()) {
10016 int r
= _getattr(quota_root
, 0, perms
, true);
10018 // Ignore return value: error getting latest inode metadata is not a good
10019 // reason to break "df".
10020 lderr(cct
) << "Error in getattr on quota root 0x"
10021 << std::hex
<< quota_root
->ino
<< std::dec
10022 << " statfs result may be outdated" << dendl
;
10026 // Special case: if there is a size quota set on the Inode acting
10027 // as the root for this client mount, then report the quota status
10028 // as the filesystem statistics.
10029 const fsblkcnt_t total
= quota_root
->quota
.max_bytes
>> CEPH_BLOCK_SHIFT
;
10030 const fsblkcnt_t used
= quota_root
->rstat
.rbytes
>> CEPH_BLOCK_SHIFT
;
10031 // It is possible for a quota to be exceeded: arithmetic here must
10032 // handle case where used > total.
10033 const fsblkcnt_t free
= total
> used
? total
- used
: 0;
10035 stbuf
->f_blocks
= total
;
10036 stbuf
->f_bfree
= free
;
10037 stbuf
->f_bavail
= free
;
10039 // General case: report the cluster statistics returned from RADOS. Because
10040 // multiple pools may be used without one filesystem namespace via
10041 // layouts, this is the most correct thing we can do.
10042 stbuf
->f_blocks
= stats
.kb
>> (CEPH_BLOCK_SHIFT
- 10);
10043 stbuf
->f_bfree
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
10044 stbuf
->f_bavail
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
10050 int Client::_do_filelock(Inode
*in
, Fh
*fh
, int lock_type
, int op
, int sleep
,
10051 struct flock
*fl
, uint64_t owner
, bool removing
)
10053 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
10054 << (lock_type
== CEPH_LOCK_FCNTL
? " fcntl" : " flock")
10055 << " type " << fl
->l_type
<< " owner " << owner
10056 << " " << fl
->l_start
<< "~" << fl
->l_len
<< dendl
;
10059 if (F_RDLCK
== fl
->l_type
)
10060 lock_cmd
= CEPH_LOCK_SHARED
;
10061 else if (F_WRLCK
== fl
->l_type
)
10062 lock_cmd
= CEPH_LOCK_EXCL
;
10063 else if (F_UNLCK
== fl
->l_type
)
10064 lock_cmd
= CEPH_LOCK_UNLOCK
;
10068 if (op
!= CEPH_MDS_OP_SETFILELOCK
|| lock_cmd
== CEPH_LOCK_UNLOCK
)
10072 * Set the most significant bit, so that MDS knows the 'owner'
10073 * is sufficient to identify the owner of lock. (old code uses
10074 * both 'owner' and 'pid')
10076 owner
|= (1ULL << 63);
10078 MetaRequest
*req
= new MetaRequest(op
);
10080 in
->make_nosnap_relative_path(path
);
10081 req
->set_filepath(path
);
10082 req
->set_inode(in
);
10084 req
->head
.args
.filelock_change
.rule
= lock_type
;
10085 req
->head
.args
.filelock_change
.type
= lock_cmd
;
10086 req
->head
.args
.filelock_change
.owner
= owner
;
10087 req
->head
.args
.filelock_change
.pid
= fl
->l_pid
;
10088 req
->head
.args
.filelock_change
.start
= fl
->l_start
;
10089 req
->head
.args
.filelock_change
.length
= fl
->l_len
;
10090 req
->head
.args
.filelock_change
.wait
= sleep
;
10095 if (sleep
&& switch_interrupt_cb
) {
10096 // enable interrupt
10097 switch_interrupt_cb(callback_handle
, req
->get());
10098 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
10099 // disable interrupt
10100 switch_interrupt_cb(callback_handle
, NULL
);
10101 if (ret
== 0 && req
->aborted()) {
10102 // effect of this lock request has been revoked by the 'lock intr' request
10103 ret
= req
->get_abort_code();
10107 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
10111 if (op
== CEPH_MDS_OP_GETFILELOCK
) {
10112 ceph_filelock filelock
;
10113 auto p
= bl
.cbegin();
10114 decode(filelock
, p
);
10116 if (CEPH_LOCK_SHARED
== filelock
.type
)
10117 fl
->l_type
= F_RDLCK
;
10118 else if (CEPH_LOCK_EXCL
== filelock
.type
)
10119 fl
->l_type
= F_WRLCK
;
10121 fl
->l_type
= F_UNLCK
;
10123 fl
->l_whence
= SEEK_SET
;
10124 fl
->l_start
= filelock
.start
;
10125 fl
->l_len
= filelock
.length
;
10126 fl
->l_pid
= filelock
.pid
;
10127 } else if (op
== CEPH_MDS_OP_SETFILELOCK
) {
10128 ceph_lock_state_t
*lock_state
;
10129 if (lock_type
== CEPH_LOCK_FCNTL
) {
10130 if (!in
->fcntl_locks
)
10131 in
->fcntl_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
));
10132 lock_state
= in
->fcntl_locks
.get();
10133 } else if (lock_type
== CEPH_LOCK_FLOCK
) {
10134 if (!in
->flock_locks
)
10135 in
->flock_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
));
10136 lock_state
= in
->flock_locks
.get();
10141 _update_lock_state(fl
, owner
, lock_state
);
10144 if (lock_type
== CEPH_LOCK_FCNTL
) {
10145 if (!fh
->fcntl_locks
)
10146 fh
->fcntl_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
));
10147 lock_state
= fh
->fcntl_locks
.get();
10149 if (!fh
->flock_locks
)
10150 fh
->flock_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
));
10151 lock_state
= fh
->flock_locks
.get();
10153 _update_lock_state(fl
, owner
, lock_state
);
10161 int Client::_interrupt_filelock(MetaRequest
*req
)
10163 // Set abort code, but do not kick. The abort code prevents the request
10164 // from being re-sent.
10165 req
->abort(-EINTR
);
10167 return 0; // haven't sent the request
10169 Inode
*in
= req
->inode();
10172 if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FLOCK
)
10173 lock_type
= CEPH_LOCK_FLOCK_INTR
;
10174 else if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FCNTL
)
10175 lock_type
= CEPH_LOCK_FCNTL_INTR
;
10181 MetaRequest
*intr_req
= new MetaRequest(CEPH_MDS_OP_SETFILELOCK
);
10183 in
->make_nosnap_relative_path(path
);
10184 intr_req
->set_filepath(path
);
10185 intr_req
->set_inode(in
);
10186 intr_req
->head
.args
.filelock_change
= req
->head
.args
.filelock_change
;
10187 intr_req
->head
.args
.filelock_change
.rule
= lock_type
;
10188 intr_req
->head
.args
.filelock_change
.type
= CEPH_LOCK_UNLOCK
;
10190 UserPerm
perms(req
->get_uid(), req
->get_gid());
10191 return make_request(intr_req
, perms
, NULL
, NULL
, -1);
10194 void Client::_encode_filelocks(Inode
*in
, bufferlist
& bl
)
10196 if (!in
->fcntl_locks
&& !in
->flock_locks
)
10199 unsigned nr_fcntl_locks
= in
->fcntl_locks
? in
->fcntl_locks
->held_locks
.size() : 0;
10200 encode(nr_fcntl_locks
, bl
);
10201 if (nr_fcntl_locks
) {
10202 auto &lock_state
= in
->fcntl_locks
;
10203 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
10204 p
!= lock_state
->held_locks
.end();
10206 encode(p
->second
, bl
);
10209 unsigned nr_flock_locks
= in
->flock_locks
? in
->flock_locks
->held_locks
.size() : 0;
10210 encode(nr_flock_locks
, bl
);
10211 if (nr_flock_locks
) {
10212 auto &lock_state
= in
->flock_locks
;
10213 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
10214 p
!= lock_state
->held_locks
.end();
10216 encode(p
->second
, bl
);
10219 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< ", " << nr_fcntl_locks
10220 << " fcntl locks, " << nr_flock_locks
<< " flock locks" << dendl
;
10223 void Client::_release_filelocks(Fh
*fh
)
10225 if (!fh
->fcntl_locks
&& !fh
->flock_locks
)
10228 Inode
*in
= fh
->inode
.get();
10229 ldout(cct
, 10) << __func__
<< " " << fh
<< " ino " << in
->ino
<< dendl
;
10231 list
<pair
<int, ceph_filelock
> > to_release
;
10233 if (fh
->fcntl_locks
) {
10234 auto &lock_state
= fh
->fcntl_locks
;
10235 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
10236 p
!= lock_state
->held_locks
.end();
10238 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FCNTL
, p
->second
));
10239 lock_state
.reset();
10241 if (fh
->flock_locks
) {
10242 auto &lock_state
= fh
->flock_locks
;
10243 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
10244 p
!= lock_state
->held_locks
.end();
10246 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FLOCK
, p
->second
));
10247 lock_state
.reset();
10250 if (to_release
.empty())
10253 // mds has already released filelocks if session was closed.
10254 if (in
->caps
.empty())
10258 memset(&fl
, 0, sizeof(fl
));
10259 fl
.l_whence
= SEEK_SET
;
10260 fl
.l_type
= F_UNLCK
;
10262 for (list
<pair
<int, ceph_filelock
> >::iterator p
= to_release
.begin();
10263 p
!= to_release
.end();
10265 fl
.l_start
= p
->second
.start
;
10266 fl
.l_len
= p
->second
.length
;
10267 fl
.l_pid
= p
->second
.pid
;
10268 _do_filelock(in
, fh
, p
->first
, CEPH_MDS_OP_SETFILELOCK
, 0, &fl
,
10269 p
->second
.owner
, true);
10273 void Client::_update_lock_state(struct flock
*fl
, uint64_t owner
,
10274 ceph_lock_state_t
*lock_state
)
10277 if (F_RDLCK
== fl
->l_type
)
10278 lock_cmd
= CEPH_LOCK_SHARED
;
10279 else if (F_WRLCK
== fl
->l_type
)
10280 lock_cmd
= CEPH_LOCK_EXCL
;
10282 lock_cmd
= CEPH_LOCK_UNLOCK
;;
10284 ceph_filelock filelock
;
10285 filelock
.start
= fl
->l_start
;
10286 filelock
.length
= fl
->l_len
;
10287 filelock
.client
= 0;
10288 // see comment in _do_filelock()
10289 filelock
.owner
= owner
| (1ULL << 63);
10290 filelock
.pid
= fl
->l_pid
;
10291 filelock
.type
= lock_cmd
;
10293 if (filelock
.type
== CEPH_LOCK_UNLOCK
) {
10294 list
<ceph_filelock
> activated_locks
;
10295 lock_state
->remove_lock(filelock
, activated_locks
);
10297 bool r
= lock_state
->add_lock(filelock
, false, false, NULL
);
10302 int Client::_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
10304 Inode
*in
= fh
->inode
.get();
10305 ldout(cct
, 10) << "_getlk " << fh
<< " ino " << in
->ino
<< dendl
;
10306 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_GETFILELOCK
, 0, fl
, owner
);
10310 int Client::_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
10312 Inode
*in
= fh
->inode
.get();
10313 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< dendl
;
10314 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_SETFILELOCK
, sleep
, fl
, owner
);
10315 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
10319 int Client::_flock(Fh
*fh
, int cmd
, uint64_t owner
)
10321 Inode
*in
= fh
->inode
.get();
10322 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< dendl
;
10324 int sleep
= !(cmd
& LOCK_NB
);
10343 memset(&fl
, 0, sizeof(fl
));
10345 fl
.l_whence
= SEEK_SET
;
10347 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FLOCK
, CEPH_MDS_OP_SETFILELOCK
, sleep
, &fl
, owner
);
10348 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
10352 int Client::ll_statfs(Inode
*in
, struct statvfs
*stbuf
, const UserPerm
& perms
)
10354 /* Since the only thing this does is wrap a call to statfs, and
10355 statfs takes a lock, it doesn't seem we have a need to split it
10357 return statfs(0, stbuf
, perms
);
10360 void Client::ll_register_callbacks(struct client_callback_args
*args
)
10364 std::lock_guard
l(client_lock
);
10365 ldout(cct
, 10) << __func__
<< " cb " << args
->handle
10366 << " invalidate_ino_cb " << args
->ino_cb
10367 << " invalidate_dentry_cb " << args
->dentry_cb
10368 << " switch_interrupt_cb " << args
->switch_intr_cb
10369 << " remount_cb " << args
->remount_cb
10371 callback_handle
= args
->handle
;
10372 if (args
->ino_cb
) {
10373 ino_invalidate_cb
= args
->ino_cb
;
10374 async_ino_invalidator
.start();
10376 if (args
->dentry_cb
) {
10377 dentry_invalidate_cb
= args
->dentry_cb
;
10378 async_dentry_invalidator
.start();
10380 if (args
->switch_intr_cb
) {
10381 switch_interrupt_cb
= args
->switch_intr_cb
;
10382 interrupt_finisher
.start();
10384 if (args
->remount_cb
) {
10385 remount_cb
= args
->remount_cb
;
10386 remount_finisher
.start();
10388 umask_cb
= args
->umask_cb
;
10391 int Client::test_dentry_handling(bool can_invalidate
)
10395 can_invalidate_dentries
= can_invalidate
;
10397 if (can_invalidate_dentries
) {
10398 ceph_assert(dentry_invalidate_cb
);
10399 ldout(cct
, 1) << "using dentry_invalidate_cb" << dendl
;
10402 ceph_assert(remount_cb
);
10403 ldout(cct
, 1) << "using remount_cb" << dendl
;
10404 r
= _do_remount(false);
10410 int Client::_sync_fs()
10412 ldout(cct
, 10) << __func__
<< dendl
;
10415 std::unique_ptr
<C_SaferCond
> cond
= nullptr;
10416 if (cct
->_conf
->client_oc
) {
10417 cond
.reset(new C_SaferCond("Client::_sync_fs:lock"));
10418 objectcacher
->flush_all(cond
.get());
10423 ceph_tid_t flush_tid
= last_flush_tid
;
10425 // wait for unsafe mds requests
10426 wait_unsafe_requests();
10428 wait_sync_caps(flush_tid
);
10430 if (nullptr != cond
) {
10431 client_lock
.Unlock();
10432 ldout(cct
, 15) << __func__
<< " waiting on data to flush" << dendl
;
10434 ldout(cct
, 15) << __func__
<< " flush finished" << dendl
;
10435 client_lock
.Lock();
10441 int Client::sync_fs()
10443 std::lock_guard
l(client_lock
);
10451 int64_t Client::drop_caches()
10453 std::lock_guard
l(client_lock
);
10454 return objectcacher
->release_all();
10457 int Client::_lazyio(Fh
*fh
, int enable
)
10459 Inode
*in
= fh
->inode
.get();
10460 ldout(cct
, 20) << __func__
<< " " << *in
<< " " << !!enable
<< dendl
;
10462 if (!!(fh
->mode
& CEPH_FILE_MODE_LAZY
) == !!enable
)
10465 int orig_mode
= fh
->mode
;
10467 fh
->mode
|= CEPH_FILE_MODE_LAZY
;
10468 in
->get_open_ref(fh
->mode
);
10469 in
->put_open_ref(orig_mode
);
10470 check_caps(in
, CHECK_CAPS_NODELAY
);
10472 fh
->mode
&= ~CEPH_FILE_MODE_LAZY
;
10473 in
->get_open_ref(fh
->mode
);
10474 in
->put_open_ref(orig_mode
);
10481 int Client::lazyio(int fd
, int enable
)
10483 std::lock_guard
l(client_lock
);
10484 Fh
*f
= get_filehandle(fd
);
10488 return _lazyio(f
, enable
);
10491 int Client::ll_lazyio(Fh
*fh
, int enable
)
10493 std::lock_guard
lock(client_lock
);
10494 ldout(cct
, 3) << __func__
<< " " << fh
<< " " << fh
->inode
->ino
<< " " << !!enable
<< dendl
;
10495 tout(cct
) << __func__
<< std::endl
;
10497 return _lazyio(fh
, enable
);
10500 int Client::lazyio_propagate(int fd
, loff_t offset
, size_t count
)
10502 std::lock_guard
l(client_lock
);
10503 ldout(cct
, 3) << "op: client->lazyio_propagate(" << fd
10504 << ", " << offset
<< ", " << count
<< ")" << dendl
;
10506 Fh
*f
= get_filehandle(fd
);
10516 int Client::lazyio_synchronize(int fd
, loff_t offset
, size_t count
)
10518 std::lock_guard
l(client_lock
);
10519 ldout(cct
, 3) << "op: client->lazyio_synchronize(" << fd
10520 << ", " << offset
<< ", " << count
<< ")" << dendl
;
10522 Fh
*f
= get_filehandle(fd
);
10525 Inode
*in
= f
->inode
.get();
10528 if (_release(in
)) {
10529 int r
=_getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
10537 // =============================
10540 int Client::mksnap(const char *relpath
, const char *name
, const UserPerm
& perm
)
10542 std::lock_guard
l(client_lock
);
10547 filepath
path(relpath
);
10549 int r
= path_walk(path
, &in
, perm
);
10552 if (cct
->_conf
->client_permissions
) {
10553 r
= may_create(in
.get(), perm
);
10557 Inode
*snapdir
= open_snapdir(in
.get());
10558 return _mkdir(snapdir
, name
, 0, perm
);
10561 int Client::rmsnap(const char *relpath
, const char *name
, const UserPerm
& perms
)
10563 std::lock_guard
l(client_lock
);
10568 filepath
path(relpath
);
10570 int r
= path_walk(path
, &in
, perms
);
10573 if (cct
->_conf
->client_permissions
) {
10574 r
= may_delete(in
.get(), NULL
, perms
);
10578 Inode
*snapdir
= open_snapdir(in
.get());
10579 return _rmdir(snapdir
, name
, perms
);
10582 // =============================
10585 int Client::get_caps_issued(int fd
) {
10587 std::lock_guard
lock(client_lock
);
10592 Fh
*f
= get_filehandle(fd
);
10596 return f
->inode
->caps_issued();
10599 int Client::get_caps_issued(const char *path
, const UserPerm
& perms
)
10601 std::lock_guard
lock(client_lock
);
10608 int r
= path_walk(p
, &in
, perms
, true);
10611 return in
->caps_issued();
10614 // =========================================
10617 Inode
*Client::open_snapdir(Inode
*diri
)
10620 vinodeno_t
vino(diri
->ino
, CEPH_SNAPDIR
);
10621 if (!inode_map
.count(vino
)) {
10622 in
= new Inode(this, vino
, &diri
->layout
);
10624 in
->ino
= diri
->ino
;
10625 in
->snapid
= CEPH_SNAPDIR
;
10626 in
->mode
= diri
->mode
;
10627 in
->uid
= diri
->uid
;
10628 in
->gid
= diri
->gid
;
10630 in
->mtime
= diri
->mtime
;
10631 in
->ctime
= diri
->ctime
;
10632 in
->btime
= diri
->btime
;
10633 in
->size
= diri
->size
;
10634 in
->change_attr
= diri
->change_attr
;
10636 in
->dirfragtree
.clear();
10637 in
->snapdir_parent
= diri
;
10638 diri
->flags
|= I_SNAPDIR_OPEN
;
10639 inode_map
[vino
] = in
;
10640 if (use_faked_inos())
10641 _assign_faked_ino(in
);
10642 ldout(cct
, 10) << "open_snapdir created snapshot inode " << *in
<< dendl
;
10644 in
= inode_map
[vino
];
10645 ldout(cct
, 10) << "open_snapdir had snapshot inode " << *in
<< dendl
;
10650 int Client::ll_lookup(Inode
*parent
, const char *name
, struct stat
*attr
,
10651 Inode
**out
, const UserPerm
& perms
)
10653 std::lock_guard
lock(client_lock
);
10654 vinodeno_t vparent
= _get_vino(parent
);
10655 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
<< dendl
;
10656 tout(cct
) << __func__
<< std::endl
;
10657 tout(cct
) << name
<< std::endl
;
10663 if (!fuse_default_permissions
) {
10664 if (strcmp(name
, ".") && strcmp(name
, "..")) {
10665 r
= may_lookup(parent
, perms
);
10671 string
dname(name
);
10674 r
= _lookup(parent
, dname
, CEPH_STAT_CAP_INODE_ALL
, &in
, perms
);
10681 fill_stat(in
, attr
);
10685 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
10686 << " -> " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
10687 tout(cct
) << attr
->st_ino
<< std::endl
;
10692 int Client::ll_lookup_inode(
10693 struct inodeno_t ino
,
10694 const UserPerm
& perms
,
10697 ceph_assert(inode
!= NULL
);
10698 std::lock_guard
lock(client_lock
);
10699 ldout(cct
, 3) << "ll_lookup_inode " << ino
<< dendl
;
10704 // Num1: get inode and *inode
10705 int r
= _lookup_ino(ino
, perms
, inode
);
10709 ceph_assert(*inode
!= NULL
);
10711 if (!(*inode
)->dentries
.empty()) {
10712 ldout(cct
, 8) << __func__
<< " dentry already present" << dendl
;
10716 if ((*inode
)->is_root()) {
10717 ldout(cct
, 8) << "ino is root, no parent" << dendl
;
10721 // Num2: Request the parent inode, so that we can look up the name
10723 r
= _lookup_parent(*inode
, perms
, &parent
);
10725 _ll_forget(*inode
, 1);
10729 ceph_assert(parent
!= NULL
);
10731 // Num3: Finally, get the name (dentry) of the requested inode
10732 r
= _lookup_name(*inode
, parent
, perms
);
10734 // Unexpected error
10735 _ll_forget(parent
, 1);
10736 _ll_forget(*inode
, 1);
10740 _ll_forget(parent
, 1);
10744 int Client::ll_lookupx(Inode
*parent
, const char *name
, Inode
**out
,
10745 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
10746 const UserPerm
& perms
)
10748 std::lock_guard
lock(client_lock
);
10749 vinodeno_t vparent
= _get_vino(parent
);
10750 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
<< dendl
;
10751 tout(cct
) << "ll_lookupx" << std::endl
;
10752 tout(cct
) << name
<< std::endl
;
10758 if (!fuse_default_permissions
) {
10759 r
= may_lookup(parent
, perms
);
10764 string
dname(name
);
10767 unsigned mask
= statx_to_mask(flags
, want
);
10768 r
= _lookup(parent
, dname
, mask
, &in
, perms
);
10774 fill_statx(in
, mask
, stx
);
10778 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
10779 << " -> " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
10780 tout(cct
) << stx
->stx_ino
<< std::endl
;
10785 int Client::ll_walk(const char* name
, Inode
**out
, struct ceph_statx
*stx
,
10786 unsigned int want
, unsigned int flags
, const UserPerm
& perms
)
10788 std::lock_guard
lock(client_lock
);
10793 filepath
fp(name
, 0);
10796 unsigned mask
= statx_to_mask(flags
, want
);
10798 ldout(cct
, 3) << __func__
<< " " << name
<< dendl
;
10799 tout(cct
) << __func__
<< std::endl
;
10800 tout(cct
) << name
<< std::endl
;
10802 rc
= path_walk(fp
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
);
10804 /* zero out mask, just in case... */
10811 fill_statx(in
, mask
, stx
);
10818 void Client::_ll_get(Inode
*in
)
10820 if (in
->ll_ref
== 0) {
10822 if (in
->is_dir() && !in
->dentries
.empty()) {
10823 ceph_assert(in
->dentries
.size() == 1); // dirs can't be hard-linked
10824 in
->get_first_parent()->get(); // pin dentry
10826 if (in
->snapid
!= CEPH_NOSNAP
)
10827 ll_snap_ref
[in
->snapid
]++;
10830 ldout(cct
, 20) << __func__
<< " " << in
<< " " << in
->ino
<< " -> " << in
->ll_ref
<< dendl
;
10833 int Client::_ll_put(Inode
*in
, uint64_t num
)
10836 ldout(cct
, 20) << __func__
<< " " << in
<< " " << in
->ino
<< " " << num
<< " -> " << in
->ll_ref
<< dendl
;
10837 if (in
->ll_ref
== 0) {
10838 if (in
->is_dir() && !in
->dentries
.empty()) {
10839 ceph_assert(in
->dentries
.size() == 1); // dirs can't be hard-linked
10840 in
->get_first_parent()->put(); // unpin dentry
10842 if (in
->snapid
!= CEPH_NOSNAP
) {
10843 auto p
= ll_snap_ref
.find(in
->snapid
);
10844 ceph_assert(p
!= ll_snap_ref
.end());
10845 ceph_assert(p
->second
> 0);
10846 if (--p
->second
== 0)
10847 ll_snap_ref
.erase(p
);
10856 void Client::_ll_drop_pins()
10858 ldout(cct
, 10) << __func__
<< dendl
;
10859 std::set
<InodeRef
> to_be_put
; //this set will be deconstructed item by item when exit
10860 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator next
;
10861 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
10862 it
!= inode_map
.end();
10864 Inode
*in
= it
->second
;
10868 to_be_put
.insert(in
);
10869 _ll_put(in
, in
->ll_ref
);
10874 bool Client::_ll_forget(Inode
*in
, uint64_t count
)
10876 inodeno_t ino
= in
->ino
;
10878 ldout(cct
, 8) << __func__
<< " " << ino
<< " " << count
<< dendl
;
10879 tout(cct
) << __func__
<< std::endl
;
10880 tout(cct
) << ino
.val
<< std::endl
;
10881 tout(cct
) << count
<< std::endl
;
10883 // Ignore forget if we're no longer mounted
10887 if (ino
== 1) return true; // ignore forget on root.
10890 if (in
->ll_ref
< count
) {
10891 ldout(cct
, 1) << "WARNING: ll_forget on " << ino
<< " " << count
10892 << ", which only has ll_ref=" << in
->ll_ref
<< dendl
;
10893 _ll_put(in
, in
->ll_ref
);
10896 if (_ll_put(in
, count
) == 0)
10903 bool Client::ll_forget(Inode
*in
, uint64_t count
)
10905 std::lock_guard
lock(client_lock
);
10906 return _ll_forget(in
, count
);
10909 bool Client::ll_put(Inode
*in
)
10911 /* ll_forget already takes the lock */
10912 return ll_forget(in
, 1);
10915 int Client::ll_get_snap_ref(snapid_t snap
)
10917 std::lock_guard
lock(client_lock
);
10918 auto p
= ll_snap_ref
.find(snap
);
10919 if (p
!= ll_snap_ref
.end())
10924 snapid_t
Client::ll_get_snapid(Inode
*in
)
10926 std::lock_guard
lock(client_lock
);
10930 Inode
*Client::ll_get_inode(ino_t ino
)
10932 std::lock_guard
lock(client_lock
);
10937 vinodeno_t vino
= _map_faked_ino(ino
);
10938 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
10939 if (p
== inode_map
.end())
10941 Inode
*in
= p
->second
;
10946 Inode
*Client::ll_get_inode(vinodeno_t vino
)
10948 std::lock_guard
lock(client_lock
);
10953 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
10954 if (p
== inode_map
.end())
10956 Inode
*in
= p
->second
;
10961 int Client::_ll_getattr(Inode
*in
, int caps
, const UserPerm
& perms
)
10963 vinodeno_t vino
= _get_vino(in
);
10965 ldout(cct
, 8) << __func__
<< " " << vino
<< dendl
;
10966 tout(cct
) << __func__
<< std::endl
;
10967 tout(cct
) << vino
.ino
.val
<< std::endl
;
10969 if (vino
.snapid
< CEPH_NOSNAP
)
10972 return _getattr(in
, caps
, perms
);
10975 int Client::ll_getattr(Inode
*in
, struct stat
*attr
, const UserPerm
& perms
)
10977 std::lock_guard
lock(client_lock
);
10982 int res
= _ll_getattr(in
, CEPH_STAT_CAP_INODE_ALL
, perms
);
10985 fill_stat(in
, attr
);
10986 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
10990 int Client::ll_getattrx(Inode
*in
, struct ceph_statx
*stx
, unsigned int want
,
10991 unsigned int flags
, const UserPerm
& perms
)
10993 std::lock_guard
lock(client_lock
);
10999 unsigned mask
= statx_to_mask(flags
, want
);
11001 if (mask
&& !in
->caps_issued_mask(mask
, true))
11002 res
= _ll_getattr(in
, mask
, perms
);
11005 fill_statx(in
, mask
, stx
);
11006 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
11010 int Client::_ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
11011 const UserPerm
& perms
, InodeRef
*inp
)
11013 vinodeno_t vino
= _get_vino(in
);
11015 ldout(cct
, 8) << __func__
<< " " << vino
<< " mask " << hex
<< mask
<< dec
11017 tout(cct
) << __func__
<< std::endl
;
11018 tout(cct
) << vino
.ino
.val
<< std::endl
;
11019 tout(cct
) << stx
->stx_mode
<< std::endl
;
11020 tout(cct
) << stx
->stx_uid
<< std::endl
;
11021 tout(cct
) << stx
->stx_gid
<< std::endl
;
11022 tout(cct
) << stx
->stx_size
<< std::endl
;
11023 tout(cct
) << stx
->stx_mtime
<< std::endl
;
11024 tout(cct
) << stx
->stx_atime
<< std::endl
;
11025 tout(cct
) << stx
->stx_btime
<< std::endl
;
11026 tout(cct
) << mask
<< std::endl
;
11028 if (!fuse_default_permissions
) {
11029 int res
= may_setattr(in
, stx
, mask
, perms
);
11034 mask
&= ~(CEPH_SETATTR_MTIME_NOW
| CEPH_SETATTR_ATIME_NOW
);
11036 return __setattrx(in
, stx
, mask
, perms
, inp
);
11039 int Client::ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
11040 const UserPerm
& perms
)
11042 std::lock_guard
lock(client_lock
);
11047 InodeRef
target(in
);
11048 int res
= _ll_setattrx(in
, stx
, mask
, perms
, &target
);
11050 ceph_assert(in
== target
.get());
11051 fill_statx(in
, in
->caps_issued(), stx
);
11054 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
11058 int Client::ll_setattr(Inode
*in
, struct stat
*attr
, int mask
,
11059 const UserPerm
& perms
)
11061 struct ceph_statx stx
;
11062 stat_to_statx(attr
, &stx
);
11064 std::lock_guard
lock(client_lock
);
11069 InodeRef
target(in
);
11070 int res
= _ll_setattrx(in
, &stx
, mask
, perms
, &target
);
11072 ceph_assert(in
== target
.get());
11073 fill_stat(in
, attr
);
11076 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
11084 int Client::getxattr(const char *path
, const char *name
, void *value
, size_t size
,
11085 const UserPerm
& perms
)
11087 std::lock_guard
lock(client_lock
);
11093 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
11096 return _getxattr(in
, name
, value
, size
, perms
);
11099 int Client::lgetxattr(const char *path
, const char *name
, void *value
, size_t size
,
11100 const UserPerm
& perms
)
11102 std::lock_guard
lock(client_lock
);
11108 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
11111 return _getxattr(in
, name
, value
, size
, perms
);
11114 int Client::fgetxattr(int fd
, const char *name
, void *value
, size_t size
,
11115 const UserPerm
& perms
)
11117 std::lock_guard
lock(client_lock
);
11122 Fh
*f
= get_filehandle(fd
);
11125 return _getxattr(f
->inode
, name
, value
, size
, perms
);
11128 int Client::listxattr(const char *path
, char *list
, size_t size
,
11129 const UserPerm
& perms
)
11131 std::lock_guard
lock(client_lock
);
11137 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
11140 return Client::_listxattr(in
.get(), list
, size
, perms
);
11143 int Client::llistxattr(const char *path
, char *list
, size_t size
,
11144 const UserPerm
& perms
)
11146 std::lock_guard
lock(client_lock
);
11152 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
11155 return Client::_listxattr(in
.get(), list
, size
, perms
);
11158 int Client::flistxattr(int fd
, char *list
, size_t size
, const UserPerm
& perms
)
11160 std::lock_guard
lock(client_lock
);
11165 Fh
*f
= get_filehandle(fd
);
11168 return Client::_listxattr(f
->inode
.get(), list
, size
, perms
);
11171 int Client::removexattr(const char *path
, const char *name
,
11172 const UserPerm
& perms
)
11174 std::lock_guard
lock(client_lock
);
11180 int r
= Client::path_walk(path
, &in
, perms
, true);
11183 return _removexattr(in
, name
, perms
);
11186 int Client::lremovexattr(const char *path
, const char *name
,
11187 const UserPerm
& perms
)
11189 std::lock_guard
lock(client_lock
);
11195 int r
= Client::path_walk(path
, &in
, perms
, false);
11198 return _removexattr(in
, name
, perms
);
11201 int Client::fremovexattr(int fd
, const char *name
, const UserPerm
& perms
)
11203 std::lock_guard
lock(client_lock
);
11208 Fh
*f
= get_filehandle(fd
);
11211 return _removexattr(f
->inode
, name
, perms
);
11214 int Client::setxattr(const char *path
, const char *name
, const void *value
,
11215 size_t size
, int flags
, const UserPerm
& perms
)
11217 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11219 std::lock_guard
lock(client_lock
);
11225 int r
= Client::path_walk(path
, &in
, perms
, true);
11228 return _setxattr(in
, name
, value
, size
, flags
, perms
);
11231 int Client::lsetxattr(const char *path
, const char *name
, const void *value
,
11232 size_t size
, int flags
, const UserPerm
& perms
)
11234 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11236 std::lock_guard
lock(client_lock
);
11242 int r
= Client::path_walk(path
, &in
, perms
, false);
11245 return _setxattr(in
, name
, value
, size
, flags
, perms
);
11248 int Client::fsetxattr(int fd
, const char *name
, const void *value
, size_t size
,
11249 int flags
, const UserPerm
& perms
)
11251 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11253 std::lock_guard
lock(client_lock
);
11258 Fh
*f
= get_filehandle(fd
);
11261 return _setxattr(f
->inode
, name
, value
, size
, flags
, perms
);
11264 int Client::_getxattr(Inode
*in
, const char *name
, void *value
, size_t size
,
11265 const UserPerm
& perms
)
11269 const VXattr
*vxattr
= _match_vxattr(in
, name
);
11273 // Do a force getattr to get the latest quota before returning
11274 // a value to userspace.
11276 if (vxattr
->flags
& VXATTR_RSTAT
) {
11277 flags
|= CEPH_STAT_RSTAT
;
11279 r
= _getattr(in
, flags
, perms
, true);
11281 // Error from getattr!
11285 // call pointer-to-member function
11287 if (!(vxattr
->exists_cb
&& !(this->*(vxattr
->exists_cb
))(in
))) {
11288 r
= (this->*(vxattr
->getxattr_cb
))(in
, buf
, sizeof(buf
));
11294 if (r
> (int)size
) {
11296 } else if (r
> 0) {
11297 memcpy(value
, buf
, r
);
11303 if (acl_type
== NO_ACL
&& !strncmp(name
, "system.", 7)) {
11308 r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
11312 if (in
->xattrs
.count(n
)) {
11313 r
= in
->xattrs
[n
].length();
11314 if (r
> 0 && size
!= 0) {
11315 if (size
>= (unsigned)r
)
11316 memcpy(value
, in
->xattrs
[n
].c_str(), r
);
11323 ldout(cct
, 8) << "_getxattr(" << in
->ino
<< ", \"" << name
<< "\", " << size
<< ") = " << r
<< dendl
;
// InodeRef overload of _getxattr: when client-side permission enforcement is
// configured (cct->_conf->client_permissions), checks MAY_READ access to the
// xattr via xattr_permission() before delegating to the raw-pointer overload.
// NOTE(review): extraction dropped interior lines 11332-11334 here — presumably
// the early return on a negative permission result; confirm against upstream.
11327 int Client::_getxattr(InodeRef
&in
, const char *name
, void *value
, size_t size
,
11328 const UserPerm
& perms
)
11330 if (cct
->_conf
->client_permissions
) {
11331 int r
= xattr_permission(in
.get(), name
, MAY_READ
, perms
);
// Delegate to _getxattr(Inode*, ...) for the actual xattr/vxattr lookup.
11335 return _getxattr(in
.get(), name
, value
, size
, perms
);
11338 int Client::ll_getxattr(Inode
*in
, const char *name
, void *value
,
11339 size_t size
, const UserPerm
& perms
)
11341 std::lock_guard
lock(client_lock
);
11346 vinodeno_t vino
= _get_vino(in
);
11348 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
11349 tout(cct
) << __func__
<< std::endl
;
11350 tout(cct
) << vino
.ino
.val
<< std::endl
;
11351 tout(cct
) << name
<< std::endl
;
11353 if (!fuse_default_permissions
) {
11354 int r
= xattr_permission(in
, name
, MAY_READ
, perms
);
11359 return _getxattr(in
, name
, value
, size
, perms
);
11362 int Client::_listxattr(Inode
*in
, char *name
, size_t size
,
11363 const UserPerm
& perms
)
11365 bool len_only
= (size
== 0);
11366 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
11372 for (const auto& p
: in
->xattrs
) {
11373 size_t this_len
= p
.first
.length() + 1;
11378 if (this_len
> size
) {
11383 memcpy(name
, p
.first
.c_str(), this_len
);
11388 const VXattr
*vxattr
;
11389 for (vxattr
= _get_vxattrs(in
); vxattr
&& !vxattr
->name
.empty(); vxattr
++) {
11390 if (vxattr
->hidden
)
11392 // call pointer-to-member function
11393 if (vxattr
->exists_cb
&& !(this->*(vxattr
->exists_cb
))(in
))
11396 size_t this_len
= vxattr
->name
.length() + 1;
11401 if (this_len
> size
) {
11406 memcpy(name
, vxattr
->name
.c_str(), this_len
);
11411 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ", " << size
<< ") = " << r
<< dendl
;
11415 int Client::ll_listxattr(Inode
*in
, char *names
, size_t size
,
11416 const UserPerm
& perms
)
11418 std::lock_guard
lock(client_lock
);
11423 vinodeno_t vino
= _get_vino(in
);
11425 ldout(cct
, 3) << __func__
<< " " << vino
<< " size " << size
<< dendl
;
11426 tout(cct
) << __func__
<< std::endl
;
11427 tout(cct
) << vino
.ino
.val
<< std::endl
;
11428 tout(cct
) << size
<< std::endl
;
11430 return _listxattr(in
, names
, size
, perms
);
11433 int Client::_do_setxattr(Inode
*in
, const char *name
, const void *value
,
11434 size_t size
, int flags
, const UserPerm
& perms
)
11437 int xattr_flags
= 0;
11439 xattr_flags
|= CEPH_XATTR_REMOVE
;
11440 if (flags
& XATTR_CREATE
)
11441 xattr_flags
|= CEPH_XATTR_CREATE
;
11442 if (flags
& XATTR_REPLACE
)
11443 xattr_flags
|= CEPH_XATTR_REPLACE
;
11445 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETXATTR
);
11447 in
->make_nosnap_relative_path(path
);
11448 req
->set_filepath(path
);
11449 req
->set_string2(name
);
11450 req
->set_inode(in
);
11451 req
->head
.args
.setxattr
.flags
= xattr_flags
;
11454 assert (value
|| size
== 0);
11455 bl
.append((const char*)value
, size
);
11458 int res
= make_request(req
, perms
);
11461 ldout(cct
, 3) << __func__
<< "(" << in
->ino
<< ", \"" << name
<< "\") = " <<
11466 int Client::_setxattr(Inode
*in
, const char *name
, const void *value
,
11467 size_t size
, int flags
, const UserPerm
& perms
)
11469 if (in
->snapid
!= CEPH_NOSNAP
) {
11473 bool posix_acl_xattr
= false;
11474 if (acl_type
== POSIX_ACL
)
11475 posix_acl_xattr
= !strncmp(name
, "system.", 7);
11477 if (strncmp(name
, "user.", 5) &&
11478 strncmp(name
, "security.", 9) &&
11479 strncmp(name
, "trusted.", 8) &&
11480 strncmp(name
, "ceph.", 5) &&
11482 return -EOPNOTSUPP
;
11484 bool check_realm
= false;
11486 if (posix_acl_xattr
) {
11487 if (!strcmp(name
, ACL_EA_ACCESS
)) {
11488 mode_t new_mode
= in
->mode
;
11490 int ret
= posix_acl_equiv_mode(value
, size
, &new_mode
);
11497 if (new_mode
!= in
->mode
) {
11498 struct ceph_statx stx
;
11499 stx
.stx_mode
= new_mode
;
11500 ret
= _do_setattr(in
, &stx
, CEPH_SETATTR_MODE
, perms
, NULL
);
11505 } else if (!strcmp(name
, ACL_EA_DEFAULT
)) {
11507 if (!S_ISDIR(in
->mode
))
11509 int ret
= posix_acl_check(value
, size
);
11518 return -EOPNOTSUPP
;
11521 const VXattr
*vxattr
= _match_vxattr(in
, name
);
11523 if (vxattr
->readonly
)
11524 return -EOPNOTSUPP
;
11525 if (vxattr
->name
.compare(0, 10, "ceph.quota") == 0 && value
)
11526 check_realm
= true;
11530 int ret
= _do_setxattr(in
, name
, value
, size
, flags
, perms
);
11531 if (ret
>= 0 && check_realm
) {
11532 // check if snaprealm was created for quota inode
11533 if (in
->quota
.is_enable() &&
11534 !(in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
))
// InodeRef overload of _setxattr: when client-side permission enforcement is
// configured (cct->_conf->client_permissions), checks MAY_WRITE access to the
// xattr via xattr_permission() before delegating to the raw-pointer overload.
// NOTE(review): extraction dropped lines 11546-11548 — presumably the early
// return when the permission check fails; confirm against upstream.
11541 int Client::_setxattr(InodeRef
&in
, const char *name
, const void *value
,
11542 size_t size
, int flags
, const UserPerm
& perms
)
11544 if (cct
->_conf
->client_permissions
) {
11545 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
// Delegate to _setxattr(Inode*, ...) which builds and sends the MDS request.
11549 return _setxattr(in
.get(), name
, value
, size
, flags
, perms
);
11552 int Client::_setxattr_check_data_pool(string
& name
, string
& value
, const OSDMap
*osdmap
)
11555 if (name
== "layout") {
11556 string::iterator begin
= value
.begin();
11557 string::iterator end
= value
.end();
11558 keys_and_values
<string::iterator
> p
; // create instance of parser
11559 std::map
<string
, string
> m
; // map to receive results
11560 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
11565 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
11566 if (q
->first
== "pool") {
11571 } else if (name
== "layout.pool") {
11575 if (tmp
.length()) {
11578 pool
= boost::lexical_cast
<unsigned>(tmp
);
11579 if (!osdmap
->have_pg_pool(pool
))
11581 } catch (boost::bad_lexical_cast
const&) {
11582 pool
= osdmap
->lookup_pg_pool_name(tmp
);
11592 void Client::_setxattr_maybe_wait_for_osdmap(const char *name
, const void *value
, size_t size
)
11594 // For setting pool of layout, MetaRequest need osdmap epoch.
11595 // There is a race which create a new data pool but client and mds both don't have.
11596 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11597 if (strcmp(name
, "ceph.file.layout.pool") == 0 || strcmp(name
, "ceph.dir.layout.pool") == 0 ||
11598 strcmp(name
, "ceph.file.layout") == 0 || strcmp(name
, "ceph.dir.layout") == 0) {
11599 string
rest(strstr(name
, "layout"));
11600 string
v((const char*)value
, size
);
11601 int r
= objecter
->with_osdmap([&](const OSDMap
& o
) {
11602 return _setxattr_check_data_pool(rest
, v
, &o
);
11605 if (r
== -ENOENT
) {
11607 objecter
->wait_for_latest_osdmap(&ctx
);
11613 int Client::ll_setxattr(Inode
*in
, const char *name
, const void *value
,
11614 size_t size
, int flags
, const UserPerm
& perms
)
11616 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11618 std::lock_guard
lock(client_lock
);
11623 vinodeno_t vino
= _get_vino(in
);
11625 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
11626 tout(cct
) << __func__
<< std::endl
;
11627 tout(cct
) << vino
.ino
.val
<< std::endl
;
11628 tout(cct
) << name
<< std::endl
;
11630 if (!fuse_default_permissions
) {
11631 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
11635 return _setxattr(in
, name
, value
, size
, flags
, perms
);
11638 int Client::_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
11640 if (in
->snapid
!= CEPH_NOSNAP
) {
11644 // same xattrs supported by kernel client
11645 if (strncmp(name
, "user.", 5) &&
11646 strncmp(name
, "system.", 7) &&
11647 strncmp(name
, "security.", 9) &&
11648 strncmp(name
, "trusted.", 8) &&
11649 strncmp(name
, "ceph.", 5))
11650 return -EOPNOTSUPP
;
11652 const VXattr
*vxattr
= _match_vxattr(in
, name
);
11653 if (vxattr
&& vxattr
->readonly
)
11654 return -EOPNOTSUPP
;
11656 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_RMXATTR
);
11658 in
->make_nosnap_relative_path(path
);
11659 req
->set_filepath(path
);
11660 req
->set_filepath2(name
);
11661 req
->set_inode(in
);
11663 int res
= make_request(req
, perms
);
11666 ldout(cct
, 8) << "_removexattr(" << in
->ino
<< ", \"" << name
<< "\") = " << res
<< dendl
;
// InodeRef overload of _removexattr: when client-side permission enforcement
// is configured (cct->_conf->client_permissions), checks MAY_WRITE access via
// xattr_permission() before delegating to the raw-pointer overload.
// NOTE(review): extraction dropped lines 11674-11676 — presumably the early
// return on a failed permission check; confirm against upstream.
11670 int Client::_removexattr(InodeRef
&in
, const char *name
, const UserPerm
& perms
)
11672 if (cct
->_conf
->client_permissions
) {
11673 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
// Delegate to _removexattr(Inode*, ...) which issues CEPH_MDS_OP_RMXATTR.
11677 return _removexattr(in
.get(), name
, perms
);
11680 int Client::ll_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
11682 std::lock_guard
lock(client_lock
);
11687 vinodeno_t vino
= _get_vino(in
);
11689 ldout(cct
, 3) << "ll_removexattr " << vino
<< " " << name
<< dendl
;
11690 tout(cct
) << "ll_removexattr" << std::endl
;
11691 tout(cct
) << vino
.ino
.val
<< std::endl
;
11692 tout(cct
) << name
<< std::endl
;
11694 if (!fuse_default_permissions
) {
11695 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
11700 return _removexattr(in
, name
, perms
);
// "exists" callback for the "ceph.quota" virtual xattr: report it as present
// only when a quota is enabled on this inode AND the inode is the root of its
// own snaprealm (snaprealm->ino == in->ino), i.e. the realm created for the
// quota inode is in place (cf. the check in _setxattr after quota changes).
11703 bool Client::_vxattrcb_quota_exists(Inode
*in
)
11705 return in
->quota
.is_enable() &&
11706 in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
;
// Value callback for "ceph.quota": formats both quota limits into val as
// "max_bytes=<n> max_files=<n>". Returns snprintf's would-be length, so the
// caller (_getxattr) can detect truncation by comparing against size.
11708 size_t Client::_vxattrcb_quota(Inode
*in
, char *val
, size_t size
)
11710 return snprintf(val
, size
,
11711 "max_bytes=%lld max_files=%lld",
11712 (long long int)in
->quota
.max_bytes
,
11713 (long long int)in
->quota
.max_files
);
// Value callback for "ceph.quota.max_bytes": decimal byte limit of the quota.
11715 size_t Client::_vxattrcb_quota_max_bytes(Inode
*in
, char *val
, size_t size
)
11717 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_bytes
);
// Value callback for "ceph.quota.max_files": decimal file-count limit.
11719 size_t Client::_vxattrcb_quota_max_files(Inode
*in
, char *val
, size_t size
)
11721 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_files
);
// "exists" callback for the "ceph.{dir,file}.layout" vxattrs: the layout is
// reported as present only when it differs from a default-constructed
// file_layout_t (i.e. an explicit layout has been set on this inode).
11724 bool Client::_vxattrcb_layout_exists(Inode
*in
)
11726 return in
->layout
!= file_layout_t();
// Value callback for the composite "ceph.{dir,file}.layout" vxattr: serializes
// stripe_unit / stripe_count / object_size, then appends the pool — by name if
// the current OSDMap knows the pool id (resolved under objecter->with_osdmap),
// and an optional " pool_namespace=" suffix when pool_ns is non-empty.
// NOTE(review): extraction dropped line 11739 — presumably the "else" that
// selects the numeric pool-id branch below; confirm against upstream.
11728 size_t Client::_vxattrcb_layout(Inode
*in
, char *val
, size_t size
)
11730 int r
= snprintf(val
, size
,
11731 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
11732 (unsigned long long)in
->layout
.stripe_unit
,
11733 (unsigned long long)in
->layout
.stripe_count
,
11734 (unsigned long long)in
->layout
.object_size
);
11735 objecter
->with_osdmap([&](const OSDMap
& o
) {
11736 if (o
.have_pg_pool(in
->layout
.pool_id
))
11737 r
+= snprintf(val
+ r
, size
- r
, "%s",
11738 o
.get_pool_name(in
->layout
.pool_id
).c_str());
11740 r
+= snprintf(val
+ r
, size
- r
, "%" PRIu64
,
11741 (uint64_t)in
->layout
.pool_id
);
11743 if (in
->layout
.pool_ns
.length())
11744 r
+= snprintf(val
+ r
, size
- r
, " pool_namespace=%s",
11745 in
->layout
.pool_ns
.c_str());
// Value callback for "ceph.*.layout.stripe_unit": decimal stripe unit (bytes).
11748 size_t Client::_vxattrcb_layout_stripe_unit(Inode
*in
, char *val
, size_t size
)
11750 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_unit
);
// Value callback for "ceph.*.layout.stripe_count": decimal stripe count.
11752 size_t Client::_vxattrcb_layout_stripe_count(Inode
*in
, char *val
, size_t size
)
11754 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_count
);
// Value callback for "ceph.*.layout.object_size": decimal object size (bytes).
11756 size_t Client::_vxattrcb_layout_object_size(Inode
*in
, char *val
, size_t size
)
11758 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.object_size
);
// Value callback for "ceph.*.layout.pool": prints the pool NAME when the
// current OSDMap (accessed under objecter->with_osdmap) knows the pool id,
// otherwise the numeric pool id.
// NOTE(review): extraction dropped lines 11761-11762 (presumably the
// declaration of r) and 11767 (presumably the "else" selecting the numeric
// branch); confirm against upstream.
11760 size_t Client::_vxattrcb_layout_pool(Inode
*in
, char *val
, size_t size
)
11763 objecter
->with_osdmap([&](const OSDMap
& o
) {
11764 if (o
.have_pg_pool(in
->layout
.pool_id
))
11765 r
= snprintf(val
, size
, "%s", o
.get_pool_name(
11766 in
->layout
.pool_id
).c_str());
11768 r
= snprintf(val
, size
, "%" PRIu64
, (uint64_t)in
->layout
.pool_id
);
// Value callback for "ceph.*.layout.pool_namespace": raw pool_ns string.
11772 size_t Client::_vxattrcb_layout_pool_namespace(Inode
*in
, char *val
, size_t size
)
11774 return snprintf(val
, size
, "%s", in
->layout
.pool_ns
.c_str());
// Value callback for "ceph.dir.entries": total direct children
// (files + subdirectories from this directory's dirstat).
11776 size_t Client::_vxattrcb_dir_entries(Inode
*in
, char *val
, size_t size
)
11778 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->dirstat
.nfiles
+ in
->dirstat
.nsubdirs
));
// Value callback for "ceph.dir.files": direct file count from dirstat.
11780 size_t Client::_vxattrcb_dir_files(Inode
*in
, char *val
, size_t size
)
11782 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nfiles
);
// Value callback for "ceph.dir.subdirs": direct subdirectory count (dirstat).
11784 size_t Client::_vxattrcb_dir_subdirs(Inode
*in
, char *val
, size_t size
)
11786 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nsubdirs
);
// Value callback for "ceph.dir.rentries": recursive entry count
// (rstat.rfiles + rstat.rsubdirs). Registered with VXATTR_RSTAT, so
// _getxattr forces a fresh rstat via _getattr before calling this.
11788 size_t Client::_vxattrcb_dir_rentries(Inode
*in
, char *val
, size_t size
)
11790 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->rstat
.rfiles
+ in
->rstat
.rsubdirs
));
// Value callback for "ceph.dir.rfiles": recursive file count (rstat).
11792 size_t Client::_vxattrcb_dir_rfiles(Inode
*in
, char *val
, size_t size
)
11794 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rfiles
);
// Value callback for "ceph.dir.rsubdirs": recursive subdir count (rstat).
11796 size_t Client::_vxattrcb_dir_rsubdirs(Inode
*in
, char *val
, size_t size
)
11798 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rsubdirs
);
// Value callback for "ceph.dir.rbytes": recursive byte total (rstat).
11800 size_t Client::_vxattrcb_dir_rbytes(Inode
*in
, char *val
, size_t size
)
11802 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rbytes
);
// Value callback for "ceph.dir.rctime": recursive ctime formatted as
// "<sec>.<nsec>" with the nanosecond field zero-padded to 9 digits.
11804 size_t Client::_vxattrcb_dir_rctime(Inode
*in
, char *val
, size_t size
)
11806 return snprintf(val
, size
, "%ld.%09ld", (long)in
->rstat
.rctime
.sec(),
11807 (long)in
->rstat
.rctime
.nsec());
// "exists" callback for "ceph.dir.pin": -ENODATA stored in dir_pin is the
// sentinel for "no pin set", so the vxattr exists for any other value.
11809 bool Client::_vxattrcb_dir_pin_exists(Inode
*in
)
11811 return in
->dir_pin
!= -ENODATA
;
// Value callback for "ceph.dir.pin": the pinned MDS rank as a decimal.
11813 size_t Client::_vxattrcb_dir_pin(Inode
*in
, char *val
, size_t size
)
11815 return snprintf(val
, size
, "%ld", (long)in
->dir_pin
);
// "exists" callback for "ceph.snap.btime": present only when the snapshot
// birth time has been set (non-zero timestamp).
11818 bool Client::_vxattrcb_snap_btime_exists(Inode
*in
)
11820 return !in
->snap_btime
.is_zero();
// Value callback for "ceph.snap.btime": snapshot birth time formatted as
// "<sec>.<nsec>" with a 9-digit zero-padded nanosecond field.
11823 size_t Client::_vxattrcb_snap_btime(Inode
*in
, char *val
, size_t size
)
11825 return snprintf(val
, size
, "%llu.%09lu",
11826 (long long unsigned)in
->snap_btime
.sec(),
11827 (long unsigned)in
->snap_btime
.nsec());
11830 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
11831 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
11833 #define XATTR_NAME_CEPH(_type, _name) \
11835 name: CEPH_XATTR_NAME(_type, _name), \
11836 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11842 #define XATTR_NAME_CEPH2(_type, _name, _flags) \
11844 name: CEPH_XATTR_NAME(_type, _name), \
11845 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11851 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
11853 name: CEPH_XATTR_NAME2(_type, _name, _field), \
11854 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
11857 exists_cb: &Client::_vxattrcb_layout_exists, \
11860 #define XATTR_QUOTA_FIELD(_type, _name) \
11862 name: CEPH_XATTR_NAME(_type, _name), \
11863 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11866 exists_cb: &Client::_vxattrcb_quota_exists, \
11870 const Client::VXattr
Client::_dir_vxattrs
[] = {
11872 name
: "ceph.dir.layout",
11873 getxattr_cb
: &Client::_vxattrcb_layout
,
11876 exists_cb
: &Client::_vxattrcb_layout_exists
,
11879 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_unit
),
11880 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_count
),
11881 XATTR_LAYOUT_FIELD(dir
, layout
, object_size
),
11882 XATTR_LAYOUT_FIELD(dir
, layout
, pool
),
11883 XATTR_LAYOUT_FIELD(dir
, layout
, pool_namespace
),
11884 XATTR_NAME_CEPH(dir
, entries
),
11885 XATTR_NAME_CEPH(dir
, files
),
11886 XATTR_NAME_CEPH(dir
, subdirs
),
11887 XATTR_NAME_CEPH2(dir
, rentries
, VXATTR_RSTAT
),
11888 XATTR_NAME_CEPH2(dir
, rfiles
, VXATTR_RSTAT
),
11889 XATTR_NAME_CEPH2(dir
, rsubdirs
, VXATTR_RSTAT
),
11890 XATTR_NAME_CEPH2(dir
, rbytes
, VXATTR_RSTAT
),
11891 XATTR_NAME_CEPH2(dir
, rctime
, VXATTR_RSTAT
),
11893 name
: "ceph.quota",
11894 getxattr_cb
: &Client::_vxattrcb_quota
,
11897 exists_cb
: &Client::_vxattrcb_quota_exists
,
11900 XATTR_QUOTA_FIELD(quota
, max_bytes
),
11901 XATTR_QUOTA_FIELD(quota
, max_files
),
11903 name
: "ceph.dir.pin",
11904 getxattr_cb
: &Client::_vxattrcb_dir_pin
,
11907 exists_cb
: &Client::_vxattrcb_dir_pin_exists
,
11911 name
: "ceph.snap.btime",
11912 getxattr_cb
: &Client::_vxattrcb_snap_btime
,
11915 exists_cb
: &Client::_vxattrcb_snap_btime_exists
,
11918 { name
: "" } /* Required table terminator */
11921 const Client::VXattr
Client::_file_vxattrs
[] = {
11923 name
: "ceph.file.layout",
11924 getxattr_cb
: &Client::_vxattrcb_layout
,
11927 exists_cb
: &Client::_vxattrcb_layout_exists
,
11930 XATTR_LAYOUT_FIELD(file
, layout
, stripe_unit
),
11931 XATTR_LAYOUT_FIELD(file
, layout
, stripe_count
),
11932 XATTR_LAYOUT_FIELD(file
, layout
, object_size
),
11933 XATTR_LAYOUT_FIELD(file
, layout
, pool
),
11934 XATTR_LAYOUT_FIELD(file
, layout
, pool_namespace
),
11936 name
: "ceph.snap.btime",
11937 getxattr_cb
: &Client::_vxattrcb_snap_btime
,
11940 exists_cb
: &Client::_vxattrcb_snap_btime_exists
,
11943 { name
: "" } /* Required table terminator */
11946 const Client::VXattr
*Client::_get_vxattrs(Inode
*in
)
11949 return _dir_vxattrs
;
11950 else if (in
->is_file())
11951 return _file_vxattrs
;
11955 const Client::VXattr
*Client::_match_vxattr(Inode
*in
, const char *name
)
11957 if (strncmp(name
, "ceph.", 5) == 0) {
11958 const VXattr
*vxattr
= _get_vxattrs(in
);
11960 while (!vxattr
->name
.empty()) {
11961 if (vxattr
->name
== name
)
11970 int Client::ll_readlink(Inode
*in
, char *buf
, size_t buflen
, const UserPerm
& perms
)
11972 std::lock_guard
lock(client_lock
);
11977 vinodeno_t vino
= _get_vino(in
);
11979 ldout(cct
, 3) << "ll_readlink " << vino
<< dendl
;
11980 tout(cct
) << "ll_readlink" << std::endl
;
11981 tout(cct
) << vino
.ino
.val
<< std::endl
;
11983 for (auto dn
: in
->dentries
) {
11987 int r
= _readlink(in
, buf
, buflen
); // FIXME: no permission checking!
11988 ldout(cct
, 3) << "ll_readlink " << vino
<< " = " << r
<< dendl
;
11992 int Client::_mknod(Inode
*dir
, const char *name
, mode_t mode
, dev_t rdev
,
11993 const UserPerm
& perms
, InodeRef
*inp
)
11995 ldout(cct
, 8) << "_mknod(" << dir
->ino
<< " " << name
<< ", 0" << oct
11996 << mode
<< dec
<< ", " << rdev
<< ", uid " << perms
.uid()
11997 << ", gid " << perms
.gid() << ")" << dendl
;
11999 if (strlen(name
) > NAME_MAX
)
12000 return -ENAMETOOLONG
;
12002 if (dir
->snapid
!= CEPH_NOSNAP
) {
12005 if (is_quota_files_exceeded(dir
, perms
)) {
12009 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_MKNOD
);
12012 dir
->make_nosnap_relative_path(path
);
12013 path
.push_dentry(name
);
12014 req
->set_filepath(path
);
12015 req
->set_inode(dir
);
12016 req
->head
.args
.mknod
.rdev
= rdev
;
12017 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12018 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12020 bufferlist xattrs_bl
;
12021 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
12024 req
->head
.args
.mknod
.mode
= mode
;
12025 if (xattrs_bl
.length() > 0)
12026 req
->set_data(xattrs_bl
);
12029 res
= get_or_create(dir
, name
, &de
);
12032 req
->set_dentry(de
);
12034 res
= make_request(req
, perms
, inp
);
12038 ldout(cct
, 8) << "mknod(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
12046 int Client::ll_mknod(Inode
*parent
, const char *name
, mode_t mode
,
12047 dev_t rdev
, struct stat
*attr
, Inode
**out
,
12048 const UserPerm
& perms
)
12050 std::lock_guard
lock(client_lock
);
12055 vinodeno_t vparent
= _get_vino(parent
);
12057 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
<< dendl
;
12058 tout(cct
) << "ll_mknod" << std::endl
;
12059 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12060 tout(cct
) << name
<< std::endl
;
12061 tout(cct
) << mode
<< std::endl
;
12062 tout(cct
) << rdev
<< std::endl
;
12064 if (!fuse_default_permissions
) {
12065 int r
= may_create(parent
, perms
);
12071 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
12073 fill_stat(in
, attr
);
12076 tout(cct
) << attr
->st_ino
<< std::endl
;
12077 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
12078 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
12083 int Client::ll_mknodx(Inode
*parent
, const char *name
, mode_t mode
,
12084 dev_t rdev
, Inode
**out
,
12085 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
12086 const UserPerm
& perms
)
12088 unsigned caps
= statx_to_mask(flags
, want
);
12089 std::lock_guard
lock(client_lock
);
12094 vinodeno_t vparent
= _get_vino(parent
);
12096 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
<< dendl
;
12097 tout(cct
) << "ll_mknodx" << std::endl
;
12098 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12099 tout(cct
) << name
<< std::endl
;
12100 tout(cct
) << mode
<< std::endl
;
12101 tout(cct
) << rdev
<< std::endl
;
12103 if (!fuse_default_permissions
) {
12104 int r
= may_create(parent
, perms
);
12110 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
12112 fill_statx(in
, caps
, stx
);
12115 tout(cct
) << stx
->stx_ino
<< std::endl
;
12116 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
12117 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12122 int Client::_create(Inode
*dir
, const char *name
, int flags
, mode_t mode
,
12123 InodeRef
*inp
, Fh
**fhp
, int stripe_unit
, int stripe_count
,
12124 int object_size
, const char *data_pool
, bool *created
,
12125 const UserPerm
& perms
)
12127 ldout(cct
, 8) << "_create(" << dir
->ino
<< " " << name
<< ", 0" << oct
<<
12128 mode
<< dec
<< ")" << dendl
;
12130 if (strlen(name
) > NAME_MAX
)
12131 return -ENAMETOOLONG
;
12132 if (dir
->snapid
!= CEPH_NOSNAP
) {
12135 if (is_quota_files_exceeded(dir
, perms
)) {
12139 // use normalized flags to generate cmode
12140 int cflags
= ceph_flags_sys2wire(flags
);
12141 if (cct
->_conf
.get_val
<bool>("client_force_lazyio"))
12142 cflags
|= CEPH_O_LAZY
;
12144 int cmode
= ceph_flags_to_mode(cflags
);
12146 int64_t pool_id
= -1;
12147 if (data_pool
&& *data_pool
) {
12148 pool_id
= objecter
->with_osdmap(
12149 std::mem_fn(&OSDMap::lookup_pg_pool_name
), data_pool
);
12152 if (pool_id
> 0xffffffffll
)
12153 return -ERANGE
; // bummer!
12156 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_CREATE
);
12159 dir
->make_nosnap_relative_path(path
);
12160 path
.push_dentry(name
);
12161 req
->set_filepath(path
);
12162 req
->set_inode(dir
);
12163 req
->head
.args
.open
.flags
= cflags
| CEPH_O_CREAT
;
12165 req
->head
.args
.open
.stripe_unit
= stripe_unit
;
12166 req
->head
.args
.open
.stripe_count
= stripe_count
;
12167 req
->head
.args
.open
.object_size
= object_size
;
12168 if (cct
->_conf
->client_debug_getattr_caps
)
12169 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
12171 req
->head
.args
.open
.mask
= 0;
12172 req
->head
.args
.open
.pool
= pool_id
;
12173 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12174 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12177 bufferlist xattrs_bl
;
12178 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
12181 req
->head
.args
.open
.mode
= mode
;
12182 if (xattrs_bl
.length() > 0)
12183 req
->set_data(xattrs_bl
);
12186 res
= get_or_create(dir
, name
, &de
);
12189 req
->set_dentry(de
);
12191 res
= make_request(req
, perms
, inp
, created
);
12196 /* If the caller passed a value in fhp, do the open */
12198 (*inp
)->get_open_ref(cmode
);
12199 *fhp
= _create_fh(inp
->get(), flags
, cmode
, perms
);
12205 ldout(cct
, 8) << "create(" << path
<< ", 0" << oct
<< mode
<< dec
12206 << " layout " << stripe_unit
12207 << ' ' << stripe_count
12208 << ' ' << object_size
12209 <<") = " << res
<< dendl
;
12218 int Client::_mkdir(Inode
*dir
, const char *name
, mode_t mode
, const UserPerm
& perm
,
12221 ldout(cct
, 8) << "_mkdir(" << dir
->ino
<< " " << name
<< ", 0" << oct
12222 << mode
<< dec
<< ", uid " << perm
.uid()
12223 << ", gid " << perm
.gid() << ")" << dendl
;
12225 if (strlen(name
) > NAME_MAX
)
12226 return -ENAMETOOLONG
;
12228 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
12231 if (is_quota_files_exceeded(dir
, perm
)) {
12234 MetaRequest
*req
= new MetaRequest(dir
->snapid
== CEPH_SNAPDIR
?
12235 CEPH_MDS_OP_MKSNAP
: CEPH_MDS_OP_MKDIR
);
12238 dir
->make_nosnap_relative_path(path
);
12239 path
.push_dentry(name
);
12240 req
->set_filepath(path
);
12241 req
->set_inode(dir
);
12242 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12243 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12246 bufferlist xattrs_bl
;
12247 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perm
);
12250 req
->head
.args
.mkdir
.mode
= mode
;
12251 if (xattrs_bl
.length() > 0)
12252 req
->set_data(xattrs_bl
);
12255 res
= get_or_create(dir
, name
, &de
);
12258 req
->set_dentry(de
);
12260 ldout(cct
, 10) << "_mkdir: making request" << dendl
;
12261 res
= make_request(req
, perm
, inp
);
12262 ldout(cct
, 10) << "_mkdir result is " << res
<< dendl
;
12266 ldout(cct
, 8) << "_mkdir(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
12274 int Client::ll_mkdir(Inode
*parent
, const char *name
, mode_t mode
,
12275 struct stat
*attr
, Inode
**out
, const UserPerm
& perm
)
12277 std::lock_guard
lock(client_lock
);
12282 vinodeno_t vparent
= _get_vino(parent
);
12284 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
<< dendl
;
12285 tout(cct
) << "ll_mkdir" << std::endl
;
12286 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12287 tout(cct
) << name
<< std::endl
;
12288 tout(cct
) << mode
<< std::endl
;
12290 if (!fuse_default_permissions
) {
12291 int r
= may_create(parent
, perm
);
12297 int r
= _mkdir(parent
, name
, mode
, perm
, &in
);
12299 fill_stat(in
, attr
);
12302 tout(cct
) << attr
->st_ino
<< std::endl
;
12303 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
12304 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
12309 int Client::ll_mkdirx(Inode
*parent
, const char *name
, mode_t mode
, Inode
**out
,
12310 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
12311 const UserPerm
& perms
)
12313 std::lock_guard
lock(client_lock
);
12318 vinodeno_t vparent
= _get_vino(parent
);
12320 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
<< dendl
;
12321 tout(cct
) << "ll_mkdirx" << std::endl
;
12322 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12323 tout(cct
) << name
<< std::endl
;
12324 tout(cct
) << mode
<< std::endl
;
12326 if (!fuse_default_permissions
) {
12327 int r
= may_create(parent
, perms
);
12333 int r
= _mkdir(parent
, name
, mode
, perms
, &in
);
12335 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
12341 tout(cct
) << stx
->stx_ino
<< std::endl
;
12342 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
12343 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12348 int Client::_symlink(Inode
*dir
, const char *name
, const char *target
,
12349 const UserPerm
& perms
, InodeRef
*inp
)
12351 ldout(cct
, 8) << "_symlink(" << dir
->ino
<< " " << name
<< ", " << target
12352 << ", uid " << perms
.uid() << ", gid " << perms
.gid() << ")"
12355 if (strlen(name
) > NAME_MAX
)
12356 return -ENAMETOOLONG
;
12358 if (dir
->snapid
!= CEPH_NOSNAP
) {
12361 if (is_quota_files_exceeded(dir
, perms
)) {
12365 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SYMLINK
);
12368 dir
->make_nosnap_relative_path(path
);
12369 path
.push_dentry(name
);
12370 req
->set_filepath(path
);
12371 req
->set_inode(dir
);
12372 req
->set_string2(target
);
12373 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12374 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12377 int res
= get_or_create(dir
, name
, &de
);
12380 req
->set_dentry(de
);
12382 res
= make_request(req
, perms
, inp
);
12385 ldout(cct
, 8) << "_symlink(\"" << path
<< "\", \"" << target
<< "\") = " <<
12394 int Client::ll_symlink(Inode
*parent
, const char *name
, const char *value
,
12395 struct stat
*attr
, Inode
**out
, const UserPerm
& perms
)
12397 std::lock_guard
lock(client_lock
);
12402 vinodeno_t vparent
= _get_vino(parent
);
12404 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
<< " -> " << value
12406 tout(cct
) << "ll_symlink" << std::endl
;
12407 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12408 tout(cct
) << name
<< std::endl
;
12409 tout(cct
) << value
<< std::endl
;
12411 if (!fuse_default_permissions
) {
12412 int r
= may_create(parent
, perms
);
12418 int r
= _symlink(parent
, name
, value
, perms
, &in
);
12420 fill_stat(in
, attr
);
12423 tout(cct
) << attr
->st_ino
<< std::endl
;
12424 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
12425 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
12430 int Client::ll_symlinkx(Inode
*parent
, const char *name
, const char *value
,
12431 Inode
**out
, struct ceph_statx
*stx
, unsigned want
,
12432 unsigned flags
, const UserPerm
& perms
)
12434 std::lock_guard
lock(client_lock
);
12439 vinodeno_t vparent
= _get_vino(parent
);
12441 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
<< " -> " << value
12443 tout(cct
) << "ll_symlinkx" << std::endl
;
12444 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12445 tout(cct
) << name
<< std::endl
;
12446 tout(cct
) << value
<< std::endl
;
12448 if (!fuse_default_permissions
) {
12449 int r
= may_create(parent
, perms
);
12455 int r
= _symlink(parent
, name
, value
, perms
, &in
);
12457 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
12460 tout(cct
) << stx
->stx_ino
<< std::endl
;
12461 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
12462 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12467 int Client::_unlink(Inode
*dir
, const char *name
, const UserPerm
& perm
)
12469 ldout(cct
, 8) << "_unlink(" << dir
->ino
<< " " << name
12470 << " uid " << perm
.uid() << " gid " << perm
.gid()
12473 if (dir
->snapid
!= CEPH_NOSNAP
) {
12477 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_UNLINK
);
12480 dir
->make_nosnap_relative_path(path
);
12481 path
.push_dentry(name
);
12482 req
->set_filepath(path
);
12488 int res
= get_or_create(dir
, name
, &de
);
12491 req
->set_dentry(de
);
12492 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12493 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12495 res
= _lookup(dir
, name
, 0, &otherin
, perm
);
12499 in
= otherin
.get();
12500 req
->set_other_inode(in
);
12501 in
->break_all_delegs();
12502 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
12504 req
->set_inode(dir
);
12506 res
= make_request(req
, perm
);
12509 ldout(cct
, 8) << "unlink(" << path
<< ") = " << res
<< dendl
;
12517 int Client::ll_unlink(Inode
*in
, const char *name
, const UserPerm
& perm
)
12519 std::lock_guard
lock(client_lock
);
12524 vinodeno_t vino
= _get_vino(in
);
12526 ldout(cct
, 3) << "ll_unlink " << vino
<< " " << name
<< dendl
;
12527 tout(cct
) << "ll_unlink" << std::endl
;
12528 tout(cct
) << vino
.ino
.val
<< std::endl
;
12529 tout(cct
) << name
<< std::endl
;
12531 if (!fuse_default_permissions
) {
12532 int r
= may_delete(in
, name
, perm
);
12536 return _unlink(in
, name
, perm
);
12539 int Client::_rmdir(Inode
*dir
, const char *name
, const UserPerm
& perms
)
12541 ldout(cct
, 8) << "_rmdir(" << dir
->ino
<< " " << name
<< " uid "
12542 << perms
.uid() << " gid " << perms
.gid() << ")" << dendl
;
12544 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
12548 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_RMSNAP
: CEPH_MDS_OP_RMDIR
;
12549 MetaRequest
*req
= new MetaRequest(op
);
12551 dir
->make_nosnap_relative_path(path
);
12552 path
.push_dentry(name
);
12553 req
->set_filepath(path
);
12554 req
->set_inode(dir
);
12556 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12557 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12558 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
12563 int res
= get_or_create(dir
, name
, &de
);
12566 if (op
== CEPH_MDS_OP_RMDIR
)
12567 req
->set_dentry(de
);
12571 res
= _lookup(dir
, name
, 0, &in
, perms
);
12575 if (op
== CEPH_MDS_OP_RMSNAP
) {
12576 unlink(de
, true, true);
12579 req
->set_other_inode(in
.get());
12581 res
= make_request(req
, perms
);
12584 ldout(cct
, 8) << "rmdir(" << path
<< ") = " << res
<< dendl
;
12592 int Client::ll_rmdir(Inode
*in
, const char *name
, const UserPerm
& perms
)
12594 std::lock_guard
lock(client_lock
);
12599 vinodeno_t vino
= _get_vino(in
);
12601 ldout(cct
, 3) << "ll_rmdir " << vino
<< " " << name
<< dendl
;
12602 tout(cct
) << "ll_rmdir" << std::endl
;
12603 tout(cct
) << vino
.ino
.val
<< std::endl
;
12604 tout(cct
) << name
<< std::endl
;
12606 if (!fuse_default_permissions
) {
12607 int r
= may_delete(in
, name
, perms
);
12612 return _rmdir(in
, name
, perms
);
12615 int Client::_rename(Inode
*fromdir
, const char *fromname
, Inode
*todir
, const char *toname
, const UserPerm
& perm
)
12617 ldout(cct
, 8) << "_rename(" << fromdir
->ino
<< " " << fromname
<< " to "
12618 << todir
->ino
<< " " << toname
12619 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")"
12622 if (fromdir
->snapid
!= todir
->snapid
)
12625 int op
= CEPH_MDS_OP_RENAME
;
12626 if (fromdir
->snapid
!= CEPH_NOSNAP
) {
12627 if (fromdir
== todir
&& fromdir
->snapid
== CEPH_SNAPDIR
)
12628 op
= CEPH_MDS_OP_RENAMESNAP
;
12632 if (fromdir
!= todir
) {
12633 Inode
*fromdir_root
=
12634 fromdir
->quota
.is_enable() ? fromdir
: get_quota_root(fromdir
, perm
);
12635 Inode
*todir_root
=
12636 todir
->quota
.is_enable() ? todir
: get_quota_root(todir
, perm
);
12637 if (fromdir_root
!= todir_root
) {
12643 MetaRequest
*req
= new MetaRequest(op
);
12646 fromdir
->make_nosnap_relative_path(from
);
12647 from
.push_dentry(fromname
);
12649 todir
->make_nosnap_relative_path(to
);
12650 to
.push_dentry(toname
);
12651 req
->set_filepath(to
);
12652 req
->set_filepath2(from
);
12655 int res
= get_or_create(fromdir
, fromname
, &oldde
);
12659 res
= get_or_create(todir
, toname
, &de
);
12663 if (op
== CEPH_MDS_OP_RENAME
) {
12664 req
->set_old_dentry(oldde
);
12665 req
->old_dentry_drop
= CEPH_CAP_FILE_SHARED
;
12666 req
->old_dentry_unless
= CEPH_CAP_FILE_EXCL
;
12668 req
->set_dentry(de
);
12669 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12670 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12672 InodeRef oldin
, otherin
;
12673 res
= _lookup(fromdir
, fromname
, 0, &oldin
, perm
);
12677 Inode
*oldinode
= oldin
.get();
12678 oldinode
->break_all_delegs();
12679 req
->set_old_inode(oldinode
);
12680 req
->old_inode_drop
= CEPH_CAP_LINK_SHARED
;
12682 res
= _lookup(todir
, toname
, 0, &otherin
, perm
);
12686 Inode
*in
= otherin
.get();
12687 req
->set_other_inode(in
);
12688 in
->break_all_delegs();
12690 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
12698 req
->set_inode(todir
);
12700 // renamesnap reply contains no tracedn, so we need to invalidate
12702 unlink(oldde
, true, true);
12703 unlink(de
, true, true);
12705 req
->set_inode(todir
);
12708 res
= make_request(req
, perm
, &target
);
12709 ldout(cct
, 10) << "rename result is " << res
<< dendl
;
12711 // renamed item from our cache
12714 ldout(cct
, 8) << "_rename(" << from
<< ", " << to
<< ") = " << res
<< dendl
;
12722 int Client::ll_rename(Inode
*parent
, const char *name
, Inode
*newparent
,
12723 const char *newname
, const UserPerm
& perm
)
12725 std::lock_guard
lock(client_lock
);
12730 vinodeno_t vparent
= _get_vino(parent
);
12731 vinodeno_t vnewparent
= _get_vino(newparent
);
12733 ldout(cct
, 3) << "ll_rename " << vparent
<< " " << name
<< " to "
12734 << vnewparent
<< " " << newname
<< dendl
;
12735 tout(cct
) << "ll_rename" << std::endl
;
12736 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12737 tout(cct
) << name
<< std::endl
;
12738 tout(cct
) << vnewparent
.ino
.val
<< std::endl
;
12739 tout(cct
) << newname
<< std::endl
;
12741 if (!fuse_default_permissions
) {
12742 int r
= may_delete(parent
, name
, perm
);
12745 r
= may_delete(newparent
, newname
, perm
);
12746 if (r
< 0 && r
!= -ENOENT
)
12750 return _rename(parent
, name
, newparent
, newname
, perm
);
12753 int Client::_link(Inode
*in
, Inode
*dir
, const char *newname
, const UserPerm
& perm
, InodeRef
*inp
)
12755 ldout(cct
, 8) << "_link(" << in
->ino
<< " to " << dir
->ino
<< " " << newname
12756 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")" << dendl
;
12758 if (strlen(newname
) > NAME_MAX
)
12759 return -ENAMETOOLONG
;
12761 if (in
->snapid
!= CEPH_NOSNAP
|| dir
->snapid
!= CEPH_NOSNAP
) {
12764 if (is_quota_files_exceeded(dir
, perm
)) {
12768 in
->break_all_delegs();
12769 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LINK
);
12771 filepath
path(newname
, dir
->ino
);
12772 req
->set_filepath(path
);
12773 filepath
existing(in
->ino
);
12774 req
->set_filepath2(existing
);
12776 req
->set_inode(dir
);
12777 req
->inode_drop
= CEPH_CAP_FILE_SHARED
;
12778 req
->inode_unless
= CEPH_CAP_FILE_EXCL
;
12781 int res
= get_or_create(dir
, newname
, &de
);
12784 req
->set_dentry(de
);
12786 res
= make_request(req
, perm
, inp
);
12787 ldout(cct
, 10) << "link result is " << res
<< dendl
;
12790 ldout(cct
, 8) << "link(" << existing
<< ", " << path
<< ") = " << res
<< dendl
;
12798 int Client::ll_link(Inode
*in
, Inode
*newparent
, const char *newname
,
12799 const UserPerm
& perm
)
12801 std::lock_guard
lock(client_lock
);
12806 vinodeno_t vino
= _get_vino(in
);
12807 vinodeno_t vnewparent
= _get_vino(newparent
);
12809 ldout(cct
, 3) << "ll_link " << vino
<< " to " << vnewparent
<< " " <<
12811 tout(cct
) << "ll_link" << std::endl
;
12812 tout(cct
) << vino
.ino
.val
<< std::endl
;
12813 tout(cct
) << vnewparent
<< std::endl
;
12814 tout(cct
) << newname
<< std::endl
;
12818 if (!fuse_default_permissions
) {
12819 if (S_ISDIR(in
->mode
))
12822 int r
= may_hardlink(in
, perm
);
12826 r
= may_create(newparent
, perm
);
12831 return _link(in
, newparent
, newname
, perm
, &target
);
12834 int Client::ll_num_osds(void)
12836 std::lock_guard
lock(client_lock
);
12837 return objecter
->with_osdmap(std::mem_fn(&OSDMap::get_num_osds
));
12840 int Client::ll_osdaddr(int osd
, uint32_t *addr
)
12842 std::lock_guard
lock(client_lock
);
12845 bool exists
= objecter
->with_osdmap([&](const OSDMap
& o
) {
12846 if (!o
.exists(osd
))
12848 g
= o
.get_addrs(osd
).front();
12853 uint32_t nb_addr
= (g
.in4_addr()).sin_addr
.s_addr
;
12854 *addr
= ntohl(nb_addr
);
12858 uint32_t Client::ll_stripe_unit(Inode
*in
)
12860 std::lock_guard
lock(client_lock
);
12861 return in
->layout
.stripe_unit
;
12864 uint64_t Client::ll_snap_seq(Inode
*in
)
12866 std::lock_guard
lock(client_lock
);
12867 return in
->snaprealm
->seq
;
12870 int Client::ll_file_layout(Inode
*in
, file_layout_t
*layout
)
12872 std::lock_guard
lock(client_lock
);
12873 *layout
= in
->layout
;
12877 int Client::ll_file_layout(Fh
*fh
, file_layout_t
*layout
)
12879 return ll_file_layout(fh
->inode
.get(), layout
);
12882 /* Currently we cannot take advantage of redundancy in reads, since we
12883 would have to go through all possible placement groups (a
12884 potentially quite large number determined by a hash), and use CRUSH
12885 to calculate the appropriate set of OSDs for each placement group,
12886 then index into that. An array with one entry per OSD is much more
12887 tractable and works for demonstration purposes. */
12889 int Client::ll_get_stripe_osd(Inode
*in
, uint64_t blockno
,
12890 file_layout_t
* layout
)
12892 std::lock_guard
lock(client_lock
);
12894 inodeno_t ino
= in
->ino
;
12895 uint32_t object_size
= layout
->object_size
;
12896 uint32_t su
= layout
->stripe_unit
;
12897 uint32_t stripe_count
= layout
->stripe_count
;
12898 uint64_t stripes_per_object
= object_size
/ su
;
12899 uint64_t stripeno
= 0, stripepos
= 0;
12902 stripeno
= blockno
/ stripe_count
; // which horizontal stripe (Y)
12903 stripepos
= blockno
% stripe_count
; // which object in the object set (X)
12905 uint64_t objectsetno
= stripeno
/ stripes_per_object
; // which object set
12906 uint64_t objectno
= objectsetno
* stripe_count
+ stripepos
; // object id
12908 object_t oid
= file_object_t(ino
, objectno
);
12909 return objecter
->with_osdmap([&](const OSDMap
& o
) {
12910 ceph_object_layout olayout
=
12911 o
.file_to_object_layout(oid
, *layout
);
12912 pg_t pg
= (pg_t
)olayout
.ol_pgid
;
12915 o
.pg_to_acting_osds(pg
, &osds
, &primary
);
12920 /* Return the offset of the block, internal to the object */
12922 uint64_t Client::ll_get_internal_offset(Inode
*in
, uint64_t blockno
)
12924 std::lock_guard
lock(client_lock
);
12925 file_layout_t
*layout
=&(in
->layout
);
12926 uint32_t object_size
= layout
->object_size
;
12927 uint32_t su
= layout
->stripe_unit
;
12928 uint64_t stripes_per_object
= object_size
/ su
;
12930 return (blockno
% stripes_per_object
) * su
;
12933 int Client::ll_opendir(Inode
*in
, int flags
, dir_result_t
** dirpp
,
12934 const UserPerm
& perms
)
12936 std::lock_guard
lock(client_lock
);
12941 vinodeno_t vino
= _get_vino(in
);
12943 ldout(cct
, 3) << "ll_opendir " << vino
<< dendl
;
12944 tout(cct
) << "ll_opendir" << std::endl
;
12945 tout(cct
) << vino
.ino
.val
<< std::endl
;
12947 if (!fuse_default_permissions
) {
12948 int r
= may_open(in
, flags
, perms
);
12953 int r
= _opendir(in
, dirpp
, perms
);
12954 tout(cct
) << (unsigned long)*dirpp
<< std::endl
;
12956 ldout(cct
, 3) << "ll_opendir " << vino
<< " = " << r
<< " (" << *dirpp
<< ")"
12961 int Client::ll_releasedir(dir_result_t
*dirp
)
12963 std::lock_guard
lock(client_lock
);
12964 ldout(cct
, 3) << "ll_releasedir " << dirp
<< dendl
;
12965 tout(cct
) << "ll_releasedir" << std::endl
;
12966 tout(cct
) << (unsigned long)dirp
<< std::endl
;
12975 int Client::ll_fsyncdir(dir_result_t
*dirp
)
12977 std::lock_guard
lock(client_lock
);
12978 ldout(cct
, 3) << "ll_fsyncdir " << dirp
<< dendl
;
12979 tout(cct
) << "ll_fsyncdir" << std::endl
;
12980 tout(cct
) << (unsigned long)dirp
<< std::endl
;
12985 return _fsync(dirp
->inode
.get(), false);
12988 int Client::ll_open(Inode
*in
, int flags
, Fh
**fhp
, const UserPerm
& perms
)
12990 ceph_assert(!(flags
& O_CREAT
));
12992 std::lock_guard
lock(client_lock
);
12997 vinodeno_t vino
= _get_vino(in
);
12999 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) << dendl
;
13000 tout(cct
) << "ll_open" << std::endl
;
13001 tout(cct
) << vino
.ino
.val
<< std::endl
;
13002 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
13005 if (!fuse_default_permissions
) {
13006 r
= may_open(in
, flags
, perms
);
13011 r
= _open(in
, flags
, 0, fhp
/* may be NULL */, perms
);
13014 Fh
*fhptr
= fhp
? *fhp
: NULL
;
13016 ll_unclosed_fh_set
.insert(fhptr
);
13018 tout(cct
) << (unsigned long)fhptr
<< std::endl
;
13019 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) <<
13020 " = " << r
<< " (" << fhptr
<< ")" << dendl
;
13024 int Client::_ll_create(Inode
*parent
, const char *name
, mode_t mode
,
13025 int flags
, InodeRef
*in
, int caps
, Fh
**fhp
,
13026 const UserPerm
& perms
)
13030 vinodeno_t vparent
= _get_vino(parent
);
13032 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
13033 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << ", uid " << perms
.uid()
13034 << ", gid " << perms
.gid() << dendl
;
13035 tout(cct
) << "ll_create" << std::endl
;
13036 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13037 tout(cct
) << name
<< std::endl
;
13038 tout(cct
) << mode
<< std::endl
;
13039 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
13041 bool created
= false;
13042 int r
= _lookup(parent
, name
, caps
, in
, perms
);
13044 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
13047 if (r
== -ENOENT
&& (flags
& O_CREAT
)) {
13048 if (!fuse_default_permissions
) {
13049 r
= may_create(parent
, perms
);
13053 r
= _create(parent
, name
, flags
, mode
, in
, fhp
, 0, 0, 0, NULL
, &created
,
13064 ldout(cct
, 20) << "_ll_create created = " << created
<< dendl
;
13066 if (!fuse_default_permissions
) {
13067 r
= may_open(in
->get(), flags
, perms
);
13070 int release_r
= _release_fh(*fhp
);
13071 ceph_assert(release_r
== 0); // during create, no async data ops should have happened
13076 if (*fhp
== NULL
) {
13077 r
= _open(in
->get(), flags
, mode
, fhp
, perms
);
13085 ll_unclosed_fh_set
.insert(*fhp
);
13090 Inode
*inode
= in
->get();
13091 if (use_faked_inos())
13092 ino
= inode
->faked_ino
;
13097 tout(cct
) << (unsigned long)*fhp
<< std::endl
;
13098 tout(cct
) << ino
<< std::endl
;
13099 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
13100 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << " = " << r
<< " (" <<
13101 *fhp
<< " " << hex
<< ino
<< dec
<< ")" << dendl
;
13106 int Client::ll_create(Inode
*parent
, const char *name
, mode_t mode
,
13107 int flags
, struct stat
*attr
, Inode
**outp
, Fh
**fhp
,
13108 const UserPerm
& perms
)
13110 std::lock_guard
lock(client_lock
);
13116 int r
= _ll_create(parent
, name
, mode
, flags
, &in
, CEPH_STAT_CAP_INODE_ALL
,
13121 // passing an Inode in outp requires an additional ref
13126 fill_stat(in
, attr
);
13134 int Client::ll_createx(Inode
*parent
, const char *name
, mode_t mode
,
13135 int oflags
, Inode
**outp
, Fh
**fhp
,
13136 struct ceph_statx
*stx
, unsigned want
, unsigned lflags
,
13137 const UserPerm
& perms
)
13139 unsigned caps
= statx_to_mask(lflags
, want
);
13140 std::lock_guard
lock(client_lock
);
13146 int r
= _ll_create(parent
, name
, mode
, oflags
, &in
, caps
, fhp
, perms
);
13150 // passing an Inode in outp requires an additional ref
13155 fill_statx(in
, caps
, stx
);
13164 loff_t
Client::ll_lseek(Fh
*fh
, loff_t offset
, int whence
)
13166 std::lock_guard
lock(client_lock
);
13167 tout(cct
) << "ll_lseek" << std::endl
;
13168 tout(cct
) << offset
<< std::endl
;
13169 tout(cct
) << whence
<< std::endl
;
13174 return _lseek(fh
, offset
, whence
);
13177 int Client::ll_read(Fh
*fh
, loff_t off
, loff_t len
, bufferlist
*bl
)
13179 std::lock_guard
lock(client_lock
);
13180 ldout(cct
, 3) << "ll_read " << fh
<< " " << fh
->inode
->ino
<< " " << " " << off
<< "~" << len
<< dendl
;
13181 tout(cct
) << "ll_read" << std::endl
;
13182 tout(cct
) << (unsigned long)fh
<< std::endl
;
13183 tout(cct
) << off
<< std::endl
;
13184 tout(cct
) << len
<< std::endl
;
13189 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13190 len
= std::min(len
, (loff_t
)INT_MAX
);
13191 return _read(fh
, off
, len
, bl
);
13194 int Client::ll_read_block(Inode
*in
, uint64_t blockid
,
13198 file_layout_t
* layout
)
13200 std::lock_guard
lock(client_lock
);
13205 vinodeno_t vino
= _get_vino(in
);
13206 object_t oid
= file_object_t(vino
.ino
, blockid
);
13207 C_SaferCond onfinish
;
13210 objecter
->read(oid
,
13211 object_locator_t(layout
->pool_id
),
13216 CEPH_OSD_FLAG_READ
,
13219 client_lock
.Unlock();
13220 int r
= onfinish
.wait();
13221 client_lock
.Lock();
13224 bl
.copy(0, bl
.length(), buf
);
13231 /* It appears that the OSD doesn't return success unless the entire
13232 buffer was written, return the write length on success. */
13234 int Client::ll_write_block(Inode
*in
, uint64_t blockid
,
13235 char* buf
, uint64_t offset
,
13236 uint64_t length
, file_layout_t
* layout
,
13237 uint64_t snapseq
, uint32_t sync
)
13239 vinodeno_t vino
= ll_get_vino(in
);
13241 std::unique_ptr
<C_SaferCond
> onsafe
= nullptr;
13246 if (true || sync
) {
13247 /* if write is stable, the epilogue is waiting on
13249 onsafe
.reset(new C_SaferCond("Client::ll_write_block flock"));
13251 object_t oid
= file_object_t(vino
.ino
, blockid
);
13252 SnapContext fakesnap
;
13253 ceph::bufferlist bl
;
13255 bl
.push_back(buffer::copy(buf
, length
));
13258 ldout(cct
, 1) << "ll_block_write for " << vino
.ino
<< "." << blockid
13261 fakesnap
.seq
= snapseq
;
13263 /* lock just in time */
13264 client_lock
.Lock();
13266 client_lock
.Unlock();
13270 objecter
->write(oid
,
13271 object_locator_t(layout
->pool_id
),
13276 ceph::real_clock::now(),
13280 client_lock
.Unlock();
13281 if (nullptr != onsafe
) {
13282 r
= onsafe
->wait();
13292 int Client::ll_commit_blocks(Inode
*in
,
13296 std::lock_guard
lock(client_lock
);
13298 BarrierContext *bctx;
13299 vinodeno_t vino = _get_vino(in);
13300 uint64_t ino = vino.ino;
13302 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
13303 << offset << " to " << length << dendl;
13309 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
13310 if (p != barriers.end()) {
13311 barrier_interval civ(offset, offset + length);
13312 p->second->commit_barrier(civ);
13318 int Client::ll_write(Fh
*fh
, loff_t off
, loff_t len
, const char *data
)
13320 std::lock_guard
lock(client_lock
);
13321 ldout(cct
, 3) << "ll_write " << fh
<< " " << fh
->inode
->ino
<< " " << off
<<
13322 "~" << len
<< dendl
;
13323 tout(cct
) << "ll_write" << std::endl
;
13324 tout(cct
) << (unsigned long)fh
<< std::endl
;
13325 tout(cct
) << off
<< std::endl
;
13326 tout(cct
) << len
<< std::endl
;
13331 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13332 len
= std::min(len
, (loff_t
)INT_MAX
);
13333 int r
= _write(fh
, off
, len
, data
, NULL
, 0);
13334 ldout(cct
, 3) << "ll_write " << fh
<< " " << off
<< "~" << len
<< " = " << r
13339 int64_t Client::ll_writev(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
13341 std::lock_guard
lock(client_lock
);
13344 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, true, false);
13347 int64_t Client::ll_readv(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
13349 std::lock_guard
lock(client_lock
);
13352 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, false, false);
13355 int Client::ll_flush(Fh
*fh
)
13357 std::lock_guard
lock(client_lock
);
13358 ldout(cct
, 3) << "ll_flush " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
13359 tout(cct
) << "ll_flush" << std::endl
;
13360 tout(cct
) << (unsigned long)fh
<< std::endl
;
13368 int Client::ll_fsync(Fh
*fh
, bool syncdataonly
)
13370 std::lock_guard
lock(client_lock
);
13371 ldout(cct
, 3) << "ll_fsync " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
13372 tout(cct
) << "ll_fsync" << std::endl
;
13373 tout(cct
) << (unsigned long)fh
<< std::endl
;
13378 int r
= _fsync(fh
, syncdataonly
);
13380 // If we're returning an error, clear it from the FH
13381 fh
->take_async_err();
13386 int Client::ll_sync_inode(Inode
*in
, bool syncdataonly
)
13388 std::lock_guard
lock(client_lock
);
13389 ldout(cct
, 3) << "ll_sync_inode " << *in
<< " " << dendl
;
13390 tout(cct
) << "ll_sync_inode" << std::endl
;
13391 tout(cct
) << (unsigned long)in
<< std::endl
;
13396 return _fsync(in
, syncdataonly
);
13399 #ifdef FALLOC_FL_PUNCH_HOLE
13401 int Client::_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
13403 if (offset
< 0 || length
<= 0)
13406 if (mode
& ~(FALLOC_FL_KEEP_SIZE
| FALLOC_FL_PUNCH_HOLE
))
13407 return -EOPNOTSUPP
;
13409 if ((mode
& FALLOC_FL_PUNCH_HOLE
) && !(mode
& FALLOC_FL_KEEP_SIZE
))
13410 return -EOPNOTSUPP
;
13412 Inode
*in
= fh
->inode
.get();
13414 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
) &&
13415 !(mode
& FALLOC_FL_PUNCH_HOLE
)) {
13419 if (in
->snapid
!= CEPH_NOSNAP
)
13422 if ((fh
->mode
& CEPH_FILE_MODE_WR
) == 0)
13425 uint64_t size
= offset
+ length
;
13426 if (!(mode
& (FALLOC_FL_PUNCH_HOLE
| FALLOC_FL_KEEP_SIZE
)) &&
13428 is_quota_bytes_exceeded(in
, size
- in
->size
, fh
->actor_perms
)) {
13433 int r
= get_caps(in
, CEPH_CAP_FILE_WR
, CEPH_CAP_FILE_BUFFER
, &have
, -1);
13437 std::unique_ptr
<C_SaferCond
> onuninline
= nullptr;
13438 if (mode
& FALLOC_FL_PUNCH_HOLE
) {
13439 if (in
->inline_version
< CEPH_INLINE_NONE
&&
13440 (have
& CEPH_CAP_FILE_BUFFER
)) {
13442 int len
= in
->inline_data
.length();
13443 if (offset
< len
) {
13445 in
->inline_data
.copy(0, offset
, bl
);
13447 if (offset
+ size
> len
)
13448 size
= len
- offset
;
13450 bl
.append_zero(size
);
13451 if (offset
+ size
< len
)
13452 in
->inline_data
.copy(offset
+ size
, len
- offset
- size
, bl
);
13453 in
->inline_data
= bl
;
13454 in
->inline_version
++;
13456 in
->mtime
= in
->ctime
= ceph_clock_now();
13458 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13460 if (in
->inline_version
< CEPH_INLINE_NONE
) {
13461 onuninline
.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
13462 uninline_data(in
, onuninline
.get());
13465 C_SaferCond
onfinish("Client::_punch_hole flock");
13467 unsafe_sync_write
++;
13468 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
13470 _invalidate_inode_cache(in
, offset
, length
);
13471 filer
->zero(in
->ino
, &in
->layout
,
13472 in
->snaprealm
->get_snap_context(),
13474 ceph::real_clock::now(),
13475 0, true, &onfinish
);
13476 in
->mtime
= in
->ctime
= ceph_clock_now();
13478 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13480 client_lock
.Unlock();
13482 client_lock
.Lock();
13483 _sync_write_commit(in
);
13485 } else if (!(mode
& FALLOC_FL_KEEP_SIZE
)) {
13486 uint64_t size
= offset
+ length
;
13487 if (size
> in
->size
) {
13489 in
->mtime
= in
->ctime
= ceph_clock_now();
13491 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13493 if (is_quota_bytes_approaching(in
, fh
->actor_perms
)) {
13494 check_caps(in
, CHECK_CAPS_NODELAY
);
13495 } else if (is_max_size_approaching(in
)) {
13501 if (nullptr != onuninline
) {
13502 client_lock
.Unlock();
13503 int ret
= onuninline
->wait();
13504 client_lock
.Lock();
13506 if (ret
>= 0 || ret
== -ECANCELED
) {
13507 in
->inline_data
.clear();
13508 in
->inline_version
= CEPH_INLINE_NONE
;
13509 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13515 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
13520 int Client::_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
13522 return -EOPNOTSUPP
;
13528 int Client::ll_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
13530 std::lock_guard
lock(client_lock
);
13531 ldout(cct
, 3) << __func__
<< " " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
13532 tout(cct
) << __func__
<< " " << mode
<< " " << offset
<< " " << length
<< std::endl
;
13533 tout(cct
) << (unsigned long)fh
<< std::endl
;
13538 return _fallocate(fh
, mode
, offset
, length
);
13541 int Client::fallocate(int fd
, int mode
, loff_t offset
, loff_t length
)
13543 std::lock_guard
lock(client_lock
);
13544 tout(cct
) << __func__
<< " " << " " << fd
<< mode
<< " " << offset
<< " " << length
<< std::endl
;
13549 Fh
*fh
= get_filehandle(fd
);
13552 #if defined(__linux__) && defined(O_PATH)
13553 if (fh
->flags
& O_PATH
)
13556 return _fallocate(fh
, mode
, offset
, length
);
13559 int Client::ll_release(Fh
*fh
)
13561 std::lock_guard
lock(client_lock
);
13566 ldout(cct
, 3) << __func__
<< " (fh)" << fh
<< " " << fh
->inode
->ino
<< " " <<
13568 tout(cct
) << __func__
<< " (fh)" << std::endl
;
13569 tout(cct
) << (unsigned long)fh
<< std::endl
;
13571 if (ll_unclosed_fh_set
.count(fh
))
13572 ll_unclosed_fh_set
.erase(fh
);
13573 return _release_fh(fh
);
13576 int Client::ll_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
13578 std::lock_guard
lock(client_lock
);
13580 ldout(cct
, 3) << "ll_getlk (fh)" << fh
<< " " << fh
->inode
->ino
<< dendl
;
13581 tout(cct
) << "ll_getk (fh)" << (unsigned long)fh
<< std::endl
;
13586 return _getlk(fh
, fl
, owner
);
13589 int Client::ll_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
13591 std::lock_guard
lock(client_lock
);
13593 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
13594 tout(cct
) << __func__
<< " (fh)" << (unsigned long)fh
<< std::endl
;
13599 return _setlk(fh
, fl
, owner
, sleep
);
13602 int Client::ll_flock(Fh
*fh
, int cmd
, uint64_t owner
)
13604 std::lock_guard
lock(client_lock
);
13606 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
13607 tout(cct
) << __func__
<< " (fh)" << (unsigned long)fh
<< std::endl
;
13612 return _flock(fh
, cmd
, owner
);
13615 int Client::set_deleg_timeout(uint32_t timeout
)
13617 std::lock_guard
lock(client_lock
);
13620 * The whole point is to prevent blacklisting so we must time out the
13621 * delegation before the session autoclose timeout kicks in.
13623 if (timeout
>= mdsmap
->get_session_autoclose())
13626 deleg_timeout
= timeout
;
13630 int Client::ll_delegation(Fh
*fh
, unsigned cmd
, ceph_deleg_cb_t cb
, void *priv
)
13634 std::lock_guard
lock(client_lock
);
13639 Inode
*inode
= fh
->inode
.get();
13642 case CEPH_DELEGATION_NONE
:
13643 inode
->unset_deleg(fh
);
13648 ret
= inode
->set_deleg(fh
, cmd
, cb
, priv
);
13649 } catch (std::bad_alloc
&) {
13657 class C_Client_RequestInterrupt
: public Context
{
13662 C_Client_RequestInterrupt(Client
*c
, MetaRequest
*r
) : client(c
), req(r
) {
13665 void finish(int r
) override
{
13666 std::lock_guard
l(client
->client_lock
);
13667 ceph_assert(req
->head
.op
== CEPH_MDS_OP_SETFILELOCK
);
13668 client
->_interrupt_filelock(req
);
13669 client
->put_request(req
);
13673 void Client::ll_interrupt(void *d
)
13675 MetaRequest
*req
= static_cast<MetaRequest
*>(d
);
13676 ldout(cct
, 3) << __func__
<< " tid " << req
->get_tid() << dendl
;
13677 tout(cct
) << __func__
<< " tid " << req
->get_tid() << std::endl
;
13678 interrupt_finisher
.queue(new C_Client_RequestInterrupt(this, req
));
13681 // =========================================
13684 // expose file layouts
13686 int Client::describe_layout(const char *relpath
, file_layout_t
*lp
,
13687 const UserPerm
& perms
)
13689 std::lock_guard
lock(client_lock
);
13694 filepath
path(relpath
);
13696 int r
= path_walk(path
, &in
, perms
);
13702 ldout(cct
, 3) << __func__
<< "(" << relpath
<< ") = 0" << dendl
;
13706 int Client::fdescribe_layout(int fd
, file_layout_t
*lp
)
13708 std::lock_guard
lock(client_lock
);
13713 Fh
*f
= get_filehandle(fd
);
13716 Inode
*in
= f
->inode
.get();
13720 ldout(cct
, 3) << __func__
<< "(" << fd
<< ") = 0" << dendl
;
13724 int64_t Client::get_default_pool_id()
13726 std::lock_guard
lock(client_lock
);
13731 /* first data pool is the default */
13732 return mdsmap
->get_first_data_pool();
13737 int64_t Client::get_pool_id(const char *pool_name
)
13739 std::lock_guard
lock(client_lock
);
13744 return objecter
->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name
),
13748 string
Client::get_pool_name(int64_t pool
)
13750 std::lock_guard
lock(client_lock
);
13755 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
13756 return o
.have_pg_pool(pool
) ? o
.get_pool_name(pool
) : string();
13760 int Client::get_pool_replication(int64_t pool
)
13762 std::lock_guard
lock(client_lock
);
13767 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
13768 return o
.have_pg_pool(pool
) ? o
.get_pg_pool(pool
)->get_size() : -ENOENT
;
13772 int Client::get_file_extent_osds(int fd
, loff_t off
, loff_t
*len
, vector
<int>& osds
)
13774 std::lock_guard
lock(client_lock
);
13779 Fh
*f
= get_filehandle(fd
);
13782 Inode
*in
= f
->inode
.get();
13784 vector
<ObjectExtent
> extents
;
13785 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, 1, in
->truncate_size
, extents
);
13786 ceph_assert(extents
.size() == 1);
13788 objecter
->with_osdmap([&](const OSDMap
& o
) {
13789 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
13790 o
.pg_to_acting_osds(pg
, osds
);
13797 * Return the remainder of the extent (stripe unit)
13799 * If length = 1 is passed to Striper::file_to_extents we get a single
13800 * extent back, but its length is one so we still need to compute the length
13801 * to the end of the stripe unit.
13803 * If length = su then we may get 1 or 2 objects back in the extents vector
13804 * which would have to be examined. Even then, the offsets are local to the
13805 * object, so matching up to the file offset is extra work.
13807 * It seems simpler to stick with length = 1 and manually compute the
13811 uint64_t su
= in
->layout
.stripe_unit
;
13812 *len
= su
- (off
% su
);
13818 int Client::get_osd_crush_location(int id
, vector
<pair
<string
, string
> >& path
)
13820 std::lock_guard
lock(client_lock
);
13827 return objecter
->with_osdmap([&](const OSDMap
& o
) {
13828 return o
.crush
->get_full_location_ordered(id
, path
);
13832 int Client::get_file_stripe_address(int fd
, loff_t offset
,
13833 vector
<entity_addr_t
>& address
)
13835 std::lock_guard
lock(client_lock
);
13840 Fh
*f
= get_filehandle(fd
);
13843 Inode
*in
= f
->inode
.get();
13846 vector
<ObjectExtent
> extents
;
13847 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, 1,
13848 in
->truncate_size
, extents
);
13849 ceph_assert(extents
.size() == 1);
13851 // now we have the object and its 'layout'
13852 return objecter
->with_osdmap([&](const OSDMap
& o
) {
13853 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
13855 o
.pg_to_acting_osds(pg
, osds
);
13858 for (unsigned i
= 0; i
< osds
.size(); i
++) {
13859 entity_addr_t addr
= o
.get_addrs(osds
[i
]).front();
13860 address
.push_back(addr
);
13866 int Client::get_osd_addr(int osd
, entity_addr_t
& addr
)
13868 std::lock_guard
lock(client_lock
);
13873 return objecter
->with_osdmap([&](const OSDMap
& o
) {
13874 if (!o
.exists(osd
))
13877 addr
= o
.get_addrs(osd
).front();
13882 int Client::enumerate_layout(int fd
, vector
<ObjectExtent
>& result
,
13883 loff_t length
, loff_t offset
)
13885 std::lock_guard
lock(client_lock
);
13890 Fh
*f
= get_filehandle(fd
);
13893 Inode
*in
= f
->inode
.get();
13895 // map to a list of extents
13896 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, length
, in
->truncate_size
, result
);
13898 ldout(cct
, 3) << __func__
<< "(" << fd
<< ", " << length
<< ", " << offset
<< ") = 0" << dendl
;
13903 /* find an osd with the same ip. -ENXIO if none. */
13904 int Client::get_local_osd()
13906 std::lock_guard
lock(client_lock
);
13911 objecter
->with_osdmap([this](const OSDMap
& o
) {
13912 if (o
.get_epoch() != local_osd_epoch
) {
13913 local_osd
= o
.find_osd_on_ip(messenger
->get_myaddrs().front());
13914 local_osd_epoch
= o
.get_epoch();
13925 // ===============================
13927 void Client::ms_handle_connect(Connection
*con
)
13929 ldout(cct
, 10) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
13932 bool Client::ms_handle_reset(Connection
*con
)
13934 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
// Messenger callback: the remote end reset the session state underneath us.
// For MDS peers this maps the connection back to its MetaSession and reacts
// according to the session's state machine (close, retry open, mark stale...).
13938 void Client::ms_handle_remote_reset(Connection
*con
)
13940 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
13941 std::lock_guard
l(client_lock
);
13942 switch (con
->get_peer_type()) {
13943 case CEPH_ENTITY_TYPE_MDS
:
13945 // kludge to figure out which mds this is; fixme with a Connection* state
13946 mds_rank_t mds
= MDS_RANK_NONE
;
13947 MetaSession
*s
= NULL
;
// Linear scan of open sessions, matching the connection's peer addrs against
// the mdsmap entry for each rank (assignment of mds/s falls in elided lines).
13948 for (auto &p
: mds_sessions
) {
13949 if (mdsmap
->get_addrs(p
.first
) == con
->get_peer_addrs()) {
// NOTE(review): plain assert() here while the rest of the file uses
// ceph_assert() — worth normalizing upstream.
13955 assert (s
!= NULL
);
13956 switch (s
->state
) {
// We were tearing the session down anyway: treat the reset as completion.
13957 case MetaSession::STATE_CLOSING
:
13958 ldout(cct
, 1) << "reset from mds we were closing; we'll call that closed" << dendl
;
13959 _closed_mds_session(s
);
// Session open was in flight: close it, but carry the waiters over to a
// freshly (re)opened session so their contexts still fire.
13962 case MetaSession::STATE_OPENING
:
13964 ldout(cct
, 1) << "reset from mds we were opening; retrying" << dendl
;
13965 list
<Context
*> waiters
;
13966 waiters
.swap(s
->waiting_for_open
);
13967 _closed_mds_session(s
);
13968 MetaSession
*news
= _get_or_open_mds_session(mds
);
13969 news
->waiting_for_open
.swap(waiters
);
// Established session was reset: request a fresh osdmap (the reset may mean
// we were evicted/blacklisted), then either close for reconnect or mark the
// session stale depending on client_reconnect_stale.
13973 case MetaSession::STATE_OPEN
:
13975 objecter
->maybe_request_map(); /* to check if we are blacklisted */
13976 const auto& conf
= cct
->_conf
;
13977 if (conf
->client_reconnect_stale
) {
13978 ldout(cct
, 1) << "reset from mds we were open; close mds session for reconnect" << dendl
;
13979 _closed_mds_session(s
);
13981 ldout(cct
, 1) << "reset from mds we were open; mark session as stale" << dendl
;
13982 s
->state
= MetaSession::STATE_STALE
;
// NEW/CLOSED: nothing meaningful to do (the case bodies fall in elided lines).
13987 case MetaSession::STATE_NEW
:
13988 case MetaSession::STATE_CLOSED
:
// Messenger callback: the peer actively refused our connection attempt.
// Only logs; the boolean return falls in an elided source line.
13998 bool Client::ms_handle_refused(Connection
*con
)
14000 ldout(cct
, 1) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
// Messenger callback: supply an authorizer for an outgoing connection.
// Only monitor connections are handled here; the monclient builds the ticket.
// NOTE(review): the return statements fall in elided source lines.
14004 bool Client::ms_get_authorizer(int dest_type
, AuthAuthorizer
**authorizer
)
14006 if (dest_type
== CEPH_ENTITY_TYPE_MON
)
14008 *authorizer
= monclient
->build_authorizer(dest_type
);
// Walk up the snaprealm ancestry of <in> and return the nearest ancestor
// inode that has a quota enabled; falls back to root_ancestor when none does.
// NOTE(review): the enclosing loop header and its exit falls in elided source
// lines — confirm against upstream (`while (realm) { ... }`).
14012 Inode
*Client::get_quota_root(Inode
*in
, const UserPerm
& perms
)
14014 Inode
*quota_in
= root_ancestor
;
14015 SnapRealm
*realm
= in
->snaprealm
;
14017 ldout(cct
, 10) << __func__
<< " realm " << realm
->ino
<< dendl
;
// Skip the realm rooted at the inode itself; look up the realm's own inode
// (head version) in the inode cache.
14018 if (realm
->ino
!= in
->ino
) {
14019 auto p
= inode_map
.find(vinodeno_t(realm
->ino
, CEPH_NOSNAP
));
14020 if (p
== inode_map
.end())
// First ancestor with an active quota wins.
14023 if (p
->second
->quota
.is_enable()) {
14024 quota_in
= p
->second
;
// Ascend to the parent realm and keep looking.
14028 realm
= realm
->pparent
;
14030 ldout(cct
, 10) << __func__
<< " " << in
->vino() << " -> " << quota_in
->vino() << dendl
;
14035 * Traverse quota ancestors of the Inode, return true
14036 * if any of them passes the passed function
// Iteratively ascends through quota roots via get_quota_root(); terminates at
// root_ancestor.  The invocation of test(*in) and the loop construct fall in
// elided source lines.
14038 bool Client::check_quota_condition(Inode
*in
, const UserPerm
& perms
,
14039 std::function
<bool (const Inode
&in
)> test
)
14042 ceph_assert(in
!= NULL
);
// Reached the top of the mount's ancestry: stop ascending.
14047 if (in
== root_ancestor
) {
14048 // We're done traversing, drop out
14051 // Continue up the tree
14052 in
= get_quota_root(in
, perms
);
// True if any quota ancestor of <in> has a max_files quota that its recursive
// entry count (rstat.rsize) has already reached or exceeded.
14059 bool Client::is_quota_files_exceeded(Inode
*in
, const UserPerm
& perms
)
14061 return check_quota_condition(in
, perms
,
// max_files == 0 means "no file-count quota", so the && short-circuits.
14062 [](const Inode
&in
) {
14063 return in
.quota
.max_files
&& in
.rstat
.rsize() >= in
.quota
.max_files
;
// True if writing <new_bytes> more bytes would push any quota ancestor's
// recursive byte count (rstat.rbytes) over its max_bytes quota.
14067 bool Client::is_quota_bytes_exceeded(Inode
*in
, int64_t new_bytes
,
14068 const UserPerm
& perms
)
14070 return check_quota_condition(in
, perms
,
// new_bytes is captured by reference; max_bytes == 0 means "no byte quota".
14071 [&new_bytes
](const Inode
&in
) {
14072 return in
.quota
.max_bytes
&& (in
.rstat
.rbytes
+ new_bytes
)
14073 > in
.quota
.max_bytes
;
// True if any quota ancestor is close to (or past) its byte quota.  "Close"
// means the locally-buffered growth (size - reported_size) exceeds 1/16th of
// the remaining quota headroom — used to trigger early size reporting.
14077 bool Client::is_quota_bytes_approaching(Inode
*in
, const UserPerm
& perms
)
14079 return check_quota_condition(in
, perms
,
14080 [](const Inode
&in
) {
14081 if (in
.quota
.max_bytes
) {
// Already over quota (the `return true;` falls in elided source lines).
14082 if (in
.rstat
.rbytes
>= in
.quota
.max_bytes
) {
// size can only grow past what we last reported to the MDS.
14086 ceph_assert(in
.size
>= in
.reported_size
);
14087 const uint64_t space
= in
.quota
.max_bytes
- in
.rstat
.rbytes
;
14088 const uint64_t size
= in
.size
- in
.reported_size
;
// (space >> 4): within a 1/16th margin of the remaining headroom.
14089 return (space
>> 4) < size
;
// Verify this client actually has read and/or write permission on the RADOS
// pool backing <in>, by probing the pool with a stat (read) and an exclusive
// create (write) on the file's first object.  Results are cached per
// (pool_id, pool_ns) in pool_perms; concurrent callers rendezvous on
// waiting_for_pool_perm.  NOTE(review): the enclosing retry loop, several
// early returns, and the oid_buf declaration fall in elided source lines.
14103 int Client::check_pool_perm(Inode
*in
, int need
)
// Entire check can be disabled by configuration.
14105 if (!cct
->_conf
->client_check_pool_perm
)
14108 int64_t pool_id
= in
->layout
.pool_id
;
14109 std::string pool_ns
= in
->layout
.pool_ns
;
// Cache key: permissions are per pool + namespace, not per inode.
14110 std::pair
<int64_t, std::string
> perm_key(pool_id
, pool_ns
);
14113 auto it
= pool_perms
.find(perm_key
);
14114 if (it
== pool_perms
.end())
// Another thread is already probing this pool: wait for it to finish.
14116 if (it
->second
== POOL_CHECKING
) {
14117 // avoid concurrent checkings
14118 wait_on_list(waiting_for_pool_perm
);
// Cached entries are only stored with POOL_CHECKED set (see below).
14121 ceph_assert(have
& POOL_CHECKED
);
14127 if (in
->snapid
!= CEPH_NOSNAP
) {
14128 // pool permission check needs to write to the first object. But for snapshot,
14129 // head of the first object may have alread been deleted. To avoid creating
14130 // orphan object, skip the check for now.
// Mark the probe in-flight so concurrent callers block instead of racing.
14134 pool_perms
[perm_key
] = POOL_CHECKING
;
// Probe target: the file's first object, "<ino-in-hex>.00000000".
14137 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (unsigned long long)in
->ino
);
14138 object_t oid
= oid_buf
;
14140 SnapContext nullsnapc
;
// Read-permission probe: a bare stat (all out-params discarded).
14142 C_SaferCond rd_cond
;
14143 ObjectOperation rd_op
;
14144 rd_op
.stat(NULL
, (ceph::real_time
*)nullptr, NULL
);
14146 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), rd_op
,
14147 nullsnapc
, ceph::real_clock::now(), 0, &rd_cond
);
// Write-permission probe: exclusive create — -EEXIST still proves writability
// without clobbering any data.
14149 C_SaferCond wr_cond
;
14150 ObjectOperation wr_op
;
14151 wr_op
.create(true);
14153 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), wr_op
,
14154 nullsnapc
, ceph::real_clock::now(), 0, &wr_cond
);
// Drop client_lock while blocking on both RADOS round-trips.
14156 client_lock
.Unlock();
14157 int rd_ret
= rd_cond
.wait();
14158 int wr_ret
= wr_cond
.wait();
14159 client_lock
.Lock();
14161 bool errored
= false;
// Read probe: success or ENOENT (object absent) both imply read permission;
// EPERM means denied; anything else is indeterminate.
14163 if (rd_ret
== 0 || rd_ret
== -ENOENT
)
14165 else if (rd_ret
!= -EPERM
) {
14166 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14167 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
// Write probe: success or EEXIST imply write permission.
14171 if (wr_ret
== 0 || wr_ret
== -EEXIST
)
14172 have
|= POOL_WRITE
;
14173 else if (wr_ret
!= -EPERM
) {
14174 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14175 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
14180 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
14181 // Raise EIO because actual error code might be misleading for
14182 // userspace filesystem user.
14183 pool_perms
.erase(perm_key
);
14184 signal_cond_list(waiting_for_pool_perm
);
// Probe finished: cache the verdict and wake every blocked caller.
14188 pool_perms
[perm_key
] = have
| POOL_CHECKED
;
14189 signal_cond_list(waiting_for_pool_perm
);
// Finally, compare what the caller needs against what the pool grants
// (the -EPERM returns fall in elided source lines).
14192 if ((need
& CEPH_CAP_FILE_RD
) && !(have
& POOL_READ
)) {
14193 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14194 << " need " << ccap_string(need
) << ", but no read perm" << dendl
;
14197 if ((need
& CEPH_CAP_FILE_WR
) && !(have
& POOL_WRITE
)) {
14198 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14199 << " need " << ccap_string(need
) << ", but no write perm" << dendl
;
// Evaluate POSIX-ACL access for <perms> wanting <want> bits on <in>, using
// the cached "system.posix_acl_access" xattr.  The fallback return for the
// no-ACL case falls in elided source lines.
14206 int Client::_posix_acl_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
14208 if (acl_type
== POSIX_ACL
) {
14209 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
14210 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
// Delegate the actual uid/gid/mask evaluation to the ACL helper.
14212 return posix_acl_permits(access_acl
, in
->uid
, in
->gid
, perms
, want
);
// Keep the access ACL consistent with a chmod: rewrite the group/mask entries
// of "system.posix_acl_access" to match the new <mode>, then store it back.
14218 int Client::_posix_acl_chmod(Inode
*in
, mode_t mode
, const UserPerm
& perms
)
// ACL support disabled: nothing to synchronize (early-out body elided).
14220 if (acl_type
== NO_ACL
)
// Refresh xattrs from the MDS; force the fetch if we have never seen any
// xattr version for this inode.
14223 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
14227 if (acl_type
== POSIX_ACL
) {
14228 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
14229 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
// Work on a private copy — the chmod helper mutates the buffer in place.
14230 bufferptr
acl(access_acl
.c_str(), access_acl
.length());
14231 r
= posix_acl_access_chmod(acl
, mode
);
// Persist the rewritten ACL back as the access xattr.
14234 r
= _do_setxattr(in
, ACL_EA_ACCESS
, acl
.c_str(), acl
.length(), 0, perms
);
14240 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< " result=" << r
<< dendl
;
// Compute the initial ACL xattrs for a new child of <dir>: inherit the
// directory's default ACL into the child's access ACL (and default ACL for
// subdirectories), adjusting *mode per POSIX ACL inheritance rules.  When no
// default ACL applies, the umask callback is applied to *mode instead.
// The encoded xattr map is returned via xattrs_bl for the create request.
14244 int Client::_posix_acl_create(Inode
*dir
, mode_t
*mode
, bufferlist
& xattrs_bl
,
14245 const UserPerm
& perms
)
14247 if (acl_type
== NO_ACL
)
// Symlinks never carry ACLs.
14250 if (S_ISLNK(*mode
))
// Make sure the parent's xattrs (and thus its default ACL) are current.
14253 int r
= _getattr(dir
, CEPH_STAT_CAP_XATTR
, perms
, dir
->xattr_version
== 0);
14257 if (acl_type
== POSIX_ACL
) {
14258 if (dir
->xattrs
.count(ACL_EA_DEFAULT
)) {
14259 map
<string
, bufferptr
> xattrs
;
14261 const bufferptr
& default_acl
= dir
->xattrs
[ACL_EA_DEFAULT
];
// Private copy: inherit-mode mutates the ACL buffer and *mode together.
14262 bufferptr
acl(default_acl
.c_str(), default_acl
.length());
14263 r
= posix_acl_inherit_mode(acl
, mode
);
// If the ACL is equivalent to plain mode bits, fold it into *mode
// (branching on the result falls in elided source lines).
14268 r
= posix_acl_equiv_mode(acl
.c_str(), acl
.length(), mode
);
14272 xattrs
[ACL_EA_ACCESS
] = acl
;
// Directories also inherit the default ACL itself.
14275 if (S_ISDIR(*mode
))
14276 xattrs
[ACL_EA_DEFAULT
] = dir
->xattrs
[ACL_EA_DEFAULT
];
14280 encode(xattrs
, xattrs_bl
);
// No default ACL: fall back to the process umask (fetched via callback).
14283 *mode
&= ~umask_cb(callback_handle
);
14288 ldout(cct
, 10) << __func__
<< " dir ino " << dir
->ino
<< " result=" << r
<< dendl
;
// Set a global OSD-op flag on the objecter.  Only "no flags" or
// LOCALIZE_READS is accepted (asserted below).
14292 void Client::set_filer_flags(int flags
)
14294 std::lock_guard
l(client_lock
);
14295 ceph_assert(flags
== 0 ||
14296 flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
14297 objecter
->add_global_op_flags(flags
);
// Clear a global OSD-op flag previously set via set_filer_flags().
// Only LOCALIZE_READS may be cleared (asserted below).
14300 void Client::clear_filer_flags(int flags
)
14302 std::lock_guard
l(client_lock
);
14303 ceph_assert(flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
14304 objecter
->clear_global_op_flag(flags
);
14307 // called before mount
// Record a non-empty session uuid in the client metadata map; the MDS uses it
// to identify this client instance (e.g. for session reclaim).
14308 void Client::set_uuid(const std::string
& uuid
)
14310 std::lock_guard
l(client_lock
);
// Must be called after init() but before mount (see comment above).
14311 assert(initialized
);
14312 assert(!uuid
.empty());
14314 metadata
["uuid"] = uuid
;
14318 // called before mount. 0 means infinite
// Advertise a requested session timeout (in seconds) to the MDS via the
// client metadata map; stored stringified.
14319 void Client::set_session_timeout(unsigned timeout
)
14321 std::lock_guard
l(client_lock
);
14322 assert(initialized
);
14324 metadata
["timeout"] = stringify(timeout
);
14327 // called before mount
// Reclaim the MDS sessions of a dead client instance identified by <uuid>.
// Walks every in-map MDS rank, opens a session, and sends MClientReclaim;
// afterwards verifies (via the OSD blacklist) that the old instance was
// actually killed, unless CEPH_RECLAIM_RESET is requested.  NOTE(review):
// several returns, loop increments and the C_SaferCond declaration fall in
// elided source lines.
14328 int Client::start_reclaim(const std::string
& uuid
, unsigned flags
,
14329 const std::string
& fs_name
)
14331 std::lock_guard
l(client_lock
);
// Refuse to reclaim our own uuid (error return elided).
14339 auto it
= metadata
.find("uuid");
14340 if (it
!= metadata
.end() && it
->second
== uuid
)
// Need the mdsmap of the target filesystem before we can contact any MDS.
14344 int r
= subscribe_mdsmap(fs_name
);
14346 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
14350 if (metadata
.empty())
14351 populate_metadata("");
// Epoch 0 means we have not received a real mdsmap yet.
14353 while (mdsmap
->get_epoch() == 0)
14354 wait_on_list(waiting_for_mdsmap
);
// Visit every in-map rank; the rank is only advanced once its session has
// been processed (increment falls in elided lines).
14357 for (unsigned mds
= 0; mds
< mdsmap
->get_num_in_mds(); ) {
14358 if (!mdsmap
->is_up(mds
)) {
14359 ldout(cct
, 10) << "mds." << mds
<< " not active, waiting for new mdsmap" << dendl
;
14360 wait_on_list(waiting_for_mdsmap
);
14364 MetaSession
*session
;
// Ensure an open session to this rank, waiting out an in-flight open.
14365 if (!have_open_session(mds
)) {
14366 session
= _get_or_open_mds_session(mds
);
14367 if (session
->state
!= MetaSession::STATE_OPENING
) {
14371 ldout(cct
, 10) << "waiting for session to mds." << mds
<< " to open" << dendl
;
14372 wait_on_context_list(session
->waiting_for_open
);
// The MDS may have rejected us while we waited (return elided).
14373 if (rejected_by_mds
.count(mds
))
14378 session
= &mds_sessions
.at(mds
);
// Reclaim requires explicit MDS-side support.
14379 if (!session
->mds_features
.test(CEPHFS_FEATURE_RECLAIM_CLIENT
))
14380 return -EOPNOTSUPP
;
// Kick off (or keep waiting on) the reclaim exchange with this rank.
14382 if (session
->reclaim_state
== MetaSession::RECLAIM_NULL
||
14383 session
->reclaim_state
== MetaSession::RECLAIMING
) {
14384 session
->reclaim_state
= MetaSession::RECLAIMING
;
14385 auto m
= MClientReclaim::create(uuid
, flags
);
14386 session
->con
->send_message2(std::move(m
));
14387 wait_on_list(waiting_for_reclaim
);
// GNU `?:` with omitted middle operand: reclaim_errno if nonzero,
// else -ENOTRECOVERABLE.
14388 } else if (session
->reclaim_state
== MetaSession::RECLAIM_FAIL
) {
14389 return reclaim_errno
? : -ENOTRECOVERABLE
;
14395 // didn't find target session in any mds
// With RESET there is nothing left to verify (success return elided);
// otherwise an untraceable target is unrecoverable.
14396 if (reclaim_target_addrs
.empty()) {
14397 if (flags
& CEPH_RECLAIM_RESET
)
14399 return -ENOTRECOVERABLE
;
// RESET skips the blacklist verification below (early return elided).
14402 if (flags
& CEPH_RECLAIM_RESET
)
14405 // use blacklist to check if target session was killed
14406 // (config option mds_session_blacklist_on_evict needs to be true)
// Wait (lock dropped) until our osdmap reaches the epoch the MDS reported.
14408 if (!objecter
->wait_for_map(reclaim_osd_epoch
, &cond
)) {
14409 ldout(cct
, 10) << __func__
<< ": waiting for OSD epoch " << reclaim_osd_epoch
<< dendl
;
14410 client_lock
.Unlock();
14412 client_lock
.Lock();
14415 bool blacklisted
= objecter
->with_osdmap(
14416 [this](const OSDMap
&osd_map
) -> bool {
14417 return osd_map
.is_blacklisted(reclaim_target_addrs
);
// NOTE(review): the `if (!blacklisted)` guard falls in elided lines — the
// error applies only when the old instance was NOT blacklisted.
14420 return -ENOTRECOVERABLE
;
// Remember the uuid being reclaimed; finish_reclaim() promotes it to "uuid".
14422 metadata
["reclaiming_uuid"] = uuid
;
// Complete (or abort) a reclaim started by start_reclaim(): reset every
// session's reclaim state, tell each MDS the reclaim is finished, and adopt
// the reclaimed uuid as our own.
14426 void Client::finish_reclaim()
14428 auto it
= metadata
.find("reclaiming_uuid");
// No reclaim in progress: just clear per-session state and bail
// (the early return falls in elided source lines).
14429 if (it
== metadata
.end()) {
14430 for (auto &p
: mds_sessions
)
14431 p
.second
.reclaim_state
= MetaSession::RECLAIM_NULL
;
// Active reclaim: notify every MDS with FLAG_FINISH while resetting state.
14435 for (auto &p
: mds_sessions
) {
14436 p
.second
.reclaim_state
= MetaSession::RECLAIM_NULL
;
14437 auto m
= MClientReclaim::create("", MClientReclaim::FLAG_FINISH
);
14438 p
.second
.con
->send_message2(std::move(m
));
// Promote the reclaimed uuid to our permanent identity.
14441 metadata
["uuid"] = it
->second
;
14442 metadata
.erase(it
);
// Handle the MDS's reply to our MClientReclaim: record success (plus the OSD
// epoch barrier and the dead client's addrs) or the failure errno on the
// originating session, then wake start_reclaim() waiters.
14445 void Client::handle_client_reclaim_reply(const MConstRef
<MClientReclaimReply
>& reply
)
14447 mds_rank_t from
= mds_rank_t(reply
->get_source().num());
14448 ldout(cct
, 10) << __func__
<< " " << *reply
<< " from mds." << from
<< dendl
;
// Match rank AND connection — a stale reply from a closed session is dropped
// (the early return falls in elided source lines).
14450 MetaSession
*session
= _get_mds_session(from
, reply
->get_connection().get());
14452 ldout(cct
, 10) << " discarding reclaim reply from sessionless mds." << from
<< dendl
;
14456 if (reply
->get_result() >= 0) {
14457 session
->reclaim_state
= MetaSession::RECLAIM_OK
;
// Track the highest epoch any MDS asks us to wait for.
14458 if (reply
->get_epoch() > reclaim_osd_epoch
)
14459 reclaim_osd_epoch
= reply
->get_epoch();
// Addrs of the session being reclaimed, used for the blacklist check.
14460 if (!reply
->get_addrs().empty())
14461 reclaim_target_addrs
= reply
->get_addrs();
14463 session
->reclaim_state
= MetaSession::RECLAIM_FAIL
;
14464 reclaim_errno
= reply
->get_result();
14467 signal_cond_list(waiting_for_reclaim
);
14471 * This is included in cap release messages, to cause
14472 * the MDS to wait until this OSD map epoch. It is necessary
14473 * in corner cases where we cancel RADOS ops, so that
14474 * nobody else tries to do IO to the same objects in
14475 * the same epoch as the cancelled ops.
// Simple setter: records the barrier epoch that cap release messages carry.
14477 void Client::set_cap_epoch_barrier(epoch_t e
)
14479 ldout(cct
, 5) << __func__
<< " epoch = " << e
<< dendl
;
14480 cap_epoch_barrier
= e
;
// md_config_obs_t hook: list of config options this client wants change
// notifications for (delivered to handle_conf_change()).  The array's
// remaining entries and NULL terminator fall in elided source lines.
14483 const char** Client::get_tracked_conf_keys() const
14485 static const char* keys
[] = {
14486 "client_cache_size",
14487 "client_cache_mid",
14489 "client_deleg_timeout",
14490 "client_deleg_break_on_open",
// md_config_obs_t hook: apply runtime changes to the options listed by
// get_tracked_conf_keys().  Adjusts the inode LRU midpoint and the ACL mode.
14496 void Client::handle_conf_change(const ConfigProxy
& conf
,
14497 const std::set
<std::string
> &changed
)
14499 std::lock_guard
lock(client_lock
);
14501 if (changed
.count("client_cache_mid")) {
14502 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
// Only "posix_acl" enables ACLs; the else branch (disabling) falls in
// elided source lines.
14504 if (changed
.count("client_acl_type")) {
14506 if (cct
->_conf
->client_acl_type
== "posix_acl")
14507 acl_type
= POSIX_ACL
;
// boost::intrusive_ptr hook for InodeRef: take a reference on the inode.
// The body falls in elided source lines (presumably in->get() — confirm).
14511 void intrusive_ptr_add_ref(Inode
*in
)
// boost::intrusive_ptr hook for InodeRef: drop a reference.  Goes through the
// owning Client so the inode can be unhooked/freed when the count hits zero.
14516 void intrusive_ptr_release(Inode
*in
)
14518 in
->client
->put_inode(in
);
// Pick a uniformly random rank from the set of up MDSs, or MDS_RANK_NONE when
// none are up.  Caller must hold client_lock (asserted).
// NOTE(review): the empty-set guard, the iterator advance in the loop body and
// the final return fall in elided source lines.
14521 mds_rank_t
Client::_get_random_up_mds() const
14523 ceph_assert(client_lock
.is_locked_by_me());
14525 std::set
<mds_rank_t
> up
;
14526 mdsmap
->get_up_mds_set(up
);
14529 return MDS_RANK_NONE
;
// Advance a begin() iterator rand() % size() steps to pick the random entry.
14530 std::set
<mds_rank_t
>::const_iterator p
= up
.begin();
14531 for (int n
= rand() % up
.size(); n
; n
--)
// A Client that owns its own Objecter (rather than sharing one), for
// standalone use.  Builds the Objecter over the supplied messenger/monclient,
// wires the messenger into the monclient, and claims incarnation 0.
14537 StandaloneClient::StandaloneClient(Messenger
*m
, MonClient
*mc
)
14538 : Client(m
, mc
, new Objecter(m
->cct
, m
, mc
, NULL
, 0, 0))
14540 monclient
->set_messenger(m
);
14541 objecter
->set_client_incarnation(0);
// Tear down the privately-owned Objecter.  The deletion itself falls in an
// elided source line (presumably `delete objecter;` — confirm upstream);
// only the pointer reset is visible here.
14544 StandaloneClient::~StandaloneClient()
14547 objecter
= nullptr;
// Bring up the standalone client: start the object cacher, register the
// objecter and ourselves as messenger dispatchers, and initialize the
// monclient.  On monclient failure, tears everything back down (we are in an
// intermediate init state).  NOTE(review): the error-branch condition, the
// error/success returns, and the call into Client::init fall in elided lines.
14550 int StandaloneClient::init()
14553 objectcacher
->start();
14556 client_lock
.Lock();
14557 ceph_assert(!is_initialized());
// Dispatcher order matters: objecter first, then the client itself.
14559 messenger
->add_dispatcher_tail(objecter
);
14560 messenger
->add_dispatcher_tail(this);
14562 monclient
->set_want_keys(CEPH_ENTITY_TYPE_MDS
| CEPH_ENTITY_TYPE_OSD
);
14563 int r
= monclient
->init();
14565 // need to do cleanup because we're in an intermediate init state
// Failure path: drop the lock before shutting the subsystems down.
14567 client_lock
.Unlock();
14568 objecter
->shutdown();
14569 objectcacher
->stop();
14570 monclient
->shutdown();
14575 client_lock
.Unlock();
// Shut down in reverse of init(): generic client teardown first, then the
// privately-owned objecter, then the monclient.
14581 void StandaloneClient::shutdown()
14583 Client::shutdown();
14584 objecter
->shutdown();
14585 monclient
->shutdown();