// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/utsname.h>

#include <boost/lexical_cast.hpp>
#include <boost/fusion/include/std_pair.hpp>

#if defined(__FreeBSD__)
#define XATTR_CREATE  0x1
#define XATTR_REPLACE 0x2
#else
#include <sys/xattr.h>
#endif

#if defined(__linux__)
#include <linux/falloc.h>
#endif

#include <sys/statvfs.h>

#include "common/config.h"
#include "common/version.h"

#include "mon/MonClient.h"

#include "messages/MClientCaps.h"
#include "messages/MClientLease.h"
#include "messages/MClientQuota.h"
#include "messages/MClientReclaim.h"
#include "messages/MClientReclaimReply.h"
#include "messages/MClientReconnect.h"
#include "messages/MClientReply.h"
#include "messages/MClientRequest.h"
#include "messages/MClientRequestForward.h"
#include "messages/MClientSession.h"
#include "messages/MClientSnap.h"
#include "messages/MCommandReply.h"
#include "messages/MFSMap.h"
#include "messages/MFSMapUser.h"
#include "messages/MMDSMap.h"
#include "messages/MOSDMap.h"

#include "mds/flock.h"
#include "mds/cephfs_features.h"
#include "osd/OSDMap.h"
#include "osdc/Filer.h"

#include "common/Cond.h"
#include "common/Mutex.h"
#include "common/perf_counters.h"
#include "common/admin_socket.h"
#include "common/errno.h"
#include "include/str_list.h"

#define dout_subsys ceph_subsys_client

#include "include/lru.h"
#include "include/compat.h"
#include "include/stringify.h"

#include "Delegation.h"
#include "ClientSnapRealm.h"
#include "MetaSession.h"
#include "MetaRequest.h"
#include "ObjecterWriteback.h"
#include "posix_acl.h"

#include "include/ceph_assert.h"
#include "include/stat.h"

#include "include/cephfs/ceph_statx.h"

#if HAVE_GETGROUPLIST
#include <grp.h>
#include <pwd.h>
#include <unistd.h>
#endif

#define dout_prefix *_dout << "client." << whoami << " "

#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout

// FreeBSD fails to define this
#ifndef O_DSYNC
#define O_DSYNC 0x0
#endif
// Darwin fails to define this
#ifndef O_RSYNC
#define O_RSYNC 0x0
#endif

#ifndef FALLOC_FL_PUNCH_HOLE
# define FALLOC_FL_PUNCH_HOLE 0x2
#endif

#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)

void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
{
  Client *client = static_cast<Client*>(p);
  client->flush_set_callback(oset);
}

Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}

bool Client::CommandHook::call(std::string_view command,
			       const cmdmap_t& cmdmap,
			       std::string_view format, bufferlist& out)
{
  std::unique_ptr<Formatter> f(Formatter::create(format));
  f->open_object_section("result");
  m_client->client_lock.Lock();
  if (command == "mds_requests")
    m_client->dump_mds_requests(f.get());
  else if (command == "mds_sessions")
    m_client->dump_mds_sessions(f.get());
  else if (command == "dump_cache")
    m_client->dump_cache(f.get());
  else if (command == "kick_stale_sessions")
    m_client->_kick_stale_sessions();
  else if (command == "status")
    m_client->dump_status(f.get());
  else
    ceph_abort_msg("bad command registered");
  m_client->client_lock.Unlock();
  f->close_section();
  f->flush(out);
  return true;
}

dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }

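// Note: next_offset starts at 2 because readdir offsets 0 and 1 are
// reserved for the "." and ".." entries; real dentries are reported
// starting at offset 2 (see the readdir_offset handling in
// insert_readdir_results below).
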
void Client::_reset_faked_inos()
{
  ino_t start = 1024;
  free_faked_inos.clear();
  free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
  last_used_faked_ino = 0;
  last_used_faked_root = 0;
  _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
}

void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}

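// Allocation walks forward through free_faked_inos: e.g. with
// last_used_faked_ino == 5000 the next call takes the first free ino
// greater than 5000, and once the space past 2048 is exhausted it wraps
// back to 2048 (ids 1024~2048 stay reserved for _assign_faked_root).
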
/*
 * In the faked mode, if you export multiple subdirectories,
 * you will see that the inode numbers of the exported subdirectories
 * are the same. So we distinguish the mount points by reserving
 * the "fake ids" between "1024~2048" and combining the last
 * 10 bits (0x3ff) of the "root inodes".
 */
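// For example, a root inode whose ino ends in 0x3fa gets faked root id
// free-interval start + 0x3fa; with the usual start of 1024 that lands at
// 2042, safely inside the reserved 1024~2048 window.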
void Client::_assign_faked_root(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  if (it == free_faked_inos.end() && last_used_faked_root > 0) {
    last_used_faked_root = 0;
    it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  }
  assert(it != free_faked_inos.end());
  vinodeno_t inode_info = in->vino();
  uint64_t inode_num = (uint64_t)inode_info.ino;
  ldout(cct, 10) << "inode_num " << inode_num << " inode_num & 0x3ff=" << (inode_num & 0x3ff) << dendl;
  last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
  assert(it.get_start() + it.get_len() > last_used_faked_root);

  in->faked_ino = last_used_faked_root;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}

void Client::_release_faked_ino(Inode *in)
{
  free_faked_inos.insert(in->faked_ino);
  faked_ino_map.erase(in->faked_ino);
}

vinodeno_t Client::_map_faked_ino(ino_t ino)
{
  vinodeno_t vino;
  if (ino == 1)
    vino = root->vino();
  else if (faked_ino_map.count(ino))
    vino = faked_ino_map[ino];
  else
    vino = vinodeno_t(0, CEPH_NOSNAP);
  ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
  return vino;
}

vinodeno_t Client::map_faked_ino(ino_t ino)
{
  std::lock_guard lock(client_lock);
  return _map_faked_ino(ino);
}

// cons/des

Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    timer(m->cct, client_lock),
    client_lock("Client::client_lock"),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this)
{
  _reset_faked_inos();

  _dir_vxattrs_name_size = _vxattrs_calcu_name_size(_dir_vxattrs);
  _file_vxattrs_name_size = _vxattrs_calcu_name_size(_file_vxattrs);

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
						&client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				      client_flush_set_callback,    // all commit callback
				      (void*)this,
				      cct->_conf->client_oc_size,
				      cct->_conf->client_oc_max_objects,
				      cct->_conf->client_oc_max_dirty,
				      cct->_conf->client_oc_target_dirty,
				      cct->_conf->client_oc_max_dirty_age,
				      true));
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blacklist_events();
}

Client::~Client()
{
  ceph_assert(!client_lock.is_locked());

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  client_lock.Lock();
  tear_down_cache();
  client_lock.Unlock();
}

void Client::tear_down_cache()
{
  // fd's
  for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
       it != fd_map.end();
       ++it) {
    Fh *fh = it->second;
    ldout(cct, 1) << __func__ << " forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru
  trim_cache();
  ceph_assert(lru.lru_get_size() == 0);

  // close root ino
  ceph_assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }

  ceph_assert(inode_map.empty());
}

inodeno_t Client::get_root_ino()
{
  std::lock_guard l(client_lock);
  if (use_faked_inos())
    return root->faked_ino;
  else
    return root->ino;
}

Inode *Client::get_root()
{
  std::lock_guard l(client_lock);
  root->ll_get();
  return root;
}

// debug crapola

void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
{
  filepath path;
  in->make_long_path(path);
  ldout(cct, 1) << "dump_inode: "
		<< (disconnected ? "DISCONNECTED ":"")
		<< "inode " << in->ino
		<< " " << path
		<< " ref " << in->get_num_ref()
		<< *in << dendl;

  if (f) {
    f->open_object_section("inode");
    f->dump_stream("path") << path;
    if (disconnected)
      f->dump_int("disconnected", 1);
    in->dump(f);
    f->close_section();
  }

  did.insert(in);
  if (in->dir) {
    ldout(cct, 1) << "  dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
    for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
	 it != in->dir->dentries.end();
	 ++it) {
      ldout(cct, 1) << "   " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
      if (f) {
	f->open_object_section("dentry");
	it->second->dump(f);
	f->close_section();
      }
      if (it->second->inode)
	dump_inode(f, it->second->inode.get(), did, false);
    }
  }
}

void Client::dump_cache(Formatter *f)
{
  set<Inode*> did;

  ldout(cct, 1) << __func__ << dendl;

  if (f)
    f->open_array_section("cache");

  if (root)
    dump_inode(f, root, did, true);

  // make a second pass to catch anything disconnected
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       ++it) {
    if (did.count(it->second))
      continue;
    dump_inode(f, it->second, did, true);
  }

  if (f)
    f->close_section();
}

void Client::dump_status(Formatter *f)
{
  ceph_assert(client_lock.is_locked_by_me());

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blacklisted", blacklisted);
  }
}

int Client::init()
{
  timer.init();
  objectcacher->start();

  client_lock.Lock();
  ceph_assert(!initialized);

  messenger->add_dispatcher_tail(this);
  client_lock.Unlock();

  _finish_init();
  return 0;
}

void Client::_finish_init()
{
  client_lock.Lock();
  // logger
  PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
  plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
  plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
  plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
  plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
  plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
  logger.reset(plb.create_perf_counters());
  cct->get_perfcounters_collection()->add(logger.get());

  client_lock.Unlock();

  cct->_conf.add_observer(this);

  AdminSocket* admin_socket = cct->get_admin_socket();
  int ret = admin_socket->register_command("mds_requests",
					   "mds_requests",
					   &m_command_hook,
					   "show in-progress mds requests");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
	       << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("mds_sessions",
				       "mds_sessions",
				       &m_command_hook,
				       "show mds session state");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
	       << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("dump_cache",
				       "dump_cache",
				       &m_command_hook,
				       "show in-memory metadata cache contents");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
	       << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("kick_stale_sessions",
				       "kick_stale_sessions",
				       &m_command_hook,
				       "kick sessions that were remote reset");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
	       << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("status",
				       "status",
				       &m_command_hook,
				       "show overall client status");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
	       << cpp_strerror(-ret) << dendl;
  }

  client_lock.Lock();
  initialized = true;
  client_lock.Unlock();
}

void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  client_lock.Lock();
  _close_sessions();
  client_lock.Unlock();

  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }
  objectcacher->stop();  // outside of client_lock! this does a join.

  client_lock.Lock();
  ceph_assert(initialized);
  initialized = false;
  timer.shutdown();
  client_lock.Unlock();

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}

// ===================
// metadata cache stuff

void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!unmounting && lru.lru_get_size() <= max)  break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}

void Client::trim_cache_for_reconnect(MetaSession *s)
{
  mds_rank_t mds = s->mds_num;
  ldout(cct, 20) << __func__ << " mds." << mds << dendl;

  int trimmed = 0;
  list<Dentry*> skipped;
  while (lru.lru_get_size() > 0) {
    Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
    if (!dn)
      break;

    if ((dn->inode && dn->inode->caps.count(mds)) ||
	dn->dir->parent_inode->caps.count(mds)) {
      trim_dentry(dn);
      trimmed++;
    } else
      skipped.push_back(dn);
  }

  for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
    lru.lru_insert_mid(*p);

  ldout(cct, 20) << __func__ << " mds." << mds
		 << " trimmed " << trimmed << " dentries" << dendl;

  if (s->caps.size() > 0)
    _invalidate_kernel_dcache();
}

void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
		 << " in dir "
		 << std::hex << dn->dir->parent_inode->ino << std::dec
		 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    diri->dir_release_count++;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false);  // drop dir, drop dentry
}

void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
				    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
	in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}

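// truncate_seq orders competing size updates: e.g. a cached size of 4096 at
// truncate_seq 3 is only replaced by an MDS-reported size at truncate_seq 4
// (a new truncate), or by a larger size at the same seq (a plain append).
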
void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
				    utime_t ctime, utime_t mtime, utime_t atime)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
		 << " ctime " << ctime << " mtime " << mtime << dendl;

  if (time_warp_seq > in->time_warp_seq)
    ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
		   << " is higher than local time_warp_seq "
		   << in->time_warp_seq << dendl;

  int warn = false;
  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
		CEPH_CAP_FILE_WR|
		CEPH_CAP_FILE_BUFFER|
		CEPH_CAP_AUTH_EXCL|
		CEPH_CAP_XATTR_EXCL)) {
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      //the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      //take max times
      if (mtime > in->mtime)
	in->mtime = mtime;
      if (atime > in->atime)
	in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      //ignore mds values as we have a higher seq
      if (time_warp_seq != in->time_warp_seq)
	warn = true;
    } else warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else
      warn = true;
  }
  if (warn) {
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
		  << time_warp_seq << " is lower than local time_warp_seq "
		  << in->time_warp_seq
		  << dendl;
  }
}

void Client::_fragmap_remove_non_leaves(Inode *in)
{
  for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (!in->dirfragtree.is_leaf(p->first))
      in->fragmap.erase(p++);
    else
      ++p;
}

void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
{
  for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (p->second == mds)
      in->fragmap.erase(p++);
    else
      ++p;
}

Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      root = in;
      if (use_faked_inos())
	_assign_faked_root(root);
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
			   st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;   // as with readdir returning inodes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
		   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
		   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
	(st->cap.caps & CEPH_CAP_FILE_SHARED) &&
	(issued & CEPH_CAP_FILE_EXCL) == 0 &&
	in->dirstat.nfiles == 0 &&
	in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
	ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
	in->dir->readdir_cache.clear();
	for (const auto& p : in->dir->dentries) {
	  unlink(p.second, true, true);  // keep dir, keep dentry
	}
	if (in->dir->dentries.empty())
	  close_dir(in->dir);
      }
    }
  } else {
    in->snap_caps |= st->cap.caps;
  }

  return in;
}

/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
		     << " with correct vino " << dn->inode->vino()
		     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
		     << " with WRONG vino " << dn->inode->vino()
		     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	Inode *old_diri = old_dentry->dir->parent_inode;
	old_diri->dir_ordered_count++;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}

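// Bumping dir_ordered_count above invalidates the ordered_count snapshot any
// in-progress dir_result_t took when it started, so open readdirs stop
// trusting their cached ordering once a dentry is (re)linked in the dir.
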
void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  ceph_assert(dn);

  if (dlease->mask & CEPH_LOCK_DN) {
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
		     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
}

/*
 * update MDS location cache for a single inode
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!

  // dist
  /*
  if (!st->dirfrag_dist.empty()) {   // FIXME
    set<int> dist = st->dirfrag_dist.begin()->second;
    if (dist.empty() && !in->dir_contacts.empty())
      ldout(cct, 9) << "lost dist spec for " << in->ino
                    << " " << dist << dendl;
    if (!dist.empty() && in->dir_contacts.empty())
      ldout(cct, 9) << "got dist spec for " << in->ino
                    << " " << dist << dendl;
    in->dir_contacts = dist;
  }
  */
}

void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
{
  if (diri->flags & I_COMPLETE) {
    if (complete) {
      ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
      diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
    } else {
      if (diri->flags & I_DIR_ORDERED) {
	ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
	diri->flags &= ~I_DIR_ORDERED;
      }
    }
    if (diri->dir)
      diri->dir->readdir_cache.clear();
  }
}

/*
 * insert results from readdir or lssnap into the metadata cache.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
	last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
	/* mds understands offset_hash */
	last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
	readdir_offset = 2;
	readdir_start.clear();
	dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
		   << ", hash_order=" << hash_order
		   << ", readdir_start " << readdir_start
		   << ", last_hash " << last_hash
		   << ", next_offset " << readdir_offset << dendl;

    if (diri->snapid != CEPH_SNAPDIR &&
	fg.is_leftmost() && readdir_offset == 2 &&
	!(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
				   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
	Dentry *olddn = diri->dir->dentries[dname];
	if (olddn->inode != in) {
	  // replace incorrect dentry
	  unlink(olddn, true, true);  // keep dir, dentry
	  dn = link(dir, dname, in, olddn);
	  ceph_assert(dn == olddn);
	} else {
	  // keep existing dn
	  dn = olddn;
	  touch_dn(dn);
	}
      } else {
	// new dn
	dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
	unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
	if (hash != last_hash)
	  readdir_offset = 2;
	last_hash = hash;
	dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
	dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache
      if (dirp->release_count == diri->dir_release_count &&
	  dirp->ordered_count == diri->dir_ordered_count &&
	  dirp->start_shared_gen == diri->shared_gen) {
	if (dirp->cache_index == dir->readdir_cache.size()) {
	  if (i == 0) {
	    ceph_assert(!dirp->inode->is_complete_and_ordered());
	    dir->readdir_cache.reserve(dirp->cache_index + numdn);
	  }
	  dir->readdir_cache.push_back(dn);
	} else if (dirp->cache_index < dir->readdir_cache.size()) {
	  if (dirp->inode->is_complete_and_ordered())
	    ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
	  else
	    dir->readdir_cache[dirp->cache_index] = dn;
	} else {
	  ceph_abort_msg("unexpected readdir buffer idx");
	}
	dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}

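// dir_result_t::make_fpos packs two things into one 64-bit readdir position:
// the frag (or, when CEPH_READDIR_HASH_ORDER is set, the dentry name hash) in
// the high bits, and the per-frag entry index in the low bits. That is why
// readdir_offset resets to 2 above whenever the hash value changes.
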
/*
 * insert a trace from a MDS reply into the cache.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
		 << " is_target=" << (int)reply->head.is_target
		 << " is_dentry=" << (int)reply->head.is_dentry
		 << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
	// rename
	Dentry *od = request->old_dentry();
	ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
	ceph_assert(od);
	unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
		 op == CEPH_MDS_OP_UNLINK) {
	// unlink, rmdir
	ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
	unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
		 << " is_target=" << (int)reply->head.is_target
		 << " is_dentry=" << (int)reply->head.is_dentry
		 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
	wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
	wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
	ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
			  request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
			    request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
			  (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
	dn = diri->dir->dentries[dname];
	if (dn->inode) {
	  diri->dir_ordered_count++;
	  clear_dir_complete_and_ordered(diri, false);
	  unlink(dn, true, true);  // keep dir, dentry
	}
      }
      if (dlease.duration_ms > 0) {
	if (!dn) {
	  Dir *dir = diri->open_dir();
	  dn = link(dir, dname, NULL, NULL);
	}
	update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
	     op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
	Dentry *dn = diri->dir->dentries[dname];
	if (dn->inode)
	  unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
	op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}

mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
		     << " on " << req->path[0]
		     << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
		     << " on " << de->name
		     << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
	if (in->snapid == CEPH_SNAPDIR)
	  in = in->snapdir_parent.get();
	else if (!in->dentries.empty())
	  /* In most cases there will only be one dentry, so getting it
	   * will be the correct action. If there are multiple hard links,
	   * I think the MDS should be able to redirect as needed*/
	  in = in->get_first_parent()->dir->parent_inode;
	else {
	  ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
	  break;
	}
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
		   << " hash=" << hash << dendl;

    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
	mds = in->fragmap[fg];
	if (phash_diri)
	  *phash_diri = in;
      } else if (in->auth_cap) {
	mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
	ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
	goto out;
      }
    }

    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}

void Client::connect_mds_targets(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
  ceph_assert(mds_sessions.count(mds));
  const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
  for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
       q != info.export_targets.end();
       ++q) {
    if (mds_sessions.count(*q) == 0 &&
	mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
      ldout(cct, 10) << "check_mds_sessions opening mds." << mds
		     << " export target mds." << *q << dendl;
      _open_mds_session(*q);
    }
  }
}

void Client::dump_mds_sessions(Formatter *f)
{
  f->dump_int("id", get_nodeid().v);
  entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
  f->dump_object("inst", inst);
  f->dump_stream("inst_str") << inst;
  f->dump_stream("addr_str") << inst.addr;
  f->open_array_section("sessions");
  for (const auto &p : mds_sessions) {
    f->open_object_section("session");
    p.second.dump(f);
    f->close_section();
  }
  f->close_section();
  f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
}

void Client::dump_mds_requests(Formatter *f)
{
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    f->open_object_section("request");
    p->second->dump(f);
    f->close_section();
  }
}

int Client::verify_reply_trace(int r,
			       MetaRequest *request, const MConstRef<MClientReply>& reply,
			       InodeRef *ptarget, bool *pcreated,
			       const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    // if the extra bufferlist has a buffer, we assume its the created inode
    // and that this request to create succeeded in actually creating
    // the inode (won the race with other create requests)
    decode(created_ino, extra_bl);
    got_created_ino = true;
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name. someday, do this by the
      // ino... which we know! FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
	if (d->dir) {
	  ldout(cct, 10) << "make_request got traceless reply, looking up #"
			 << d->dir->parent_inode->ino << "/" << d->name
			 << " got_ino " << got_created_ino
			 << " ino " << created_ino
			 << dendl;
	  r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
			 &target, perms);
	} else {
	  // if the dentry is not linked, just do our best. see #5021.
	  ceph_abort_msg("how did this happen? i want logs!");
	}
      } else {
	Inode *in = request->inode();
	ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
		       << in->ino << dendl;
	r = _getattr(in, request->regetattr_mask, perms, true);
	target = in;
      }
      if (r >= 0) {
	// verify ino returned in reply and trace_dist are the same
	if (got_created_ino &&
	    created_ino.val != target->ino.val) {
	  ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
	  r = -EINTR;
	}
	if (ptarget)
	  ptarget->swap(target);
      }
    }
  }

  return r;
}

/**
 * make a request
 *
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 * a request).
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
int Client::make_request(MetaRequest *request,
			 const UserPerm& perms,
			 InodeRef *ptarget, bool *pcreated,
			 mds_rank_t use_mds,
			 bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  MetaSession *session = NULL;
  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    Cond caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
	if (hash_diri) {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
	  _fragmap_remove_stopped_mds(hash_diri, mds);
	} else {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
	  request->resend_mds = _get_random_up_mds();
	}
      } else {
	ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
	wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
	ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
	wait_on_context_list(session->waiting_for_open);
	// Abort requests on REJECT from MDS
	if (rejected_by_mds.count(mds)) {
	  request->abort(-EPERM);
	  break;
	}
	continue;
      }

      if (!have_open_session(mds))
	continue;
    } else {
      session = &mds_sessions.at(mds);
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    while (!request->reply &&         // reply
	   request->resend_mds < 0 && // forward
	   !request->kick)
      caller_cond.Wait(client_lock);
    request->caller_cond = NULL;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->Signal();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);
  return r;
}

void Client::unregister_request(MetaRequest *req)
{
  mds_requests.erase(req->tid);
  if (req->tid == oldest_tid) {
    map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
    while (true) {
      if (p == mds_requests.end()) {
	oldest_tid = 0;
	break;
      }
      if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
	oldest_tid = p->first;
	break;
      }
      ++p;
    }
  }
  put_request(req);
}

void Client::put_request(MetaRequest *request)
{
  if (request->_put()) {
    int op = -1;
    if (request->success)
      op = request->get_op();
    InodeRef other_in;
    request->take_other_inode(&other_in);
    delete request;

    if (other_in &&
	(op == CEPH_MDS_OP_RMDIR ||
	 op == CEPH_MDS_OP_RENAME ||
	 op == CEPH_MDS_OP_RMSNAP)) {
      _try_to_trim_inode(other_in.get(), false);
    }
  }
}

int Client::encode_inode_release(Inode *in, MetaRequest *req,
				 mds_rank_t mds, int drop,
				 int unless, int force)
{
  ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
		 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
		 << ", have:" << ", force:" << force << ")" << dendl;
  int released = 0;
  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    drop &= ~(in->dirty_caps | get_caps_used(in));
    if ((drop & cap.issued) &&
	!(unless & cap.issued)) {
      ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(cap.issued) << dendl;
      cap.issued &= ~drop;
      cap.implemented &= ~drop;
      released = 1;
      ldout(cct, 25) << "Now have: " << ccap_string(cap.issued) << dendl;
    } else {
      released = force;
    }
    if (released) {
      ceph_mds_request_release rel;
      rel.ino = in->ino;
      rel.cap_id = cap.cap_id;
      rel.seq = cap.seq;
      rel.issue_seq = cap.issue_seq;
      rel.mseq = cap.mseq;
      rel.caps = cap.implemented;
      rel.wanted = cap.wanted;
      rel.dname_len = 0;
      rel.dname_seq = 0;
      req->cap_releases.push_back(MClientRequest::Release(rel,""));
    }
  }
  ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
		 << released << dendl;
  return released;
}

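// Example of the drop/unless semantics above: drop=CEPH_CAP_FILE_SHARED with
// unless=CEPH_CAP_FILE_EXCL means "release Fs to the MDS, but only if we do
// not also hold Fx"; caps that are dirty or currently in use are never
// dropped.
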
void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
				   mds_rank_t mds, int drop, int unless)
{
  ldout(cct, 20) << __func__ << " enter(dn:"
		 << dn << ")" << dendl;
  int released = 0;
  if (dn->dir)
    released = encode_inode_release(dn->dir->parent_inode, req,
				    mds, drop, unless, 1);
  if (released && dn->lease_mds == mds) {
    ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
    auto& rel = req->cap_releases.back();
    rel.item.dname_len = dn->name.length();
    rel.item.dname_seq = dn->lease_seq;
    rel.dname = dn->name;
  }
  ldout(cct, 25) << __func__ << " exit(dn:"
		 << dn << ")" << dendl;
}

/*
 * This requires the MClientRequest *request member to be set.
 * It will error out horribly without one.
 * Additionally, if you set any *drop member, you'd better have
 * set the corresponding dentry!
 */
void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
{
  ldout(cct, 20) << __func__ << " enter (req: "
		 << req << ", mds: " << mds << ")" << dendl;
  if (req->inode_drop && req->inode())
    encode_inode_release(req->inode(), req,
			 mds, req->inode_drop,
			 req->inode_unless);

  if (req->old_inode_drop && req->old_inode())
    encode_inode_release(req->old_inode(), req,
			 mds, req->old_inode_drop,
			 req->old_inode_unless);
  if (req->other_inode_drop && req->other_inode())
    encode_inode_release(req->other_inode(), req,
			 mds, req->other_inode_drop,
			 req->other_inode_unless);

  if (req->dentry_drop && req->dentry())
    encode_dentry_release(req->dentry(), req,
			  mds, req->dentry_drop,
			  req->dentry_unless);

  if (req->old_dentry_drop && req->old_dentry())
    encode_dentry_release(req->old_dentry(), req,
			  mds, req->old_dentry_drop,
			  req->old_dentry_unless);
  ldout(cct, 25) << __func__ << " exit (req: "
		 << req << ", mds " << mds << dendl;
}

bool Client::have_open_session(mds_rank_t mds)
{
  const auto &it = mds_sessions.find(mds);
  return it != mds_sessions.end() &&
	 (it->second.state == MetaSession::STATE_OPEN ||
	  it->second.state == MetaSession::STATE_STALE);
}

MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
{
  const auto &it = mds_sessions.find(mds);
  if (it == mds_sessions.end() || it->second.con != con) {
    return NULL;
  } else {
    return &it->second;
  }
}

MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
{
  auto it = mds_sessions.find(mds);
  return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
}

/**
 * Populate a map of strings with client-identifying metadata,
 * such as the hostname. Call this once at initialization.
 */
void Client::populate_metadata(const std::string &mount_root)
{
  // Hostname
  struct utsname u;
  int r = uname(&u);
  if (r >= 0) {
    metadata["hostname"] = u.nodename;
    ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
  } else {
    ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
  }

  metadata["pid"] = stringify(getpid());

  // Ceph entity id (the '0' in "client.0")
  metadata["entity_id"] = cct->_conf->name.get_id();

  // Our mount position
  if (!mount_root.empty()) {
    metadata["root"] = mount_root;
  }

  // Ceph version
  metadata["ceph_version"] = pretty_version_to_str();
  metadata["ceph_sha1"] = git_version_to_str();

  // Apply any metadata from the user's configured overrides
  std::vector<std::string> tokens;
  get_str_vec(cct->_conf->client_metadata, ",", tokens);
  for (const auto &i : tokens) {
    auto eqpos = i.find("=");
    // Throw out anything that isn't of the form "<str>=<str>"
    if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
      lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
      continue;
    }
    metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
  }
}

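// The client_metadata config option is parsed above as a comma-separated
// list of key=value pairs, e.g. client_metadata = "rack=r1,feature=test";
// malformed entries are logged and skipped.
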
/**
 * Optionally add or override client metadata fields.
 */
void Client::update_metadata(std::string const &k, std::string const &v)
{
  std::lock_guard l(client_lock);
  ceph_assert(initialized);

  auto it = metadata.find(k);
  if (it != metadata.end()) {
    ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
		  << "' from '" << it->second << "' to '" << v << "'" << dendl;
  }

  metadata[k] = v;
}

MetaSession *Client::_open_mds_session(mds_rank_t mds)
{
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;
  auto addrs = mdsmap->get_addrs(mds);
  auto em = mds_sessions.emplace(std::piecewise_construct,
      std::forward_as_tuple(mds),
      std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
  ceph_assert(em.second); /* not already present */
  MetaSession *session = &em.first->second;

  // Maybe skip sending a request to open if this MDS daemon
  // has previously sent us a REJECT.
  if (rejected_by_mds.count(mds)) {
    if (rejected_by_mds[mds] == session->addrs) {
      ldout(cct, 4) << __func__ << " mds." << mds << " skipping "
                       "because we were rejected" << dendl;
      return session;
    } else {
      ldout(cct, 4) << __func__ << " mds." << mds << " old inst "
                       "rejected us, trying with new inst" << dendl;
      rejected_by_mds.erase(mds);
    }
  }

  auto m = MClientSession::create(CEPH_SESSION_REQUEST_OPEN);
  m->metadata = metadata;
  m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
  session->con->send_message2(std::move(m));
  return session;
}

void Client::_close_mds_session(MetaSession *s)
{
  ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
  s->state = MetaSession::STATE_CLOSING;
  s->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE, s->seq));
}

void Client::_closed_mds_session(MetaSession *s)
{
  ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
  s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  signal_context_list(s->waiting_for_open);
  mount_cond.Signal();
  remove_session_caps(s);
  kick_requests_closed(s);
  mds_sessions.erase(s->mds_num);
}

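// Session lifecycle in brief: _open_mds_session creates the session and sends
// REQUEST_OPEN; a CEPH_SESSION_OPEN reply moves it to STATE_OPEN;
// _close_mds_session sends REQUEST_CLOSE and marks it STATE_CLOSING; and
// _closed_mds_session (above) finishes teardown, dropping caps and kicking
// any requests that were in flight to that MDS.
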
void Client::handle_client_session(const MConstRef<MClientSession>& m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    {
      feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
      missing_features -= m->supported_features;
      if (!missing_features.empty()) {
        lderr(cct) << "mds." << from << " lacks required features '"
                   << missing_features << "', closing session " << dendl;
        rejected_by_mds[session->mds_num] = session->addrs;
        _close_mds_session(session);
        _closed_mds_session(session);
        break;
      }
      session->mds_features = std::move(m->supported_features);

      renew_caps(session);
      session->state = MetaSession::STATE_OPEN;
      if (unmounting)
        mount_cond.Signal();
      else
        connect_mds_targets(from);
      signal_context_list(session->waiting_for_open);
      break;
    }

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    if (session->cap_renew_seq == m->get_seq()) {
      bool was_stale = ceph_clock_now() >= session->cap_ttl;
      session->cap_ttl =
        session->last_cap_renew_request + mdsmap->get_session_timeout();
      if (was_stale)
        wake_up_session_caps(session, false);
    }
    break;

  case CEPH_SESSION_STALE:
    // invalidate session caps/leases
    session->cap_gen++;
    session->cap_ttl = ceph_clock_now();
    session->cap_ttl -= 1;
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    /* flush cap release */
    if (auto& m = session->release; m) {
      session->con->send_message2(std::move(m));
    }
    session->con->send_message2(MClientSession::create(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    {
      std::string_view error_str;
      auto it = m->metadata.find("error_string");
      if (it != m->metadata.end())
        error_str = it->second;
      else
        error_str = "unknown error";
      lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;

      rejected_by_mds[session->mds_num] = session->addrs;
      _closed_mds_session(session);
    }
    break;

  default:
    ceph_abort();
  }
}

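// The REJECT handling above records the rejecting instance's addrs in
// rejected_by_mds, which _open_mds_session consults so that we don't keep
// re-opening a session against the same MDS instance that refused us; a
// new instance at the same rank clears the entry and is tried again.
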
bool Client::_any_stale_sessions() const
{
  ceph_assert(client_lock.is_locked_by_me());

  for (const auto &p : mds_sessions) {
    if (p.second.state == MetaSession::STATE_STALE) {
      return true;
    }
  }

  return false;
}

void Client::_kick_stale_sessions()
{
  ldout(cct, 1) << __func__ << dendl;

  for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
    MetaSession &s = it->second;
    ++it;
    if (s.state == MetaSession::STATE_STALE)
      _closed_mds_session(&s);
  }
}

void Client::send_request(MetaRequest *request, MetaSession *session,
                          bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
                 << " for mds." << mds << dendl;
  auto r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't sent the cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    objecter->with_osdmap([r](const OSDMap& o) {
        r->set_osdmap_epoch(o.get_epoch());
      });
  }

  if (request->mds == -1) {
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  Inode *in = request->inode();
  if (in) {
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      request->sent_on_mseq = it->second.mseq;
    }
  }

  session->requests.push_back(&request->item);

  ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
  session->con->send_message2(std::move(r));
}

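// Cap releases ride along on the request (r->releases) unless
// drop_cap_releases is set, which happens when requests are replayed
// before the cap reconnect has been sent; releasing caps at that point
// could race with the MDS rebuilding its cap state from the reconnect.
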
MClientRequest::ref Client::build_client_request(MetaRequest *request)
{
  auto req = MClientRequest::create(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepaths haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
        de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
        de->dir->parent_inode->make_nosnap_relative_path(request->path);
        request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
                   << " No path, inode, or appropriately-endowed dentry given!"
                   << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
                   << " No path, inode, or dentry given!"
                   << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_data(request->data);
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}

void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
{
  mds_rank_t mds = mds_rank_t(fwd->get_source().num());
  MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
  if (!session) {
    return;
  }
  ceph_tid_t tid = fwd->get_tid();

  if (mds_requests.count(tid) == 0) {
    ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
    return;
  }

  MetaRequest *request = mds_requests[tid];
  ceph_assert(request);

  // reset retry counter
  request->retry_attempt = 0;

  // request not forwarded, or dest mds has no session.
  // resend.
  ldout(cct, 10) << __func__ << " tid " << tid
                 << " fwd " << fwd->get_num_fwd()
                 << " to mds." << fwd->get_dest_mds()
                 << ", resending to " << fwd->get_dest_mds()
                 << dendl;

  request->mds = -1;
  request->item.remove_myself();
  request->num_fwd = fwd->get_num_fwd();
  request->resend_mds = fwd->get_dest_mds();
  request->caller_cond->Signal();
}

bool Client::is_dir_operation(MetaRequest *req)
{
  int op = req->get_op();
  if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
      op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
      op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
      op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
    return true;
  return false;
}

void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
{
  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
  MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
  if (!session) {
    return;
  }

  ceph_tid_t tid = reply->get_tid();
  bool is_safe = reply->is_safe();

  if (mds_requests.count(tid) == 0) {
    lderr(cct) << __func__ << " no pending request on tid " << tid
               << " safe is:" << is_safe << dendl;
    return;
  }
  MetaRequest *request = mds_requests.at(tid);

  ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
                 << " tid " << tid << dendl;

  if (request->got_unsafe && !is_safe) {
    //duplicate response
    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
                  << mds_num << " safe:" << is_safe << dendl;
    return;
  }

  if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
    ldout(cct, 20) << "got ESTALE on tid " << request->tid
                   << " from mds." << request->mds << dendl;
    request->send_to_auth = true;
    request->resend_mds = choose_target_mds(request);
    Inode *in = request->inode();
    std::map<mds_rank_t, Cap>::const_iterator it;
    if (request->resend_mds >= 0 &&
        request->resend_mds == request->mds &&
        (in == NULL ||
         (it = in->caps.find(request->resend_mds)) != in->caps.end() ||
         request->sent_on_mseq == it->second.mseq)) {
      ldout(cct, 20) << "have to return ESTALE" << dendl;
    } else {
      request->caller_cond->Signal();
      return;
    }
  }

  ceph_assert(!request->reply);
  request->reply = reply;
  insert_trace(request, session);

  // Handle unsafe reply
  if (!is_safe) {
    request->got_unsafe = true;
    session->unsafe_requests.push_back(&request->unsafe_item);
    if (is_dir_operation(request)) {
      Inode *dir = request->inode();
      ceph_assert(dir);
      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
    }
    if (request->target) {
      InodeRef &in = request->target;
      in->unsafe_ops.push_back(&request->unsafe_target_item);
    }
  }

  // Only signal the caller once (on the first reply):
  // Either it's an unsafe reply, or it's a safe reply and no unsafe reply was sent.
  if (!is_safe || !request->got_unsafe) {
    Cond cond;
    request->dispatch_cond = &cond;

    // wake up waiter
    ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
    request->caller_cond->Signal();

    // wake for kick back
    while (request->dispatch_cond) {
      ldout(cct, 20) << __func__ << " awaiting kickback on tid " << tid << " " << &cond << dendl;
      cond.Wait(client_lock);
    }
  }

  if (is_safe) {
    // the filesystem change is committed to disk
    // we're done, clean up
    if (request->got_unsafe) {
      request->unsafe_item.remove_myself();
      request->unsafe_dir_item.remove_myself();
      request->unsafe_target_item.remove_myself();
      signal_cond_list(request->waitfor_safe);
    }
    request->item.remove_myself();
    unregister_request(request);
  }
  if (unmounting)
    mount_cond.Signal();
}

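// Reply lifecycle, roughly: an unsafe reply signals the caller and parks
// the request on the session's unsafe list; the later safe reply for the
// same tid unhooks it and unregisters the request. A request answered
// safely on the first reply skips the unsafe bookkeeping entirely.
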
void Client::_handle_full_flag(int64_t pool)
{
  ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
    << "on " << pool << dendl;
  // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
  // to do this rather than blocking, because otherwise when we fill up we
  // potentially lock caps forever on files with dirty pages, and we need
  // to be able to release those caps to the MDS so that it can delete files
  // and free up space.
  epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);

  // For all inodes with layouts in this pool and a pending flush write op
  // (i.e. one of the ones we will cancel), we've got to purge_set their data
  // from ObjectCacher so that it doesn't re-issue the write in response to
  // the ENOSPC error.
  // Fortunately since we're cancelling everything in a given pool, we don't
  // need to know which ops belong to which ObjectSet, we can just blow all
  // the un-flushed cached data away and mark any dirty inodes' async_err
  // field with -ENOSPC as long as we're sure all the ops we cancelled were
  // affecting this pool, and all the objectsets we're purging were also
  // in this pool.
  for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
       i != inode_map.end(); ++i)
  {
    Inode *inode = i->second;
    if (inode->oset.dirty_or_tx
        && (pool == -1 || inode->layout.pool_id == pool)) {
      ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
        << " has dirty objects, purging and setting ENOSPC" << dendl;
      objectcacher->purge_set(&inode->oset);
      inode->set_async_err(-ENOSPC);
    }
  }

  if (cancelled_epoch != (epoch_t)-1) {
    set_cap_epoch_barrier(cancelled_epoch);
  }
}

void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
{
  std::set<entity_addr_t> new_blacklists;
  objecter->consume_blacklist_events(&new_blacklists);

  const auto myaddrs = messenger->get_myaddrs();
  bool new_blacklist = false;
  bool prenautilus = objecter->with_osdmap(
    [&](const OSDMap& o) {
      return o.require_osd_release < CEPH_RELEASE_NAUTILUS;
    });
  if (!blacklisted) {
    for (auto a : myaddrs.v) {
      // blacklist entries are always TYPE_ANY for nautilus+
      a.set_type(entity_addr_t::TYPE_ANY);
      if (new_blacklists.count(a)) {
        new_blacklist = true;
        break;
      }
      if (prenautilus) {
        // ...except pre-nautilus, they were TYPE_LEGACY
        a.set_type(entity_addr_t::TYPE_LEGACY);
        if (new_blacklists.count(a)) {
          new_blacklist = true;
          break;
        }
      }
    }
  }
  if (new_blacklist) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
        return o.get_epoch();
      });
    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
    blacklisted = true;

    _abort_mds_sessions(-EBLACKLISTED);

    // Since we know all our OSD ops will fail, cancel them all preemptively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-EBLACKLISTED);

  } else if (blacklisted) {
    // Handle case where we were blacklisted but no longer are
    blacklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
        return o.is_blacklisted(myaddrs);});
  }

  // Always subscribe to next osdmap for blacklisted client
  // until this client is not blacklisted.
  if (blacklisted) {
    objecter->maybe_request_map();
  }

  if (objecter->osdmap_full_flag()) {
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // _handle_full_flag
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
        for (const auto& kv : o.get_pools()) {
          if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
            full_pools.push_back(kv.first);
          }
        }
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away.  For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }
}

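// Blacklist entries are keyed by entity_addr_t, whose type field changed
// across releases (TYPE_LEGACY pre-nautilus, TYPE_ANY after), hence the
// double probe above with both types before concluding we were listed.
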
// ------------------------
// incoming messages


bool Client::ms_dispatch2(const MessageRef &m)
{
  std::lock_guard l(client_lock);
  if (!initialized) {
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(MMDSMap::msgref_cast(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(MFSMap::msgref_cast(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(MFSMapUser::msgref_cast(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(MClientSession::msgref_cast(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(MOSDMap::msgref_cast(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(MClientRequestForward::msgref_cast(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(MClientReply::msgref_cast(m));
    break;

  // reclaim reply
  case CEPH_MSG_CLIENT_RECLAIM_REPLY:
    handle_client_reclaim_reply(MClientReclaimReply::msgref_cast(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(MClientSnap::msgref_cast(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(MClientCaps::msgref_cast(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(MClientLease::msgref_cast(m));
    break;
  case MSG_COMMAND_REPLY:
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(MCommandReply::msgref_cast(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(MClientQuota::msgref_cast(m));
    break;

  default:
    return false;
  }

  // unmounting?
  if (unmounting) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
                   << "+" << inode_map.size() << dendl;
    long unsigned size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    if (size < lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.Signal();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
                     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}

void Client::handle_fs_map(const MConstRef<MFSMap>& m)
{
  fsmap.reset(new FSMap(m->get_fsmap()));

  signal_cond_list(waiting_for_fsmap);

  monclient->sub_got("fsmap", fsmap->get_epoch());
}

void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
{
  fsmap_user.reset(new FSMapUser);
  *fsmap_user = m->get_fsmap();

  monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
  signal_cond_list(waiting_for_fsmap);
}

void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
{
  mds_gid_t old_inc, new_inc;
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
                  << " is identical to or older than our "
                  << mdsmap->get_epoch() << dendl;
    return;
  }

  ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;

  std::unique_ptr<MDSMap> oldmap(new MDSMap);
  oldmap.swap(mdsmap);

  mdsmap->decode(m->get_encoded());

  // Cancel any commands for missing or laggy GIDs
  std::list<ceph_tid_t> cancel_ops;
  auto &commands = command_table.get_commands();
  for (const auto &i : commands) {
    auto &op = i.second;
    const mds_gid_t op_mds_gid = op.mds_gid;
    if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
      cancel_ops.push_back(i.first);
      if (op.outs) {
        std::ostringstream ss;
        ss << "MDS " << op_mds_gid << " went away";
        *(op.outs) = ss.str();
      }
      op.con->mark_down();
      if (op.on_finish) {
        op.on_finish->complete(-ETIMEDOUT);
      }
    }
  }

  for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
       i != cancel_ops.end(); ++i) {
    command_table.erase(*i);
  }

  // reset session
  for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSession *session = &p->second;
    ++p;

    int oldstate = oldmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_addrs(mds) != session->addrs) {
      old_inc = oldmap->get_incarnation(mds);
      new_inc = mdsmap->get_incarnation(mds);
      if (old_inc != new_inc) {
        ldout(cct, 1) << "mds incarnation changed from "
                      << old_inc << " to " << new_inc << dendl;
        oldstate = MDSMap::STATE_NULL;
      }
      session->con->mark_down();
      session->addrs = mdsmap->get_addrs(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session);
    } else if (oldstate == newstate)
      continue;  // no change

    session->mds_state = newstate;
    if (old_inc != new_inc && newstate > MDSMap::STATE_RECONNECT) {
      // missed reconnect close the session so that it can be reopened
      _closed_mds_session(session);
      continue;
    }
    if (newstate == MDSMap::STATE_RECONNECT) {
      session->con = messenger->connect_to_mds(session->addrs);
      send_reconnect(session);
    } else if (newstate >= MDSMap::STATE_ACTIVE) {
      if (oldstate < MDSMap::STATE_ACTIVE) {
        // kick new requests
        kick_requests(session);
        kick_flushing_caps(session);
        signal_context_list(session->waiting_for_open);
        wake_up_session_caps(session, true);
      }
      connect_mds_targets(mds);
    } else if (newstate == MDSMap::STATE_NULL &&
               mds >= mdsmap->get_max_mds()) {
      _closed_mds_session(session);
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}

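// Per-rank transitions handled above, in brief: rank down -> mark the
// connection down; addrs changed -> reconnect (and reset oldstate if the
// incarnation changed); entering RECONNECT -> send_reconnect(); reaching
// ACTIVE from below -> kick pending requests and flushing caps.
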
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  session->release.reset();

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  early_kick_flushing_caps(session);

  auto m = MClientReconnect::create();
  bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      if (allow_multi &&
          m->get_approx_size() >= (std::numeric_limits<int>::max() >> 1)) {
        m->mark_more();
        session->con->send_message2(std::move(m));

        m = MClientReconnect::create();
      }

      Cap &cap = it->second;
      ldout(cct, 10) << " caps on " << p->first
                     << " " << ccap_string(cap.issued)
                     << " wants " << ccap_string(in->caps_wanted())
                     << dendl;
      filepath path;
      in->make_long_path(path);
      ldout(cct, 10) << "    path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      cap.seq = 0;  // reset seq.
      cap.issue_seq = 0;  // reset seq.
      cap.mseq = 0;  // reset seq.
      // cap gen should catch up with session cap_gen
      if (cap.gen < session->cap_gen) {
        cap.gen = session->cap_gen;
        cap.issued = cap.implemented = CEPH_CAP_PIN;
      } else {
        cap.issued = cap.implemented;
      }
      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
        snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
                 cap.cap_id,
                 path.get_ino(), path.get_path(),   // ino
                 in->caps_wanted(), // wanted
                 cap.issued,     // issued
                 in->snaprealm->ino,
                 snap_follows,
                 flockbl);

      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
        ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
        m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
        did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  if (!allow_multi)
    m->set_encoding_version(0); // use connection features to choose encoding
  session->con->send_message2(std::move(m));

  mount_cond.Signal();

  if (session->reclaim_state == MetaSession::RECLAIMING)
    signal_cond_list(waiting_for_reclaim);
}

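// With CEPHFS_FEATURE_MULTI_RECONNECT the reconnect payload may be split
// across several MClientReconnect messages (mark_more() on all but the
// last); otherwise everything must fit in one message, encoded with the
// version chosen from connection features.
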
void Client::kick_requests(MetaSession *session)
{
  ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    MetaRequest *req = p->second;
    if (req->got_unsafe)
      continue;
    if (req->aborted()) {
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->Signal();
      }
      continue;
    }
    if (req->retry_attempt > 0)
      continue; // new requests only
    if (req->mds == session->mds_num) {
      send_request(p->second, session);
    }
  }
}

void Client::resend_unsafe_requests(MetaSession *session)
{
  for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
       !iter.end();
       ++iter)
    send_request(*iter, session);

  // also re-send old requests when MDS enters reconnect stage. So that MDS can
  // process completed requests in clientreplay stage.
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    MetaRequest *req = p->second;
    if (req->got_unsafe)
      continue;
    if (req->aborted())
      continue;
    if (req->retry_attempt == 0)
      continue; // old requests only
    if (req->mds == session->mds_num)
      send_request(req, session, true);
  }
}

void Client::wait_unsafe_requests()
{
  list<MetaRequest*> last_unsafe_reqs;
  for (const auto &p : mds_sessions) {
    const MetaSession &s = p.second;
    if (!s.unsafe_requests.empty()) {
      MetaRequest *req = s.unsafe_requests.back();
      req->get();
      last_unsafe_reqs.push_back(req);
    }
  }

  for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
       p != last_unsafe_reqs.end();
       ++p) {
    MetaRequest *req = *p;
    if (req->unsafe_item.is_on_list())
      wait_on_list(req->waitfor_safe);
    put_request(req);
  }
}

void Client::kick_requests_closed(MetaSession *session)
{
  ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end(); ) {
    MetaRequest *req = p->second;
    ++p;
    if (req->mds == session->mds_num) {
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->Signal();
      }
      req->item.remove_myself();
      if (req->got_unsafe) {
        lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
        req->unsafe_item.remove_myself();
        req->unsafe_dir_item.remove_myself();
        req->unsafe_target_item.remove_myself();
        signal_cond_list(req->waitfor_safe);
        unregister_request(req);
      }
    }
  }
  ceph_assert(session->requests.empty());
  ceph_assert(session->unsafe_requests.empty());
}

void Client::got_mds_push(MetaSession *s)
{
  s->seq++;
  ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
  if (s->state == MetaSession::STATE_CLOSING) {
    s->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE, s->seq));
  }
}

void Client::handle_lease(const MConstRef<MClientLease>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;

  ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);

  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session);

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LOCK_DN) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname << dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1;
  }

 revoke:
  {
    auto reply = MClientLease::create(CEPH_MDS_LEASE_RELEASE, seq, m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname);
    m->get_connection()->send_message2(std::move(reply));
  }
}

void Client::put_inode(Inode *in, int n)
{
  ldout(cct, 10) << __func__ << " on " << *in << dendl;
  int left = in->_put(n);
  if (left == 0) {
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
    bool unclean = objectcacher->release_set(&in->oset);
    ceph_assert(!unclean);
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    if (in == root) {
      root = 0;
      root_ancestor = 0;
      while (!root_parents.empty())
        root_parents.erase(root_parents.begin());
    }

    delete in;
  }
}

void Client::close_dir(Dir *dir)
{
  Inode *in = dir->parent_inode;
  ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
  ceph_assert(dir->is_empty());
  ceph_assert(in->dir == dir);
  ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
  if (!in->dentries.empty())
    in->get_first_parent()->put();   // unpin dentry

  delete in->dir;
  in->dir = 0;
  put_inode(in);               // unpin inode
}

/**
 * Don't call this with in==NULL, use get_or_create for that
 * leave dn set to default NULL unless you're trying to add
 * a new inode to a pre-created Dentry
 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry(dir, name);

    lru.lru_insert_mid(dn);    // mid or top?

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << dn << " (new dn)" << dendl;
  } else {
    ceph_assert(!dn->inode);
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    InodeRef tmp_ref;
    // only one parent for directories!
    if (in->is_dir() && !in->dentries.empty()) {
      tmp_ref = in; // prevent unlink below from freeing the inode.
      Dentry *olddn = in->get_first_parent();
      ceph_assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      old_diri->dir_release_count++;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    dn->link(in);
    ldout(cct, 20) << "link  inode " << in << " parents now " << in->dentries << dendl;
  }

  return dn;
}

void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
{
  InodeRef in(dn->inode);
  ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
                 << " inode " << dn->inode << dendl;

  // unlink from inode
  if (dn->inode) {
    dn->unlink();
    ldout(cct, 20) << "unlink  inode " << in << " parents now " << in->dentries << dendl;
  }

  if (keepdentry) {
    dn->lease_mds = -1;
  } else {
    ldout(cct, 15) << "unlink  removing '" << dn->name << "' dn " << dn << dendl;

    // unlink from dir
    Dir *dir = dn->dir;
    dn->detach();

    // delete den
    lru.lru_remove(dn);
    dn->put();

    if (dir->is_empty() && !keepdir)
      close_dir(dir);
  }
}

/**
 * For asynchronous flushes, check for errors from the IO and
 * update the inode if necessary
 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  void finish(int r) override {
    ceph_assert(client->client_lock.is_locked_by_me());
    if (r != 0) {
      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
        << " 0x" << std::hex << inode->ino << std::dec
        << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      inode->set_async_err(r);
    }
  }
};

void Client::get_cap_ref(Inode *in, int cap)
{
  if ((cap & CEPH_CAP_FILE_BUFFER) &&
      in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
    ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
    in->get();
  }
  if ((cap & CEPH_CAP_FILE_CACHE) &&
      in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
    ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
    in->get();
  }
  in->get_cap_ref(cap);
}

void Client::put_cap_ref(Inode *in, int cap)
{
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & CEPH_CAP_FILE_WR) &&
          !in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.writing) {
        ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
        in->cap_snaps.rbegin()->second.writing = 0;
        finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
        signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
        for (auto &p : in->cap_snaps)
          p.second.dirty_data = 0;
        signal_cond_list(in->waitfor_commit);
        ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
        ++put_nref;
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);
  }
}

int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
{
  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
                     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
                     << dendl;
      return -EBADF;
    }

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      if (endoff > 0 &&
          (endoff >= (loff_t)in->max_size ||
           endoff > (loff_t)(in->size << 1)) &&
          endoff > (loff_t)in->wanted_max_size) {
        ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
        in->wanted_max_size = endoff;
        check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
        ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
        waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
        if (in->cap_snaps.rbegin()->second.writing) {
          ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
          waitfor_caps = true;
        }
        for (auto &p : in->cap_snaps) {
          if (p.second.dirty_data) {
            waitfor_commit = true;
            break;
          }
        }
        if (waitfor_commit) {
          _flush(in, new C_Client_FlushComplete(this, in));
          ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
        }
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
        int revoking = implemented & ~have;
        ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
                       << " need " << ccap_string(need) << " want " << ccap_string(want)
                       << " revoking " << ccap_string(revoking)
                       << dendl;
        if ((revoking & want) == 0) {
          *phave = need | (have & want);
          in->get_cap_ref(need);
          return 0;
        }
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
        in->auth_cap->session->readonly)
      return -EROFS;

    if (in->flags & I_CAP_DROPPED) {
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
        int ret = _renew_caps(in);
        if (ret < 0)
          return ret;
        continue;
      }
      if (!(file_wanted & ~mds_wanted))
        in->flags &= ~I_CAP_DROPPED;
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}

int Client::get_caps_used(Inode *in)
{
  unsigned used = in->caps_used();
  if (!(used & CEPH_CAP_FILE_CACHE) &&
      !objectcacher->set_is_empty(&in->oset))
    used |= CEPH_CAP_FILE_CACHE;
  return used;
}

void Client::cap_delay_requeue(Inode *in)
{
  ldout(cct, 10) << __func__ << " on " << *in << dendl;
  in->hold_caps_until = ceph_clock_now();
  in->hold_caps_until += cct->_conf->client_caps_release_delay;
  delayed_list.push_back(&in->delay_cap_item);
}

void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
                      bool sync, int used, int want, int retain,
                      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << __func__ << " " << *in
                 << " mds." << session->mds_num << " seq " << cap->seq
                 << (sync ? " sync " : " async ")
                 << " used " << ccap_string(used)
                 << " want " << ccap_string(want)
                 << " flush " << ccap_string(flush)
                 << " retain " << ccap_string(retain)
                 << " held "<< ccap_string(held)
                 << " revoking " << ccap_string(revoking)
                 << " dropping " << ccap_string(dropping)
                 << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush & CEPH_CAP_ANY_FILE_WR)
    follows = in->snaprealm->get_snap_context().seq;

  auto m = MClientCaps::create(op,
                               in->ino,
                               0,
                               cap->cap_id, cap->seq,
                               cap->implemented,
                               want,
                               flush,
                               cap->mseq,
                               cap_epoch_barrier);
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  if (flush & CEPH_CAP_XATTR_EXCL) {
    encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;
  if (sync)
    m->flags |= MClientCaps::FLAG_SYNC;
  if (!in->cap_snaps.empty())
    m->flags |= MClientCaps::FLAG_PENDING_CAPSNAP;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    m->set_max_size(in->wanted_max_size);
    in->requested_max_size = in->wanted_max_size;
    ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message2(std::move(m));
}

static bool is_max_size_approaching(Inode *in)
{
  /* mds will adjust max size according to the reported size */
  if (in->flushing_caps & CEPH_CAP_FILE_WR)
    return false;
  if (in->size >= in->max_size)
    return true;
  /* half of previous max_size increment has been used */
  if (in->max_size > in->reported_size &&
      (in->size << 1) >= in->max_size + in->reported_size)
    return true;
  return false;
}

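// Worked example for the predicate above: max_size=8MB, reported_size=4MB.
// It fires once size reaches 6MB, since (6 << 1) = 12 >= 8 + 4, i.e. half
// of the last max_size increment has been consumed and it's time to ask
// the MDS for more room before writers block on max_size.
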
static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
{
  if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
    return used;
  if (!(implemented & CEPH_CAP_FILE_LAZYIO))
    return used;

  if (issued & CEPH_CAP_FILE_LAZYIO) {
    if (!(issued & CEPH_CAP_FILE_CACHE)) {
      used &= ~CEPH_CAP_FILE_CACHE;
      used |= CEPH_CAP_FILE_LAZYIO;
    }
    if (!(issued & CEPH_CAP_FILE_BUFFER)) {
      used &= ~CEPH_CAP_FILE_BUFFER;
      used |= CEPH_CAP_FILE_LAZYIO;
    }
  } else {
    if (!(implemented & CEPH_CAP_FILE_CACHE)) {
      used &= ~CEPH_CAP_FILE_CACHE;
      used |= CEPH_CAP_FILE_LAZYIO;
    }
    if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
      used &= ~CEPH_CAP_FILE_BUFFER;
      used |= CEPH_CAP_FILE_LAZYIO;
    }
  }
  return used;
}

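// e.g. a client doing lazy I/O may hold page-cache references (Fc/Fb used)
// while only Fl remains issued; reporting that usage as Fl lets the MDS
// finish revoking Fc/Fb without waiting on the lazy reader or writer.
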
/**
 * Examine currently used and wanted versus held caps. Release, flush or ack
 * revoked caps to the MDS as appropriate.
 *
 * @param in the inode to check
 * @param flags flags to apply to cap check
 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  int orig_used = used;
  used = adjust_caps_used_for_lazyio(used, issued, implemented);

  int retain = wanted | used | CEPH_CAP_PIN;
  if (!unmounting && in->nlink > 0) {
    if (wanted) {
      retain |= CEPH_CAP_ANY;
    } else if (in->is_dir() &&
               (issued & CEPH_CAP_FILE_SHARED) &&
               (in->flags & I_COMPLETE)) {
      // we do this here because we don't want to drop to Fs (and then
      // drop the Fs if we do a create!) if that alone makes us send lookups
      // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
      wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
      retain |= wanted;
    } else {
      retain |= CEPH_CAP_ANY_SHARED;
      // keep RD only if we didn't have the file open RW,
      // because then the mds would revoke it anyway to
      // journal max_size=0.
      if (in->max_size == 0)
        retain |= CEPH_CAP_ANY_RD;
    }
  }

  ldout(cct, 10) << __func__ << " on " << *in
                 << " wanted " << ccap_string(wanted)
                 << " used " << ccap_string(used)
                 << " issued " << ccap_string(issued)
                 << " revoking " << ccap_string(revoking)
                 << " flags=" << flags
                 << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return;   // guard if at end of func

  if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
      (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    if (_release(in))
      used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
  }

  if (!in->cap_snaps.empty())
    flush_snaps(in);

  for (auto &p : in->caps) {
    mds_rank_t mds = p.first;
    Cap &cap = p.second;

    MetaSession *session = &mds_sessions.at(mds);

    cap_used = used;
    if (in->auth_cap && &cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap.implemented & ~cap.issued;

    ldout(cct, 10) << " cap mds." << mds
                   << " issued " << ccap_string(cap.issued)
                   << " implemented " << ccap_string(cap.implemented)
                   << " revoking " << ccap_string(revoking) << dendl;

    if (in->wanted_max_size > in->max_size &&
        in->wanted_max_size > in->requested_max_size &&
        &cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap.issued & CEPH_CAP_FILE_WR) &&
        &cap == in->auth_cap &&
        is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
                     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap.wanted | cap.issued))
      goto ack;

    if (!revoking && unmounting && (cap_used == 0))
      goto ack;

    if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
        !in->dirty_caps)               // and we have no dirty caps
      continue;

    if (!(flags & CHECK_CAPS_NODELAY)) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      cap_delay_requeue(in);
      continue;
    }

  ack:
    // re-send old cap/snapcap flushes first.
    if (session->mds_state >= MDSMap::STATE_RECONNECT &&
        session->mds_state < MDSMap::STATE_ACTIVE &&
        session->early_flushing_caps.count(in) == 0) {
      ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
                     << " to mds." << session->mds_num << dendl;
      session->early_flushing_caps.insert(in);
      if (in->cap_snaps.size())
        flush_snaps(in, true);
      if (in->flushing_caps)
        flush_caps(in, session, flags & CHECK_CAPS_SYNCHRONOUS);
    }

    int flushing;
    ceph_tid_t flush_tid;
    if (in->auth_cap == &cap && in->dirty_caps) {
      flushing = mark_caps_flushing(in, &flush_tid);
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    send_cap(in, session, &cap, flags & CHECK_CAPS_SYNCHRONOUS, cap_used, wanted,
             retain, flushing, flush_tid);
  }
}

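// All of the "goto ack" cases above converge on a single send_cap() per
// cap: flush dirty metadata if we hold the auth cap, and report back the
// (possibly reduced) issued set so the MDS can complete its revocation.
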
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||
             (used & CEPH_CAP_FILE_WR) ||
             (dirty & CEPH_CAP_ANY_WR)) {
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    ceph_assert(capsnapem.second); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();

    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;

    if (used & CEPH_CAP_FILE_WR) {
      ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
  }
}

void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;
  capsnap.dirty |= in->caps_dirty();

  /* Only reset it if it wasn't set before */
  if (capsnap.cap_dirtier_uid == -1) {
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
  }

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
                   << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}

void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
{
  ldout(cct, 10) << __func__ << " seq " << seq << " on " << *in << dendl;
  in->cap_snaps.at(seq).dirty_data = 0;
  flush_snaps(in);
}

void Client::flush_snaps(Inode *in, bool all_again)
{
  ldout(cct, 10) << "flush_snaps on " << *in << " all_again " << all_again << dendl;
  ceph_assert(in->cap_snaps.size());

  // pick auth mds
  ceph_assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;
  int mseq = in->auth_cap->mseq;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    if (!all_again) {
      // only flush once per session
      if (capsnap.flush_tid > 0)
        continue;
    }

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
                   << " follows " << p.first
                   << " size " << capsnap.size
                   << " mtime " << capsnap.mtime
                   << " dirty_data=" << capsnap.dirty_data
                   << " writing=" << capsnap.writing
                   << " on " << *in << dendl;
    if (capsnap.dirty_data || capsnap.writing)
      continue;

    if (capsnap.flush_tid == 0) {
      capsnap.flush_tid = ++last_flush_tid;
      if (!in->flushing_cap_item.is_on_list())
        session->flushing_caps.push_back(&in->flushing_cap_item);
      session->flushing_caps_tids.insert(capsnap.flush_tid);
    }

    auto m = MClientCaps::create(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
                                 cap_epoch_barrier);
    m->caller_uid = capsnap.cap_dirtier_uid;
    m->caller_gid = capsnap.cap_dirtier_gid;

    m->set_client_tid(capsnap.flush_tid);
    m->head.snap_follows = p.first;

    m->head.caps = capsnap.issued;
    m->head.dirty = capsnap.dirty;

    m->head.uid = capsnap.uid;
    m->head.gid = capsnap.gid;
    m->head.mode = capsnap.mode;
    m->btime = capsnap.btime;

    m->size = capsnap.size;

    m->head.xattr_version = capsnap.xattr_version;
    encode(capsnap.xattrs, m->xattrbl);

    m->ctime = capsnap.ctime;
    m->btime = capsnap.btime;
    m->mtime = capsnap.mtime;
    m->atime = capsnap.atime;
    m->time_warp_seq = capsnap.time_warp_seq;
    m->change_attr = capsnap.change_attr;

    if (capsnap.dirty & CEPH_CAP_FILE_WR) {
      m->inline_version = in->inline_version;
      m->inline_data = in->inline_data;
    }

    ceph_assert(!session->flushing_caps_tids.empty());
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

    session->con->send_message2(std::move(m));
  }
}

void Client::wait_on_list(list<Cond*>& ls)
{
  Cond cond;
  ls.push_back(&cond);
  cond.Wait(client_lock);
  ls.remove(&cond);
}

void Client::signal_cond_list(list<Cond*>& ls)
{
  for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
    (*it)->Signal();
}

void Client::wait_on_context_list(list<Context*>& ls)
{
  Cond cond;
  bool done = false;
  int r;
  ls.push_back(new C_Cond(&cond, &done, &r));
  while (!done)
    cond.Wait(client_lock);
}

void Client::signal_context_list(list<Context*>& ls)
{
  while (!ls.empty()) {
    ls.front()->complete(0);
    ls.pop_front();
  }
}

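// These helpers implement the wait/wake pattern used throughout: a waiter
// pushes a Cond (or a C_Cond context) onto the list and sleeps on
// client_lock; the waker signals or completes every entry. Waits must
// therefore happen with client_lock held, matching Cond::Wait semantics.
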
void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
{
  for (const auto &cap : s->caps) {
    auto &in = cap->inode;
    if (reconnect) {
      in.requested_max_size = 0;
      in.wanted_max_size = 0;
    } else {
      if (cap->gen < s->cap_gen) {
        // mds did not re-issue stale cap.
        cap->issued = cap->implemented = CEPH_CAP_PIN;
        // make sure mds knows what we want.
        if (in.caps_file_wanted() & ~cap->wanted)
          in.flags |= I_CAP_DROPPED;
      }
    }
    signal_cond_list(in.waitfor_caps);
  }
}

// flush dirty data (from objectcache)

class C_Client_CacheInvalidate : public Context  {
private:
  Client *client;
  vinodeno_t ino;
  int64_t offset, length;
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    ceph_assert(!client->client_lock.is_locked_by_me());
    client->_async_invalidate(ino, offset, length);
  }
};

void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
{
  if (unmounting)
    return;
  ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
  ino_invalidate_cb(callback_handle, ino, off, len);
}

void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {

  if (ino_invalidate_cb)
    // we queue the invalidate, which calls the callback and decrements the ref
    async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
}

void Client::_invalidate_inode_cache(Inode *in)
{
  ldout(cct, 10) << __func__ << " " << *in << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    objectcacher->release_set(&in->oset);
    if (!objectcacher->set_is_empty(&in->oset))
      lderr(cct) << "failed to invalidate cache for " << *in << dendl;
  }

  _schedule_invalidate_callback(in, 0, 0);
}

void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    vector<ObjectExtent> ls;
    Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
    objectcacher->discard_writeback(&in->oset, ls, nullptr);
  }

  _schedule_invalidate_callback(in, off, len);
}

bool Client::_release(Inode *in)
{
  ldout(cct, 20) << "_release " << *in << dendl;
  if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
    _invalidate_inode_cache(in);
    return true;
  }
  return false;
}

bool Client::_flush(Inode *in, Context *onfinish)
{
  ldout(cct, 10) << "_flush " << *in << dendl;

  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    onfinish->complete(0);
    return true;
  }

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
    objectcacher->purge_set(&in->oset);
    if (onfinish) {
      onfinish->complete(-ENOSPC);
    }
    return true;
  }

  return objectcacher->flush_set(&in->oset, onfinish);
}

void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  ceph_assert(client_lock.is_locked());
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  C_SaferCond onflush("Client::_flush_range flock");
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
                                      offset, size, &onflush);
  if (!ret) {
    // wait for flush
    client_lock.Unlock();
    onflush.wait();
    client_lock.Lock();
  }
}

void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
{
  //  std::lock_guard l(client_lock);
  ceph_assert(client_lock.is_locked());   // will be called via dispatch() -> objecter -> ...
  Inode *in = static_cast<Inode *>(oset->parent);
  ceph_assert(in);
  _flushed(in);
}

void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}

// checks common to add_update_cap, handle_cap_grant
void Client::check_cap_issue(Inode *in, unsigned issued)
{
  unsigned had = in->caps_issued();

  if ((issued & CEPH_CAP_FILE_CACHE) &&
      !(had & CEPH_CAP_FILE_CACHE))
    in->cache_gen++;

  if ((issued & CEPH_CAP_FILE_SHARED) &&
      !(had & CEPH_CAP_FILE_SHARED)) {
    in->shared_gen++;

    if (in->is_dir())
      clear_dir_complete_and_ordered(in, true);
  }
}

void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
                            unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
                            inodeno_t realm, int flags, const UserPerm& cap_perms)
{
  if (!in->is_any_caps()) {
    ceph_assert(in->snaprealm == 0);
    in->snaprealm = get_snap_realm(realm);
    in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
    ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
  } else {
    ceph_assert(in->snaprealm);
    if ((flags & CEPH_CAP_FLAG_AUTH) &&
        realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
      in->snaprealm_item.remove_myself();
      auto oldrealm = in->snaprealm;
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      put_snap_realm(oldrealm);
    }
  }

  mds_rank_t mds = mds_session->mds_num;
  const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
  Cap &cap = capem.first->second;
  if (!capem.second) {
    if (cap.gen < mds_session->cap_gen)
      cap.issued = cap.implemented = CEPH_CAP_PIN;

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap.seq) <= 0) {
      ceph_assert(&cap == in->auth_cap);
      ceph_assert(cap.cap_id == cap_id);
      seq = cap.seq;
      mseq = cap.mseq;
      issued |= cap.issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  }

  check_cap_issue(in, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    if (in->auth_cap != &cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
        ldout(cct, 10) << __func__ << " changing auth cap: "
                       << "add myself to new auth MDS' flushing caps list" << dendl;
        adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = &cap;
    }
  }

  unsigned old_caps = cap.issued;
  cap.cap_id = cap_id;
  cap.issued = issued;
  cap.implemented |= issued;
  if (ceph_seq_cmp(mseq, cap.mseq) > 0)
    cap.wanted = wanted;
  else
    cap.wanted |= wanted;
  cap.seq = seq;
  cap.issue_seq = seq;
  cap.mseq = mseq;
  cap.gen = mds_session->cap_gen;
  cap.latest_perms = cap_perms;
  ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
                 << " from mds." << mds
                 << " on " << *in
                 << dendl;

  if ((issued & ~old_caps) && in->auth_cap == &cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (auto &p : in->caps) {
      if (&p.second == &cap)
        continue;
      if (p.second.implemented & ~p.second.issued & issued) {
        check_caps(in, CHECK_CAPS_NODELAY);
        break;
      }
    }
  }

  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}

void Client::remove_cap(Cap *cap, bool queue_release)
{
  auto &in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;

  if (queue_release) {
    session->enqueue_cap_release(
      in.ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in.auth_cap == cap) {
    if (in.flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in.flushing_cap_item.remove_myself();
    }
    in.auth_cap = NULL;
  }
  size_t n = in.caps.erase(mds);
  ceph_assert(n == 1);
  cap = nullptr;

  if (!in.is_any_caps()) {
    ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
    in.snaprealm_item.remove_myself();
    put_snap_realm(in.snaprealm);
    in.snaprealm = 0;
  }
}

void Client::remove_all_caps(Inode *in)
{
  while (!in->caps.empty())
    remove_cap(&in->caps.begin()->second, true);
}

void Client::remove_session_caps(MetaSession *s)
{
  ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    InodeRef in(&cap->inode);
    bool dirty_caps = false, cap_snaps = false;
    if (in->auth_cap == cap) {
      cap_snaps = !in->cap_snaps.empty();
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
    if (cap->wanted | cap->issued)
      in->flags |= I_CAP_DROPPED;
    remove_cap(cap, false);
    if (cap_snaps) {
      in->cap_snaps.clear();
    }
    if (dirty_caps) {
      lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
        num_flushing_caps--;
        in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      put_inode(in.get());
    }
    signal_cond_list(in->waitfor_caps);
  }
  s->flushing_caps_tids.clear();
  sync_cond.Signal();
}

int Client::_do_remount(bool retry_on_error)
{
  uint64_t max_retries = g_conf().get_val<uint64_t>("mds_max_retries_on_remount_failure");

  errno = 0;
  int r = remount_cb(callback_handle);
  if (r == 0) {
    retries_on_invalidate = 0;
  } else {
    int e = errno;
    client_t whoami = get_nodeid();
    if (r == -1) {
      lderr(cct) <<
          "failed to remount (to trim kernel dentries): "
          "errno = " << e << " (" << strerror(e) << ")" << dendl;
    } else {
      lderr(cct) <<
          "failed to remount (to trim kernel dentries): "
          "return code = " << r << dendl;
    }
    bool should_abort =
      (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
       cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
      !(retry_on_error && (++retries_on_invalidate < max_retries));
    if (should_abort && !unmounting) {
      lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
      ceph_abort();
    }
  }
  return r;
}

class C_Client_Remount : public Context  {
private:
  Client *client;
public:
  explicit C_Client_Remount(Client *c) : client(c) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    client->_do_remount(true);
  }
};

void Client::_invalidate_kernel_dcache()
{
  if (unmounting)
    return;
  if (can_invalidate_dentries) {
    if (dentry_invalidate_cb && root->dir) {
      for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
	   p != root->dir->dentries.end();
	   ++p) {
	if (p->second->inode)
	  _schedule_invalidate_dentry_callback(p->second, false);
      }
    }
  } else if (remount_cb) {
    // Hacky:
    // when remounting a file system, linux kernel trims all unused dentries in the fs
    remount_finisher.queue(new C_Client_Remount(this));
  }
}
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;
      ceph_assert(!dn->inode);
      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
void Client::trim_caps(MetaSession *s, uint64_t max)
{
  mds_rank_t mds = s->mds_num;
  size_t caps_size = s->caps.size();
  ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
		 << " caps " << caps_size << dendl;

  uint64_t trimmed = 0;
  auto p = s->caps.begin();
  std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
                               * looking at from getting deleted during traversal. */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    InodeRef in(&cap->inode);

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
	ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
	cap = (remove_cap(cap, true), nullptr);
	trimmed++;
      }
    } else {
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      _trim_negative_child_dentries(in);
      bool all = true;
      auto q = in->dentries.begin();
      while (q != in->dentries.end()) {
	Dentry *dn = *q;
	++q;
	if (dn->lru_is_expireable()) {
	  if (can_invalidate_dentries &&
	      dn->dir->parent_inode->ino == MDS_INO_ROOT) {
	    // Only issue one of these per DN for inodes in root: handle
	    // others more efficiently by calling for root-child DNs at
	    // the end of this function.
	    _schedule_invalidate_dentry_callback(dn, true);
	  }
	  ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
	  to_trim.insert(dn);
	} else {
	  ldout(cct, 20) << "  not expirable: " << dn->name << dendl;
	  all = false;
	}
      }
      if (all && in->ino != MDS_INO_ROOT) {
	ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
	trimmed++;
      }
    }
  }
  ldout(cct, 20) << " trimming queued dentries: " << dendl;
  for (const auto &dn : to_trim) {
    trim_dentry(dn);
  }
  to_trim.clear();

  caps_size = s->caps.size();
  if (caps_size > (size_t)max)
    _invalidate_kernel_dcache();
}
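// Commentary (added): trimming is two-phase on purpose. Unlinking a
// dentry while walking s->caps can delete caps and invalidate the
// iterator mid-traversal, so expireable dentries are first collected
// into 'to_trim' and only trimmed once the cap walk has finished.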
void Client::force_session_readonly(MetaSession *s)
{
  s->readonly = true;
  for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
    auto &in = (*p)->inode;
    if (in.caps_wanted() & CEPH_CAP_FILE_WR)
      signal_cond_list(in.waitfor_caps);
  }
}
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  ceph_assert(flushing);

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  if (!in->flushing_caps) {
    ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  in->flushing_caps |= flushing;
  in->mark_caps_clean();

  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
{
  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    if (capsnap.flush_tid > 0) {
      old_s->flushing_caps_tids.erase(capsnap.flush_tid);
      new_s->flushing_caps_tids.insert(capsnap.flush_tid);
    }
  }
  for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
       it != in->flushing_cap_tids.end();
       ++it) {
    old_s->flushing_caps_tids.erase(it->first);
    new_s->flushing_caps_tids.insert(it->first);
  }
  new_s->flushing_caps.push_back(&in->flushing_cap_item);
}
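// Commentary (added): the pending flush tids have to follow the inode
// to the new session because handle_cap_flush_ack() matches acks
// against session->flushing_caps_tids; after an auth-cap migration the
// acks arrive on the new session, so the bookkeeping must move too.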
/*
 * Flush all caps back to the MDS. Because the callers generally wait on the
 * result of this function (syncfs and umount cases), we set
 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
 */
void Client::flush_caps_sync()
{
  ldout(cct, 10) << __func__ << dendl;
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    delayed_list.pop_front();
    if (p.end() && dirty_list.empty())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }

  // other caps, too
  p = dirty_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    if (p.end())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }
}
void Client::flush_caps(Inode *in, MetaSession *session, bool sync)
{
  ldout(cct, 10) << __func__ << " " << in << " mds." << session->mds_num << dendl;
  Cap *cap = in->auth_cap;
  ceph_assert(cap->session == session);

  for (map<ceph_tid_t,int>::iterator p = in->flushing_cap_tids.begin();
       p != in->flushing_cap_tids.end();
       ++p) {
    bool req_sync = false;

    /* If this is a synchronous request, then flush the journal on last one */
    if (sync && (p->first == in->flushing_cap_tids.rbegin()->first))
      req_sync = true;

    send_cap(in, session, cap, req_sync,
	     (get_caps_used(in) | in->caps_dirty()),
	     in->caps_wanted(), (cap->issued | cap->implemented),
	     p->second, p->first);
  }
}
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  while (in->flushing_caps) {
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    ceph_assert(it != in->flushing_cap_tids.end());
    if (it->first > want)
      break;
    ldout(cct, 10) << __func__ << " on " << *in << " flushing "
		   << ccap_string(it->second) << " want " << want
		   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
void Client::wait_sync_caps(ceph_tid_t want)
{
 retry:
  ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
		 << num_flushing_caps << " total flushing)" << dendl;
  for (auto &p : mds_sessions) {
    MetaSession *s = &p.second;
    if (s->flushing_caps_tids.empty())
      continue;
    ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
    if (oldest_tid <= want) {
      ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
		     << " (want " << want << ")" << dendl;
      sync_cond.Wait(client_lock);
      goto retry;
    }
  }
}
void Client::kick_flushing_caps(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    if (session->early_flushing_caps.count(in))
      continue;
    ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }

  session->early_flushing_caps.clear();
}
void Client::early_kick_flushing_caps(MetaSession *session)
{
  session->early_flushing_caps.clear();

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    Cap *cap = in->auth_cap;
    ceph_assert(cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps)
      continue;

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
		   << " to mds." << session->mds_num << dendl;

    session->early_flushing_caps.insert(in);

    // send_reconnect() also will reset these sequence numbers. make sure
    // sequence numbers in cap flush message match later reconnect message.
    cap->seq = 0;
    cap->issue_seq = 0;
    cap->mseq = 0;
    cap->issued = cap->implemented;

    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }
}
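// Commentary (added): the early kick runs before the reconnect message
// is sent, so zeroing seq/issue_seq/mseq here keeps the cap flush in
// agreement with the freshly-reset sequence numbers the MDS will see in
// the subsequent reconnect.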
void SnapRealm::build_snap_context()
{
  set<snapid_t> snaps;
  snapid_t max_seq = seq;

  // start with prior_parents?
  for (unsigned i=0; i<prior_parent_snaps.size(); i++)
    snaps.insert(prior_parent_snaps[i]);

  // current parent's snaps
  if (pparent) {
    const SnapContext& psnapc = pparent->get_snap_context();
    for (unsigned i=0; i<psnapc.snaps.size(); i++)
      if (psnapc.snaps[i] >= parent_since)
	snaps.insert(psnapc.snaps[i]);
    if (psnapc.seq > max_seq)
      max_seq = psnapc.seq;
  }

  // my snaps
  for (unsigned i=0; i<my_snaps.size(); i++)
    snaps.insert(my_snaps[i]);

  // ok!
  cached_snap_context.seq = max_seq;
  cached_snap_context.snaps.resize(0);
  cached_snap_context.snaps.reserve(snaps.size());
  for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
    cached_snap_context.snaps.push_back(*p);
}
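// Illustrative example (added): with prior_parent_snaps={2}, parent
// snaps {3,5} where parent_since=3, and my_snaps={8}, the set collapses
// to {2,3,5,8}; reverse iteration then emits snaps=[8,5,3,2], i.e. the
// newest-first ordering a SnapContext is expected to carry.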
void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
{
  list<SnapRealm*> q;
  q.push_back(realm);

  while (!q.empty()) {
    realm = q.front();
    q.pop_front();

    ldout(cct, 10) << __func__ << " " << *realm << dendl;
    realm->invalidate_cache();

    for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
	 p != realm->pchildren.end();
	 ++p)
      q.push_back(*p);
  }
}
*Client::get_snap_realm(inodeno_t r
)
4557 SnapRealm
*realm
= snap_realms
[r
];
4559 snap_realms
[r
] = realm
= new SnapRealm(r
);
4560 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
4565 SnapRealm
*Client::get_snap_realm_maybe(inodeno_t r
)
4567 if (snap_realms
.count(r
) == 0) {
4568 ldout(cct
, 20) << __func__
<< " " << r
<< " fail" << dendl
;
4571 SnapRealm
*realm
= snap_realms
[r
];
4572 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
void Client::put_snap_realm(SnapRealm *realm)
{
  ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
		 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
  if (--realm->nref == 0) {
    snap_realms.erase(realm->ino);
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    delete realm;
  }
}
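// Commentary (added): snap realm lifetime is plain manual refcounting;
// every get_snap_realm()/get_snap_realm_maybe() must be paired with
// exactly one put_snap_realm(), and dropping the last ref also releases
// the ref held on the parent realm (recursively, via the call above).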
bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
{
  if (realm->parent != parent) {
    ldout(cct, 10) << __func__ << " " << *realm
		   << " " << realm->parent << " -> " << parent << dendl;
    realm->parent = parent;
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    realm->pparent = get_snap_realm(parent);
    realm->pparent->pchildren.insert(realm);
    return true;
  }
  return false;
}
static bool has_new_snaps(const SnapContext& old_snapc,
			  const SnapContext& new_snapc)
{
  return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
}
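// Commentary (added): snaps[] in a SnapContext is sorted newest-first,
// so comparing new_snapc.snaps[0] against old_snapc.seq is sufficient
// to detect that a snapshot was created since the old context was
// captured.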
void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
{
  SnapRealm *first_realm = NULL;
  ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;

  map<SnapRealm*, SnapContext> dirty_realms;

  auto p = bl.cbegin();
  while (!p.end()) {
    SnapRealmInfo info;
    decode(info, p);
    SnapRealm *realm = get_snap_realm(info.ino());

    bool invalidate = false;

    if (info.seq() > realm->seq) {
      ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
		     << dendl;

      if (flush) {
	// writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
	//  flush me + children
	list<SnapRealm*> q;
	q.push_back(realm);
	while (!q.empty()) {
	  SnapRealm *realm = q.front();
	  q.pop_front();

	  for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
	       p != realm->pchildren.end();
	       ++p)
	    q.push_back(*p);

	  if (dirty_realms.count(realm) == 0) {
	    realm->nref++;
	    dirty_realms[realm] = realm->get_snap_context();
	  }
	}
      }

      // update
      realm->seq = info.seq();
      realm->created = info.created();
      realm->parent_since = info.parent_since();
      realm->prior_parent_snaps = info.prior_parent_snaps;
      realm->my_snaps = info.my_snaps;
      invalidate = true;
    }

    // _always_ verify parent
    if (adjust_realm_parent(realm, info.parent()))
      invalidate = true;

    if (invalidate) {
      invalidate_snaprealm_and_children(realm);
      ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
      ldout(cct, 15) << "  snapc " << realm->get_snap_context() << dendl;
    } else {
      ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
		     << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
    }

    if (!first_realm)
      first_realm = realm;
    else
      put_snap_realm(realm);
  }

  for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
       q != dirty_realms.end();
       ++q) {
    SnapRealm *realm = q->first;
    // if there are new snaps ?
    if (has_new_snaps(q->second, realm->get_snap_context())) {
      ldout(cct, 10) << " flushing caps on " << *realm << dendl;
      xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
      while (!r.end()) {
	Inode *in = *r;
	++r;
	queue_cap_snap(in, q->second);
      }
    } else {
      ldout(cct, 10) << " no new snap on " << *realm << dendl;
    }
    put_snap_realm(realm);
  }

  if (realm_ret)
    *realm_ret = first_realm;
  else
    put_snap_realm(first_realm);
}
void Client::handle_snap(const MConstRef<MClientSnap>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session);

  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    ceph_assert(m->head.split);
    SnapRealmInfo info;
    auto p = m->bl.cbegin();
    decode(info, p);
    ceph_assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (auto& ino : m->split_inos) {
      vinodeno_t vino(ino, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
	Inode *in = inode_map[vino];
	if (!in->snaprealm || in->snaprealm == realm)
	  continue;
	if (in->snaprealm->created > info.created()) {
	  ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
			 << *in->snaprealm << dendl;
	  continue;
	}
	ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;

	in->snaprealm_item.remove_myself();
	to_move[in] = in->snaprealm->get_snap_context();
	put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (auto& child_realm : m->split_realms) {
      ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(child_realm);
      if (!child)
	continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
	queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }
}
void Client::handle_quota(const MConstRef<MClientQuota>& m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session);

  ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;

  vinodeno_t vino(m->ino, CEPH_NOSNAP);
  if (inode_map.count(vino)) {
    Inode *in = inode_map[vino];
    if (in) {
      in->quota = m->quota;
      in->rstat = m->rstat;
    }
  }
}
void Client::handle_caps(const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session);

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (auto it = inode_map.find(vino); it != inode_map.end()) {
    in = it->second;
  } else {
    if (m->get_op() == CEPH_CAP_OP_IMPORT) {
      ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
      session->enqueue_cap_release(
	m->get_ino(),
	m->get_cap_id(),
	m->get_seq(),
	m->get_mseq(),
	cap_epoch_barrier);
    } else {
      ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
    }

    // in case the mds is waiting on e.g. a revocation
    flush_cap_releases();
    return;
  }

  switch (m->get_op()) {
    case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
    case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
    case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
  }

  if (auto it = in->caps.find(mds); it != in->caps.end()) {
    Cap &cap = in->caps.at(mds);

    switch (m->get_op()) {
      case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
      case CEPH_CAP_OP_IMPORT:
      case CEPH_CAP_OP_REVOKE:
      case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
      case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
    }
  } else {
    ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
    return;
  }
}
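// Commentary (added): EXPORT, FLUSHSNAP_ACK and IMPORT are dispatched
// before the per-mds cap lookup because they can legitimately arrive
// while we hold no cap from this mds (IMPORT is what installs one);
// the remaining ops require an existing Cap and are dropped otherwise.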
void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " IMPORT from mds." << mds << dendl;

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
    cap = &it->second;
    cap_perms = cap->latest_perms;
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  add_update_cap(in, session, m->get_cap_id(),
		 m->get_caps(), m->get_wanted(), m->get_seq(), m->get_mseq(),
		 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);

  if (cap && cap->cap_id == m->peer.cap_id) {
    remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session->mds_num == mds) {
    // reflush any/all caps (if we are now the auth_cap)
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }
}
void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
		<< " EXPORT from mds." << mds << dendl;

  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    if (cap.cap_id == m->get_cap_id()) {
      if (m->peer.cap_id) {
	const auto peer_mds = mds_rank_t(m->peer.mds);
	MetaSession *tsession = _get_or_open_mds_session(peer_mds);
	auto it = in->caps.find(peer_mds);
	if (it != in->caps.end()) {
	  Cap &tcap = it->second;
	  if (tcap.cap_id == m->peer.cap_id &&
	      ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
	    tcap.cap_id = m->peer.cap_id;
	    tcap.seq = m->peer.seq - 1;
	    tcap.issue_seq = tcap.seq;
	    tcap.issued |= cap.issued;
	    tcap.implemented |= cap.issued;
	    if (&cap == in->auth_cap)
	      in->auth_cap = &tcap;
	    if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
	      adjust_session_flushing_caps(in, session, tsession);
	  }
	} else {
	  add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
			 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
			 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
			 cap.latest_perms);
	}
      } else {
	if (cap.wanted | cap.issued)
	  in->flags |= I_CAP_DROPPED;
      }

      remove_cap(&cap, false);
    }
  }
}
void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));

  ldout(cct, 10) << __func__ << " on ino " << *in
		 << " size " << in->size << " -> " << m->get_size()
		 << dendl;

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  update_inode_file_size(in, issued, m->get_size(),
			 m->get_truncate_seq(), m->get_truncate_size());
}
void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  int dirty = m->get_dirty();
  int cleaned = 0;
  int flushed = 0;

  auto it = in->flushing_cap_tids.begin();
  if (it->first < flush_ack_tid) {
    ldout(cct, 0) << __func__ << " mds." << session->mds_num
		  << " got unexpected flush ack tid " << flush_ack_tid
		  << " expected is " << it->first << dendl;
  }
  for (; it != in->flushing_cap_tids.end(); ) {
    if (it->first == flush_ack_tid)
      cleaned = it->second;
    if (it->first <= flush_ack_tid) {
      session->flushing_caps_tids.erase(it->first);
      in->flushing_cap_tids.erase(it++);
      ++flushed;
      continue;
    }
    cleaned &= ~it->second;
    if (!cleaned)
      break;
    ++it;
  }

  ldout(cct, 5) << __func__ << " mds." << session->mds_num
		<< " cleaned " << ccap_string(cleaned) << " on " << *in
		<< " with " << ccap_string(dirty) << dendl;

  if (flushed) {
    signal_cond_list(in->waitfor_caps);
    if (session->flushing_caps_tids.empty() ||
	*session->flushing_caps_tids.begin() > flush_ack_tid)
      sync_cond.Signal();
  }

  if (!dirty) {
    in->cap_dirtier_uid = -1;
    in->cap_dirtier_gid = -1;
  }

  if (!cleaned) {
    ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
  } else {
    if (in->flushing_caps) {
      ldout(cct, 5) << "  flushing_caps " << ccap_string(in->flushing_caps)
		    << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
      in->flushing_caps &= ~cleaned;
      if (in->flushing_caps == 0) {
	ldout(cct, 10) << " " << *in << " !flushing" << dendl;
	num_flushing_caps--;
	if (in->cap_snaps.empty())
	  in->flushing_cap_item.remove_myself();
      }
      if (!in->caps_dirty())
	put_inode(in);
    }
  }
}
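// Commentary (added): flush acks are expected per inode in tid order;
// every tid at or below flush_ack_tid is retired in one pass, and
// sync_cond is only signalled once the session has no older tid left
// outstanding, which is the condition wait_sync_caps() polls for.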
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));
  snapid_t follows = m->get_snap_follows();

  if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
    auto& capsnap = it->second;
    if (m->get_client_tid() != capsnap.flush_tid) {
      ldout(cct, 10) << " tid " << m->get_client_tid() << " != " << capsnap.flush_tid << dendl;
    } else {
      InodeRef tmp_ref;
      ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
		    << " on " << *in << dendl;
      if (in->get_num_ref() == 1)
	tmp_ref = in; // make sure inode not get freed while erasing item from in->cap_snaps
      if (in->flushing_caps == 0 && in->cap_snaps.empty())
	in->flushing_cap_item.remove_myself();
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->cap_snaps.erase(it);
    }
  } else {
    ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
		  << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }
}
class C_Client_DentryInvalidate : public Context  {
private:
  Client *client;
  vinodeno_t dirino;
  vinodeno_t ino;
  string name;
public:
  C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
    client(c), name(dn->name) {
      if (client->use_faked_inos()) {
	dirino.ino = dn->dir->parent_inode->faked_ino;
	if (del)
	  ino.ino = dn->inode->faked_ino;
      } else {
	dirino = dn->dir->parent_inode->vino();
	if (del)
	  ino = dn->inode->vino();
      }
      if (!del)
	ino.ino = inodeno_t();
  }
  void finish(int r) override {
    // _async_dentry_invalidate is responsible for its own locking
    ceph_assert(!client->client_lock.is_locked_by_me());
    client->_async_dentry_invalidate(dirino, ino, name);
  }
};
void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
{
  if (unmounting)
    return;
  ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
		 << " in dir " << dirino << dendl;
  dentry_invalidate_cb(callback_handle, dirino, ino, name);
}

void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
{
  if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
    async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
}
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;
    }
  }

  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0 && in->ll_ref > 0 && sched_inval) {
    auto q = in->dentries.begin();
    while (q != in->dentries.end()) {
      Dentry *dn = *q;
      ++q;
      // FIXME: we play lots of unlink/link tricks when handling MDS replies,
      //        so in->dentries doesn't always reflect the state of kernel's dcache.
      _schedule_invalidate_dentry_callback(dn, true);
      unlink(dn, true, true);
    }
  }
}
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();

  const unsigned new_caps = m->get_caps();
  const bool was_stale = session->cap_gen > cap->gen;
  ldout(cct, 5) << __func__ << " on in " << m->get_ino()
		<< " mds." << mds << " seq " << m->get_seq()
		<< " caps now " << ccap_string(new_caps)
		<< " was " << ccap_string(cap->issued)
		<< (was_stale ? " (stale)" : "") << dendl;

  if (was_stale)
    cap->issued = cap->implemented = CEPH_CAP_PIN;
  cap->seq = m->get_seq();
  cap->gen = session->cap_gen;

  check_cap_issue(in, new_caps);

  // update inode
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();

  if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((new_caps & CEPH_CAP_LINK_SHARED) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = m->head.nlink;
    if (in->nlink == 0 &&
	(new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
      deleted_inode = true;
  }
  if (!(issued & CEPH_CAP_XATTR_EXCL) &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    auto p = m->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }

  if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
    in->dirstat.nfiles = m->get_nfiles();
    in->dirstat.nsubdirs = m->get_nsubdirs();
  }

  if (new_caps & CEPH_CAP_ANY_RD) {
    update_inode_file_time(in, issued, m->get_time_warp_seq(),
			   m->get_ctime(), m->get_mtime(), m->get_atime());
  }

  if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
    in->layout = m->get_layout();
    update_inode_file_size(in, issued, m->get_size(),
			   m->get_truncate_seq(), m->get_truncate_size());
  }

  if (m->inline_version > in->inline_version) {
    in->inline_data = m->inline_data;
    in->inline_version = m->inline_version;
  }

  /* always take a newer change attr */
  if (m->get_change_attr() > in->change_attr)
    in->change_attr = m->get_change_attr();

  // max_size
  if (cap == in->auth_cap &&
      (new_caps & CEPH_CAP_ANY_FILE_WR) &&
      (m->get_max_size() != in->max_size)) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
      (wanted & ~(cap->wanted | new_caps))) {
    // If mds is importing cap, prior cap messages that update 'wanted'
    // may get dropped by mds (migrate seq mismatch).
    //
    // We don't send cap message to update 'wanted' if what we want are
    // already issued. If mds revokes caps, cap message that releases caps
    // also tells mds what we want. But if caps got revoked by mds forcedly
    // (session stale). We may haven't told mds what we want.
    check = true;
  }

  // update caps
  auto revoked = cap->issued & ~new_caps;
  if (revoked) {
    ldout(cct, 10) << "  revocation of " << ccap_string(revoked) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // recall delegations if we're losing caps necessary for them
    if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
      in->recall_deleg(false);
    else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
      in->recall_deleg(true);

    used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
    if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
	!_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
      if (_release(in))
	check = true;
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
    }
  } else if (cap->issued == new_caps) {
    ldout(cct, 10) << "  caps unchanged at " << ccap_string(cap->issued) << dendl;
  } else {
    ldout(cct, 10) << "  grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (const auto &p : in->caps) {
	if (&p.second == cap)
	  continue;
	if (p.second.implemented & ~p.second.issued & new_caps) {
	  check = true;
	  break;
	}
      }
    }
  }

  if (check)
    check_caps(in, 0);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);
}
int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
{
  if (perms.uid() == 0)
    return 0;

  if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
    int ret = _posix_acl_permission(in, perms, want);
    if (ret != -EAGAIN)
      return ret;
  }

  // check permissions before doing anything else
  if (!in->check_mode(perms, want))
    return -EACCES;
  return 0;
}
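// Commentary (added): _posix_acl_permission() returns -EAGAIN when the
// inode carries no POSIX ACL to consult, in which case we fall back to
// the classic mode-bit check in Inode::check_mode() above.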
int Client::xattr_permission(Inode *in, const char *name, unsigned want,
			     const UserPerm& perms)
{
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  r = 0;
  if (strncmp(name, "system.", 7) == 0) {
    if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
      r = -EPERM;
  } else {
    r = inode_permission(in, perms, want);
  }
out:
  ldout(cct, 5) << __func__ << " " << in << " = " << r <<  dendl;
  return r;
}

ostream& operator<<(ostream &out, const UserPerm& perm) {
  out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
  return out;
}
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  r = -EPERM;
  if (mask & CEPH_SETATTR_UID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
	(!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r <<  dendl;
  return r;
}
int Client::may_open(Inode *in, int flags, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  unsigned want = 0;

  if ((flags & O_ACCMODE) == O_WRONLY)
    want = MAY_WRITE;
  else if ((flags & O_ACCMODE) == O_RDWR)
    want = MAY_READ | MAY_WRITE;
  else if ((flags & O_ACCMODE) == O_RDONLY)
    want = MAY_READ;
  if (flags & O_TRUNC)
    want |= MAY_WRITE;

  int r = 0;
  switch (in->mode & S_IFMT) {
    case S_IFLNK:
      r = -ELOOP;
      goto out;
    case S_IFDIR:
      if (want & MAY_WRITE) {
	r = -EISDIR;
	goto out;
      }
      break;
  }

  r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  r = inode_permission(in, perms, want);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r <<  dendl;
  return r;
}
int Client::may_lookup(Inode *dir, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
  int r = _getattr_for_perm(dir, perms);
  if (r < 0)
    goto out;

  r = inode_permission(dir, perms, MAY_EXEC);
out:
  ldout(cct, 3) << __func__ << " " << dir << " = " << r <<  dendl;
  return r;
}

int Client::may_create(Inode *dir, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
  int r = _getattr_for_perm(dir, perms);
  if (r < 0)
    goto out;

  r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
out:
  ldout(cct, 3) << __func__ << " " << dir << " = " << r <<  dendl;
  return r;
}
int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
  int r = _getattr_for_perm(dir, perms);
  if (r < 0)
    goto out;

  r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
  if (r < 0)
    goto out;

  /* 'name == NULL' means rmsnap */
  if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
    InodeRef otherin;
    r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
    if (r < 0)
      goto out;
    if (dir->uid != perms.uid() && otherin->uid != perms.uid())
      r = -EPERM;
  }
out:
  ldout(cct, 3) << __func__ << " " << dir << " = " << r <<  dendl;
  return r;
}
int Client::may_hardlink(Inode *in, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (perms.uid() == 0 || perms.uid() == in->uid) {
    r = 0;
    goto out;
  }

  r = -EPERM;
  if (!S_ISREG(in->mode))
    goto out;

  if (in->mode & S_ISUID)
    goto out;

  if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
    goto out;

  r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r <<  dendl;
  return r;
}
int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
{
  int mask = CEPH_STAT_CAP_MODE;
  bool force = false;
  if (acl_type != NO_ACL) {
    mask |= CEPH_STAT_CAP_XATTR;
    force = in->xattr_version == 0;
  }
  return _getattr(in, mask, perms, force);
}

vinodeno_t Client::_get_vino(Inode *in)
{
  /* The caller must hold the client lock */
  return vinodeno_t(in->ino, in->snapid);
}
/**
 * Resolve an MDS spec to a list of MDS daemon GIDs.
 *
 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
 * It may be '*' in which case it matches all GIDs.
 *
 * If no error is returned, the `targets` vector will be populated with at least
 * one MDS.
 */
int Client::resolve_mds(
    const std::string &mds_spec,
    std::vector<mds_gid_t> *targets)
{
  ceph_assert(fsmap);
  ceph_assert(targets != nullptr);

  mds_role_t role;
  std::stringstream ss;
  int role_r = fsmap->parse_role(mds_spec, &role, ss);
  if (role_r == 0) {
    // We got a role, resolve it to a GID
    ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
		   << role << "'" << dendl;
    targets->push_back(
	fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
    return 0;
  }

  std::string strtol_err;
  long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
  if (strtol_err.empty()) {
    // It is a possible GID
    const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
    if (fsmap->gid_exists(mds_gid)) {
      ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
      targets->push_back(mds_gid);
    } else {
      lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
		 << dendl;
      return -ENOENT;
    }
  } else if (mds_spec == "*") {
    // It is a wildcard: use all MDSs
    const auto mds_info = fsmap->get_mds_info();

    if (mds_info.empty()) {
      lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
      return -ENOENT;
    }

    for (const auto i : mds_info) {
      targets->push_back(i.first);
    }
  } else {
    // It did not parse as an integer, it is not a wildcard, it must be a name
    const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
    if (mds_gid == 0) {
      lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
      lderr(cct) << "FSMap: " << *fsmap << dendl;
      return -ENOENT;
    } else {
      ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
		     << "' to GID " << mds_gid << dendl;
      targets->push_back(mds_gid);
    }
  }

  return 0;
}
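// Illustrative examples (added): resolution is attempted in the order
// shown above -- parse_role() first ("0" or "cephfs:0" as a rank),
// then a bare integer as a GID, then the "*" wildcard for every daemon
// in the FSMap, and finally a lookup by daemon name via
// find_mds_gid_by_name() (e.g. "a" for mds.a).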
/**
 * Authenticate with mon and establish global ID
 */
int Client::authenticate()
{
  ceph_assert(client_lock.is_locked_by_me());

  if (monclient->is_authenticated()) {
    return 0;
  }

  client_lock.Unlock();
  int r = monclient->authenticate(cct->_conf->client_mount_timeout);
  client_lock.Lock();
  if (r < 0) {
    return r;
  }

  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    client_lock.Unlock();
    r = cond.wait();
    client_lock.Lock();
  } while (r == -EAGAIN);

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap_user);
    ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap);
    ceph_assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
/**
 *
 * @mds_spec one of ID, rank, GID, "*"
 *
 */
int Client::mds_command(
    const std::string &mds_spec,
    const vector<string>& cmd,
    const bufferlist& inbl,
    bufferlist *outbl,
    string *outs,
    Context *onfinish)
{
  std::lock_guard lock(client_lock);

  if (!initialized)
    return -ENOTCONN;

  int r;
  r = authenticate();
  if (r < 0) {
    return r;
  }

  r = fetch_fsmap(false);
  if (r < 0) {
    return r;
  }

  // Look up MDS target(s) of the command
  std::vector<mds_gid_t> targets;
  r = resolve_mds(mds_spec, &targets);
  if (r < 0) {
    return r;
  }

  // If daemons are laggy, we won't send them commands. If all
  // are laggy then we fail.
  std::vector<mds_gid_t> non_laggy;
  for (const auto gid : targets) {
    const auto info = fsmap->get_info_gid(gid);
    if (!info.laggy()) {
      non_laggy.push_back(gid);
    }
  }
  if (non_laggy.size() == 0) {
    *outs = "All targeted MDS daemons are laggy";
    return -ENOENT;
  }

  if (metadata.empty()) {
    // We are called on an unmounted client, so metadata
    // won't be initialized yet.
    populate_metadata("");
  }

  // Send commands to targets
  C_GatherBuilder gather(cct, onfinish);
  for (const auto target_gid : non_laggy) {
    const auto info = fsmap->get_info_gid(target_gid);

    // Open a connection to the target MDS
    ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());

    // Generate MDSCommandOp state
    auto &op = command_table.start_command();

    op.on_finish = gather.new_sub();
    op.cmd = cmd;
    op.outbl = outbl;
    op.outs = outs;
    op.inbl = inbl;
    op.mds_gid = target_gid;
    op.con = conn;

    ldout(cct, 4) << __func__ << ": new command op to " << target_gid
		  << " tid=" << op.tid << cmd << dendl;

    // Construct and send MCommand
    auto m = op.get_message(monclient->get_fsid());
    conn->send_message2(std::move(m));
  }
  gather.activate();

  return 0;
}
void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
{
  ceph_tid_t const tid = m->get_tid();

  ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;

  if (!command_table.exists(tid)) {
    ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
    return;
  }

  auto &op = command_table.get_command(tid);
  if (op.outbl) {
    *op.outbl = m->get_data();
  }
  if (op.outs) {
    *op.outs = m->rs;
  }

  if (op.on_finish) {
    op.on_finish->complete(m->r);
  }

  command_table.erase(tid);
}
// -------------------
// MOUNT

int Client::subscribe_mdsmap(const std::string &fs_name)
{
  int r = authenticate();
  if (r < 0) {
    lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  std::string resolved_fs_name;
  if (fs_name.empty()) {
    resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
  } else {
    resolved_fs_name = fs_name;
  }

  std::string want = "mdsmap";
  if (!resolved_fs_name.empty()) {
    r = fetch_fsmap(true);
    if (r < 0)
      return r;
    fscid = fsmap_user->get_fs_cid(resolved_fs_name);
    if (fscid == FS_CLUSTER_ID_NONE) {
      return -ENOENT;
    }

    std::ostringstream oss;
    oss << want << "." << fscid;
    want = oss.str();
  }
  ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;

  monclient->sub_want(want, 0, 0);
  monclient->renew_subs();

  return 0;
}
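// Commentary (added): when an fs name is given we subscribe to
// "mdsmap.<fscid>" rather than the bare "mdsmap", so the monitors only
// send the MDSMap of that one filesystem; the fscid is resolved through
// the user-visible FSMap fetched just above.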
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds, const std::string &fs_name)
{
  std::lock_guard lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  unmounting = false;

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  tick(); // start tick

  if (require_mds) {
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait. MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  ceph_assert(root);
  _ll_get(root);

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
// UNMOUNT

void Client::_close_sessions()
{
  while (!mds_sessions.empty()) {
    // send session closes!
    for (auto &p : mds_sessions) {
      if (p.second.state != MetaSession::STATE_CLOSING) {
	_close_mds_session(&p.second);
      }
    }

    // wait for sessions to close
    ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
    mount_cond.Wait(client_lock);
  }
}

void Client::flush_mdlog_sync()
{
  if (mds_requests.empty())
    return;
  for (auto &p : mds_sessions) {
    flush_mdlog(&p.second);
  }
}

void Client::flush_mdlog(MetaSession *session)
{
  // Only send this to Luminous or newer MDS daemons, older daemons
  // will crash if they see an unknown CEPH_SESSION_* value in this msg.
  const uint64_t features = session->con->get_features();
  if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
    auto m = MClientSession::create(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
    session->con->send_message2(std::move(m));
  }
}
void Client::_abort_mds_sessions(int err)
{
  for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
    auto req = p->second;
    ++p;
    // unsafe requests will be removed during close session below.
    if (req->got_unsafe)
      continue;

    req->abort(err);
    if (req->caller_cond) {
      req->kick = true;
      req->caller_cond->Signal();
    }
  }

  // Process aborts on any requests that were on this waitlist.
  // Any requests that were on a waiting_for_open session waitlist
  // will get kicked during close session below.
  signal_cond_list(waiting_for_mdsmap);

  // Force-close all sessions
  while(!mds_sessions.empty()) {
    auto& session = mds_sessions.begin()->second;
    _closed_mds_session(&session);
  }
}
void Client::_unmount(bool abort)
{
  if (unmounting)
    return;

  if (abort || blacklisted) {
    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blacklisted)") << dendl;
  } else {
    ldout(cct, 2) << "unmounting" << dendl;
  }
  unmounting = true;

  deleg_timeout = 0;

  if (abort) {
    // Abort all mds sessions
    _abort_mds_sessions(-ENOTCONN);

    objecter->op_cancel_writes(-ENOTCONN);
  } else {
    // flush the mdlog for pending requests, if any
    flush_mdlog_sync();
  }

  while (!mds_requests.empty()) {
    ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
    mount_cond.Wait(client_lock);
  }

  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  while (unsafe_sync_write > 0) {
    ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
    mount_cond.Wait(client_lock);
  }

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    std::list<InodeRef> anchor;
    for (auto& p : inode_map) {
      Inode *in = p.second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
	ceph_assert(in);
      }

      // prevent inode from getting freed
      anchor.emplace_back(in);

      if (abort || blacklisted) {
	objectcacher->purge_set(&in->oset);
      } else if (!in->caps.empty()) {
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  if (abort || blacklisted) {
    for (auto p = dirty_list.begin(); !p.end(); ) {
      Inode *in = *p;
      ++p;
      if (in->dirty_caps) {
	ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
	in->mark_caps_clean();
	put_inode(in);
      }
    }
  } else {
    flush_caps_sync();
    wait_sync_caps(last_flush_tid);
  }

  // empty lru cache
  trim_cache();

  while (lru.lru_get_size() > 0 ||
	 !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
		  << "+" << inode_map.size() << " items"
		  << ", waiting (for caps to release?)"
		  << dendl;
    utime_t until = ceph_clock_now() + utime_t(5, 0);
    int r = mount_cond.WaitUntil(client_lock, until);
    if (r == ETIMEDOUT) {
      dump_cache(NULL);
    }
  }
  ceph_assert(lru.lru_get_size() == 0);
  ceph_assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  ldout(cct, 2) << "unmounted." << dendl;
}
void Client::unmount()
{
  std::lock_guard lock(client_lock);
  _unmount(false);
}

void Client::abort_conn()
{
  std::lock_guard lock(client_lock);
  _unmount(true);
}
void Client::flush_cap_releases()
{
  // send any cap releases
  for (auto &p : mds_sessions) {
    auto &session = p.second;
    if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
	  p.first)) {
      if (cct->_conf->client_inject_release_failure) {
	ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
      } else {
	session.con->send_message2(std::move(session.release));
      }
      session.release.reset();
    }
  }
}
void Client::tick()
{
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    sleep(cct->_conf->client_debug_inject_tick_delay);
    ceph_assert(0 == cct->_conf.set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf.apply_changes(nullptr);
  }

  ldout(cct, 21) << "tick" << dendl;
  tick_event = timer.add_event_after(
    cct->_conf->client_tick_interval,
    new FunctionContext([this](int) {
	// Called back via Timer, which takes client_lock for us
	ceph_assert(client_lock.is_locked_by_me());
	tick();
      }));

  utime_t now = ceph_clock_now();

  if (!mounted && !mds_requests.empty()) {
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (auto &p : mds_sessions) {
	signal_context_list(p.second.waiting_for_open);
      }
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}
void Client::renew_caps()
{
  ldout(cct, 10) << "renew_caps()" << dendl;
  last_cap_renew = ceph_clock_now();

  for (auto &p : mds_sessions) {
    ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
    if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
      renew_caps(&p.second);
  }
}

void Client::renew_caps(MetaSession *session)
{
  ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
  session->last_cap_renew_request = ceph_clock_now();
  uint64_t seq = ++session->cap_renew_seq;
  session->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
}
// ===============================================================
// high level (POSIXy) interface

int Client::_do_lookup(Inode *dir, const string& name, int mask,
		       InodeRef *target, const UserPerm& perms)
{
  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
    mask |= DEBUG_GETATTR_CAPS;
  req->head.args.getattr.mask = mask;

  ldout(cct, 10) << __func__ << " on " << path << dendl;

  int r = make_request(req, perms, target);
  ldout(cct, 10) << __func__ << " res is " << r << dendl;
  return r;
}
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (dname == "..") {
    if (dir->dentries.empty()) {
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
      filepath path(dir->ino);
      req->set_filepath(path);

      InodeRef tmptarget;
      int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());

      if (r == 0) {
	Inode *tempino = tmptarget.get();
	_ll_get(tempino);
	*target = tempino;
	ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
      } else {
	*target = dir;
      }
    }
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << __func__ << " have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
		   << " seq " << dn->lease_seq
		   << dendl;

    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession &s = mds_sessions.at(dn->lease_mds);
	if (s.cap_ttl > now &&
	    s.cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir shared caps?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	  goto hit_dn;
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    r = -ENOENT;
  }
 done:
  if (r < 0)
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
int Client::get_or_create(Inode *dir, const char* name,
			  Dentry **pdn, bool expect_null)
{
  // lookup
  ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
  dir->open_dir();
  if (dir->dir->dentries.count(name)) {
    Dentry *dn = dir->dir->dentries[name];

    // is dn lease valid?
    utime_t now = ceph_clock_now();
    if (dn->inode &&
	dn->lease_mds >= 0 &&
	dn->lease_ttl > now &&
	mds_sessions.count(dn->lease_mds)) {
      MetaSession &s = mds_sessions.at(dn->lease_mds);
      if (s.cap_ttl > now &&
	  s.cap_gen == dn->lease_gen) {
	if (expect_null)
	  return -EEXIST;
      }
    }
    *pdn = dn;
  } else {
    // otherwise link up a new one
    *pdn = link(dir->dir, name, NULL, NULL);
  }

  // success
  return 0;
}
int Client::path_walk(const filepath& origpath, InodeRef *end,
                      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  ceph_assert(cur);

  ldout(cct, 10) << __func__ << " " << path << dendl;

  int symlinks = 0;

  unsigned i = 0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << "  (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
        return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
        return -ELOOP;
      }

      if (i < path.depth() - 1) {
        // dir symlink
        // replace consumed components of path with symlink dir target
        filepath resolved(next->symlink.c_str());
        resolved.append(path.postfixpath(i + 1));
        path = resolved;
        i = 0;
        if (next->symlink[0] == '/') {
          cur = root;
        }
        continue;
      } else if (followsym) {
        if (next->symlink[0] == '/') {
          path = next->symlink.c_str();
          i = 0;
          // reset position
          cur = root;
        } else {
          filepath more(next->symlink.c_str());
          // we need to remove the symlink component from off of the path
          // before adding the target that the symlink points to.  remain
          // at the same position in the path.
          path.pop_dentry();
          path.append(more);
        }
        continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
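/* Illustrative walk-through of the symlink handling above (assumes a
 * symlink "b" -> "/target" under "/a"; not part of the call graph):
 * path_walk(filepath("/a/b/c"), ...) consumes "a", finds the symlink at
 * i == 1 < depth() - 1, and rebuilds the remaining path as
 * filepath("/target") + "c" before restarting from the root.  Only a
 * trailing symlink is left unresolved when followsym is false.
 */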
int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "link" << std::endl;
  tout(cct) << relexisting << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath existing(relexisting);

  InodeRef in, dir;
  int r = path_walk(existing, &in, perm, true);
  if (r < 0)
    return r;
  if (std::string(relpath) == "/") {
    r = -EEXIST;
    return r;
  }
  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();

  r = path_walk(path, &dir, perm, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    if (S_ISDIR(in->mode)) {
      r = -EPERM;
      return r;
    }
    r = may_hardlink(in.get(), perm);
    if (r < 0)
      return r;
    r = may_create(dir.get(), perm);
    if (r < 0)
      return r;
  }
  r = _link(in.get(), dir.get(), name.c_str(), perm);
  return r;
}
int Client::unlink(const char *relpath, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  if (std::string(relpath) == "/")
    return -EISDIR;

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;
  int r = path_walk(path, &dir, perm);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    r = may_delete(dir.get(), name.c_str(), perm);
    if (r < 0)
      return r;
  }
  return _unlink(dir.get(), name.c_str(), perm);
}
int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relfrom << std::endl;
  tout(cct) << relto << std::endl;

  if (unmounting)
    return -ENOTCONN;

  if (std::string(relfrom) == "/" || std::string(relto) == "/")
    return -EBUSY;

  filepath from(relfrom);
  filepath to(relto);
  string fromname = from.last_dentry();
  from.pop_dentry();
  string toname = to.last_dentry();
  to.pop_dentry();

  InodeRef fromdir, todir;
  int r = path_walk(from, &fromdir, perm);
  if (r < 0)
    goto out;
  r = path_walk(to, &todir, perm);
  if (r < 0)
    goto out;

  if (cct->_conf->client_permissions) {
    int r = may_delete(fromdir.get(), fromname.c_str(), perm);
    if (r < 0)
      return r;
    r = may_delete(todir.get(), toname.c_str(), perm);
    if (r < 0 && r != -ENOENT)
      return r;
  }
  r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
out:
  return r;
}
int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;
  ldout(cct, 10) << __func__ << ": " << relpath << dendl;

  if (unmounting)
    return -ENOTCONN;

  if (std::string(relpath) == "/")
    return -EEXIST;

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;
  int r = path_walk(path, &dir, perm);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    r = may_create(dir.get(), perm);
    if (r < 0)
      return r;
  }
  return _mkdir(dir.get(), name.c_str(), mode, perm);
}
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  cur = cwd;
  for (i = 0; i < path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
        break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  //check that we have work left to do
  if (i == path.depth()) return -EEXIST;
  if (r != -ENOENT) return r;
  ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i < path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
        return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    if (-EEXIST == r && i < path.depth() - 1) {
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << __func__ << ": successfully created directory "
                   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
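/* Usage sketch (illustrative only; assumes a mounted Client *client and
 * a UserPerm perms with suitable credentials):
 *
 *   int r = client->mkdirs("/a/b/c", 0755, perms);
 *   if (r < 0 && r != -EEXIST)
 *     std::cerr << "mkdirs: " << cpp_strerror(r) << std::endl;
 */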
int Client::rmdir(const char *relpath, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  if (std::string(relpath) == "/")
    return -EBUSY;

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;
  int r = path_walk(path, &dir, perms);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    int r = may_delete(dir.get(), name.c_str(), perms);
    if (r < 0)
      return r;
  }
  return _rmdir(dir.get(), name.c_str(), perms);
}
int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  if (unmounting)
    return -ENOTCONN;

  if (std::string(relpath) == "/")
    return -EEXIST;

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;
  int r = path_walk(path, &dir, perms);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    int r = may_create(dir.get(), perms);
    if (r < 0)
      return r;
  }
  return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
}
// symlinks

int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << target << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  if (std::string(relpath) == "/")
    return -EEXIST;

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;
  int r = path_walk(path, &dir, perms);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    int r = may_create(dir.get(), perms);
    if (r < 0)
      return r;
  }
  return _symlink(dir.get(), name.c_str(), target, perms);
}
int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;

  return _readlink(in.get(), buf, size);
}

int Client::_readlink(Inode *in, char *buf, size_t size)
{
  if (!in->is_symlink())
    return -EINVAL;

  // copy into buf (at most size bytes)
  int r = in->symlink.length();
  if (r > (int)size)
    r = size;
  memcpy(buf, in->symlink.c_str(), r);
  return r;
}
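/* Note: like POSIX readlink(2), _readlink returns the number of bytes
 * copied and does not NUL-terminate buf.  Callers wanting a C string
 * must terminate it themselves (illustrative sketch; client and perms
 * assumed):
 *
 *   char buf[PATH_MAX];
 *   int n = client->readlink("/some/link", buf, sizeof(buf) - 1, perms);
 *   if (n >= 0)
 *     buf[n] = '\0';
 */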
int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
{
  bool yes = in->caps_issued_mask(mask, true);

  ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
  if (yes && !force)
    return 0;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);
  req->head.args.getattr.mask = mask;

  int res = make_request(req, perms);
  ldout(cct, 10) << __func__ << " result=" << res << dendl;
  return res;
}
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
                        const UserPerm& perms, InodeRef *inp)
{
  int issued = in->caps_issued();

  ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if ((mask & CEPH_SETATTR_SIZE) &&
      (unsigned long)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
                              perms)) {
    return -EDQUOT;
  }

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
                   << " != cap dirtier " << in->cap_dirtier_uid << ":"
                   << in->cap_dirtier_gid << ", forcing sync setattr"
                   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure.
     */
    if (!mask)
      mask |= CEPH_SETATTR_CTIME;
    goto force_request;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    if (issued & CEPH_CAP_AUTH_EXCL)
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;

    if (mask & CEPH_SETATTR_UID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
      ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
    }
    if (mask & CEPH_SETATTR_GID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
      ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
    }

    if (mask & CEPH_SETATTR_MODE) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
      ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
    } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      /* Must squash any setuid/setgid bits with an ownership change */
      in->mode &= ~(S_ISUID|S_ISGID);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    }

    if (mask & CEPH_SETATTR_BTIME) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
      ldout(cct,10) << "changing btime to " << in->btime << dendl;
    }
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
  }

  if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
    if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
      if (mask & CEPH_SETATTR_MTIME)
        in->mtime = utime_t(stx->stx_mtime);
      if (mask & CEPH_SETATTR_ATIME)
        in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
    }
  }
  if (!mask) {
    in->change_attr++;
    return 0;
  }

force_request:
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  if (mask & CEPH_SETATTR_KILL_SGUID) {
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MODE) {
    req->head.args.setattr.mode = stx->stx_mode;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
  }
  if (mask & CEPH_SETATTR_UID) {
    req->head.args.setattr.uid = stx->stx_uid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
  }
  if (mask & CEPH_SETATTR_GID) {
    req->head.args.setattr.gid = stx->stx_gid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
  }
  if (mask & CEPH_SETATTR_BTIME) {
    req->head.args.setattr.btime = utime_t(stx->stx_btime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MTIME) {
    req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_ATIME) {
    req->head.args.setattr.atime = utime_t(stx->stx_atime);
    req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_SIZE) {
    if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
      req->head.args.setattr.size = stx->stx_size;
      ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    } else { //too big!
      put_request(req);
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -EFBIG;
    }
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  req->head.args.setattr.mask = mask;

  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
/* Note that we only care about attrs that setattr cares about */
void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
{
  stx->stx_size = st->st_size;
  stx->stx_mode = st->st_mode;
  stx->stx_uid = st->st_uid;
  stx->stx_gid = st->st_gid;
#ifdef __APPLE__
  stx->stx_mtime = st->st_mtimespec;
  stx->stx_atime = st->st_atimespec;
#else
  stx->stx_mtime = st->st_mtim;
  stx->stx_atime = st->st_atim;
#endif
}
int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
                       const UserPerm& perms, InodeRef *inp)
{
  int ret = _do_setattr(in, stx, mask, perms, inp);
  if (ret < 0)
    return ret;
  if (mask & CEPH_SETATTR_MODE)
    ret = _posix_acl_chmod(in, stx->stx_mode, perms);
  return ret;
}

int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
                      const UserPerm& perms)
{
  mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
           CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
           CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
           CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
  if (cct->_conf->client_permissions) {
    int r = may_setattr(in.get(), stx, mask, perms);
    if (r < 0)
      return r;
  }
  return __setattrx(in.get(), stx, mask, perms);
}

int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
                     const UserPerm& perms)
{
  struct ceph_statx stx;

  stat_to_statx(attr, &stx);
  mask &= ~CEPH_SETATTR_BTIME;

  if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
    mask &= ~CEPH_SETATTR_UID;
  }
  if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<gid_t>(-1)) {
    mask &= ~CEPH_SETATTR_GID;
  }

  return _setattrx(in, &stx, mask, perms);
}
int Client::setattr(const char *relpath, struct stat *attr, int mask,
                    const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mask << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  return _setattr(in, attr, mask, perms);
}

int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
                     const UserPerm& perms, int flags)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mask << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
  if (r < 0)
    return r;
  return _setattrx(in, stx, mask, perms);
}

int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mask << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  return _setattr(f->inode, attr, mask, perms);
}

int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mask << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  return _setattrx(f->inode, stx, mask, perms);
}
int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
                 frag_info_t *dirstat, int mask)
{
  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "stat" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, true, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
{
  unsigned mask = 0;

  /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
  if (flags & AT_NO_ATTR_SYNC)
    goto out;

  /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
  mask |= CEPH_CAP_PIN;
  if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
    mask |= CEPH_CAP_AUTH_SHARED;
  if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
    mask |= CEPH_CAP_LINK_SHARED;
  if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
    mask |= CEPH_CAP_FILE_SHARED;
  if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
    mask |= CEPH_CAP_XATTR_SHARED;

out:
  return mask;
}
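/* Example of the mapping above (illustrative only): with flags == 0 and
 * want == (CEPH_STATX_MODE | CEPH_STATX_SIZE), the result is
 * CEPH_CAP_PIN | CEPH_CAP_AUTH_SHARED | CEPH_CAP_FILE_SHARED; with
 * AT_NO_ATTR_SYNC set the result is 0, i.e. serve everything from cache.
 */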
int Client::statx(const char *relpath, struct ceph_statx *stx,
                  const UserPerm& perms,
                  unsigned int want, unsigned int flags)
{
  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " want " << want << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "statx" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);

  int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
  if (r < 0)
    return r;

  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }

  fill_statx(in, mask, stx);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
  return r;
}
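/* Usage sketch (illustrative only; client and perms assumed):
 *
 *   struct ceph_statx stx;
 *   int r = client->statx("/some/file", &stx, perms,
 *                         CEPH_STATX_SIZE | CEPH_STATX_MTIME, 0);
 *   if (r == 0 && (stx.stx_mask & CEPH_STATX_SIZE))
 *     std::cout << "size=" << stx.stx_size << std::endl;
 *
 * stx_mask reports which fields were actually filled, so callers should
 * check it before trusting a requested field.
 */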
int Client::lstat(const char *relpath, struct stat *stbuf,
                  const UserPerm& perms, frag_info_t *dirstat, int mask)
{
  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
                 << " mode 0" << oct << in->mode << dec
                 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    switch (in->nlink) {
      case 0:
        st->st_nlink = 0; /* dir is unlinked */
        break;
      case 1:
        st->st_nlink = 1 /* parent dentry */
                       + 1 /* <dir>/. */
                       + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
        break;
      default:
        ceph_abort();
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    st->st_blocks = (in->size + 511) >> 9;
  }
  st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
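/* Worked example for the size fields above (illustrative only): a
 * 1000-byte file with a 4 MiB stripe_unit yields st_size = 1000,
 * st_blocks = (1000 + 511) >> 9 = 2 (512-byte units), and
 * st_blksize = max(4194304, 4096) = 4194304.
 */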
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
                 << " mode 0" << oct << in->mode << dec
                 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  if (mask & CEPH_CAP_LINK_SHARED) {
    if (in->is_dir()) {
      switch (in->nlink) {
        case 0:
          stx->stx_nlink = 0; /* dir is unlinked */
          break;
        case 1:
          stx->stx_nlink = 1 /* parent dentry */
                           + 1 /* <dir>/. */
                           + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
          break;
        default:
          ceph_abort();
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      if (cct->_conf->client_dirsize_rbytes)
        stx->stx_size = in->rstat.rbytes;
      else
        stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
                      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }
}

void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
}

int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
}

int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
}

int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
                  const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
}

int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  int mask = 0;
  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
  return _setattr(f->inode, &attr, mask, perms);
}

int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
                   const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  int mask = 0;
  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
  return _setattr(in, &attr, mask, perms);
}
static void attr_set_atime_and_mtime(struct stat *attr,
                                     const utime_t &atime,
                                     const utime_t &mtime)
{
  stat_set_atime_sec(attr, atime.tv.tv_sec);
  stat_set_atime_nsec(attr, atime.tv.tv_nsec);
  stat_set_mtime_sec(attr, mtime.tv.tv_sec);
  stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
}

// for [l]utime() invoke the timeval variant as the timespec
// variants are not yet implemented. for futime[s](), invoke
// the timespec variant.
int Client::utime(const char *relpath, struct utimbuf *buf,
                  const UserPerm& perms)
{
  struct timeval tv[2];
  tv[0].tv_sec  = buf->actime;
  tv[0].tv_usec = 0;
  tv[1].tv_sec  = buf->modtime;
  tv[1].tv_usec = 0;

  return utimes(relpath, tv, perms);
}

int Client::lutime(const char *relpath, struct utimbuf *buf,
                   const UserPerm& perms)
{
  struct timeval tv[2];
  tv[0].tv_sec  = buf->actime;
  tv[0].tv_usec = 0;
  tv[1].tv_sec  = buf->modtime;
  tv[1].tv_usec = 0;

  return lutimes(relpath, tv, perms);
}

int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
{
  struct timespec ts[2];
  ts[0].tv_sec  = buf->actime;
  ts[0].tv_nsec = 0;
  ts[1].tv_sec  = buf->modtime;
  ts[1].tv_nsec = 0;

  return futimens(fd, ts, perms);
}
int Client::utimes(const char *relpath, struct timeval times[2],
                   const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
            << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
            << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}

int Client::lutimes(const char *relpath, struct timeval times[2],
                    const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
            << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
            << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}

int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
{
  struct timespec ts[2];
  ts[0].tv_sec  = times[0].tv_sec;
  ts[0].tv_nsec = times[0].tv_usec * 1000;
  ts[1].tv_sec  = times[1].tv_sec;
  ts[1].tv_nsec = times[1].tv_usec * 1000;

  return futimens(fd, ts, perms);
}

int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
            << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
            << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
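/* Usage sketch (illustrative only; fd from Client::open, client and
 * perms assumed):
 *
 *   struct timespec ts[2];
 *   ts[0].tv_sec = 1577836800; ts[0].tv_nsec = 0;  // atime
 *   ts[1].tv_sec = 1577836800; ts[1].tv_nsec = 0;  // mtime
 *   int r = client->futimens(fd, ts, perms);
 */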
int Client::flock(int fd, int operation, uint64_t owner)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << operation << std::endl;
  tout(cct) << owner << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  return _flock(f, operation, owner);
}
int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    int r = may_open(in.get(), O_RDONLY, perms);
    if (r < 0)
      return r;
  }
  r = _opendir(in.get(), dirpp, perms);
  /* if ENOTDIR, dirpp will be an uninitialized pointer and it's very dangerous to access its value */
  if (r != -ENOTDIR)
    tout(cct) << (unsigned long)*dirpp << std::endl;
  return r;
}

int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
{
  if (!in->is_dir())
    return -ENOTDIR;
  *dirpp = new dir_result_t(in, perms);
  opened_dirs.insert(*dirpp);
  ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
  return 0;
}

int Client::closedir(dir_result_t *dir)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
  _closedir(dir);
  return 0;
}

void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
void Client::rewinddir(dir_result_t *dirp)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;

  if (unmounting)
    return;

  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  _readdir_drop_dirp_buffer(d);
  d->reset();
}

loff_t Client::telldir(dir_result_t *dirp)
{
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}

void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  if (offset == dirp->offset)
    return;

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    if (offset == 0 ||
        dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
        dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
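/* Note on the offset encoding used above: dir_result_t packs the
 * fragment into the high bits of the position (fpos_high) and the
 * in-fragment offset into the low bits (fpos_low), which is why a seek
 * that lands in a different fragment, or rewinds within one, drops the
 * buffered chunk.  For example (illustrative only),
 * dir_result_t::make_fpos(fg, 2, false) is the first real entry of
 * fragment fg, offsets 0 and 1 being "." and "..".
 */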
//struct dirent {
//  ino_t          d_ino;       /* inode number */
//  off_t          d_off;       /* offset to the next dirent */
//  unsigned short d_reclen;    /* length of this record */
//  unsigned char  d_type;      /* type of file */
//  char           d_name[256]; /* filename */
//};
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(__APPLE__) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  de->d_reclen = 1;
  de->d_type = IFTODT(type);
  ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
                 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);
  }
}

void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  ceph_assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}

void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << " " << dirp << dendl;
  dirp->buffer.clear();
}
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  ceph_assert(dirp);
  ceph_assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
                 << " offset " << hex << dirp->offset << dec << dendl;

  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  if (dirp->last_name.length()) {
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -EAGAIN) {
    ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
                   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}

struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
                              int caps, bool getref)
{
  ceph_assert(client_lock.is_locked());
  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
                 << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
                 << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
                                                  dir->readdir_cache.end(),
                                                  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    if (!dirp->inode->is_complete_and_ordered())
      return -EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int r = _getattr(dn->inode, caps, dirp->perms);
    if (r < 0)
      return r;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

    Inode *in = NULL;
    if (getref) {
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.Lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
                   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer matches cache index
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
                         unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
                 << dec << " at_end=" << dirp->at_end()
                 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dentries.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
                 << " snapid " << dirp->inode->snapid << " (complete && ordered) "
                 << dirp->inode->is_complete_and_ordered()
                 << " issued " << ccap_string(dirp->inode->caps_issued())
                 << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -EAGAIN)
      return err;
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
        return r;
      // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
                   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
                                    dirp->offset, dir_result_t::dentry_off_lt());
         it != dirp->buffer.end();
         ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
        r = _getattr(entry.inode, caps, dirp->perms);
        if (r < 0)
          return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
        inode = entry.inode.get();
        _ll_get(inode);
      }

      client_lock.Unlock();
      r = cb(p, &de, &stx, next_off, inode);  // _next_ offset
      client_lock.Lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
                     << " = " << r << dendl;
      if (r < 0)
        return r;

      dirp->offset = next_off;
      if (r > 0)
        return r;
    }

    if (dirp->next_offset > 2) {
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    if (diri->shared_gen == dirp->start_shared_gen &&
        diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
        ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
        if (diri->dir) {
          ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
          diri->dir->readdir_cache.resize(dirp->cache_index);
        }
        diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
        ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
        diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();
  return 0;
}
int Client::readdir_r(dir_result_t *d, struct dirent *de)
{
  return readdirplus_r(d, de, 0, 0, 0, NULL);
}

/*
 * readdirplus_r
 *
 * returns
 *  1 if we got a dirent
 *  0 for end of directory
 * <0 on error
 */

struct single_readdir {
  struct dirent *de;
  struct ceph_statx *stx;
  Inode *inode;
  bool full;
};

static int _readdir_single_dirent_cb(void *p, struct dirent *de,
                                     struct ceph_statx *stx, off_t off,
                                     Inode *in)
{
  single_readdir *c = static_cast<single_readdir *>(p);

  if (c->full)
    return -1;  // already filled this dirent

  *c->de = *de;
  if (c->stx)
    *c->stx = *stx;
  c->inode = in;
  c->full = true;
  return 1;
}

struct dirent *Client::readdir(dir_result_t *d)
{
  int ret;
  static struct dirent de;
  single_readdir sr;
  sr.de = &de;
  sr.stx = NULL;
  sr.inode = NULL;
  sr.full = false;

  // our callback fills the dirent and sets sr.full=true on first
  // call, and returns -1 the second time around.
  ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
  if (ret < -1) {
    errno = -ret;  // this sucks.
    return (dirent *) NULL;
  }
  if (sr.full) {
    return &de;
  }
  return (dirent *) NULL;
}

int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
                          struct ceph_statx *stx, unsigned want,
                          unsigned flags, Inode **out)
{
  single_readdir sr;
  sr.de = de;
  sr.stx = stx;
  sr.inode = NULL;
  sr.full = false;

  // our callback fills the dirent and sets sr.full=true on first
  // call, and returns -1 the second time around.
  int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
  if (r < -1)
    return r;

  // did we get one?
  if (sr.full) {
    if (out)
      *out = sr.inode;
    return 1;
  }
  return 0;
}
/* getdents */
struct getdents_result {
  char *buf;
  int buflen;
  int pos;
  bool fullent;
};

static int _readdir_getdent_cb(void *p, struct dirent *de,
                               struct ceph_statx *stx, off_t off, Inode *in)
{
  struct getdents_result *c = static_cast<getdents_result *>(p);

  int dlen;
  if (c->fullent)
    dlen = sizeof(*de);
  else
    dlen = strlen(de->d_name) + 1;

  if (c->pos + dlen > c->buflen)
    return -1;  // doesn't fit

  if (c->fullent) {
    memcpy(c->buf + c->pos, de, sizeof(*de));
  } else {
    memcpy(c->buf + c->pos, de->d_name, dlen);
  }
  c->pos += dlen;
  return 0;
}

int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
{
  getdents_result gr;
  gr.buf = buf;
  gr.buflen = buflen;
  gr.fullent = fullent;
  gr.pos = 0;

  int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);

  if (r < 0) { // some error
    if (r == -1) { // buffer ran out of space
      if (gr.pos) { // but we got some entries already!
        return gr.pos;
      } // or we need a larger buffer
      return -ERANGE;
    } else { // actual error, return it
      return r;
    }
  }
  return gr.pos;
}

/* getdir */
struct getdir_result {
  list<string> *contents;
  int num;
};

static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
{
  getdir_result *r = static_cast<getdir_result *>(p);

  r->contents->push_back(de->d_name);
  r->num++;
  return 0;
}

int Client::getdir(const char *relpath, list<string>& contents,
                   const UserPerm& perms)
{
  ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
  {
    std::lock_guard lock(client_lock);
    tout(cct) << "getdir" << std::endl;
    tout(cct) << relpath << std::endl;
  }

  dir_result_t *d;
  int r = opendir(relpath, &d, perms);
  if (r < 0)
    return r;

  getdir_result gr;
  gr.contents = &contents;
  gr.num = 0;
  r = readdir_r_cb(d, _getdir_cb, (void *)&gr);

  closedir(d);

  if (r < 0)
    return r;
  return gr.num;
}
/****** file i/o **********/
int Client::open(const char *relpath, int flags, const UserPerm& perms,
                 mode_t mode, int stripe_unit, int stripe_count,
                 int object_size, const char *data_pool)
{
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When O_PATH is specified, other flags than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer to the do_entry_open()
   * function in the kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  if (r == -ENOENT && (flags & O_CREAT)) {
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
                  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
        goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
                stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
        goto out;
    }
  }

  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate an integer file descriptor
    ceph_assert(fh);
    r = get_fd();
    ceph_assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}

int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
{
  /* Use default file striping parameters */
  return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
}
int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
                        const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
  filepath path(ino);
  req->set_filepath(path);

  uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
  char f[30];
  sprintf(f, "%u", h);
  filepath path2(dirino);
  path2.push_dentry(string(f));
  req->set_filepath2(path2);

  int r = make_request(req, perms, NULL, NULL,
                       rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
  return r;
}
/**
 * Load inode into local cache.
 *
 * If the inode pointer is non-NULL, also take a reference on
 * the resulting Inode object in the same operation, so that the caller
 * can safely assume the inode will still be there after return.
 */
int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  ldout(cct, 8) << __func__ << " enter(" << ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(ino);
  req->set_filepath(path);

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    vinodeno_t vino(ino, CEPH_NOSNAP);
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    ceph_assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);
  }
  ldout(cct, 8) << __func__ << " exit(" << ino << ") = " << r << dendl;
  return r;
}

int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  std::lock_guard lock(client_lock);
  return _lookup_ino(ino, perms, inode);
}

/**
 * Find the parent inode of `ino` and insert it into
 * our cache. Conditionally also set `parent` to a referenced
 * Inode* if caller provides non-NULL value.
 */
int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  if (!ino->dentries.empty()) {
    // if we exposed the parent here, we'd need to check permissions,
    // but right now we just rely on the MDS doing so in make_request
    ldout(cct, 8) << __func__ << " dentry already present" << dendl;
    return 0;
  }

  if (ino->is_root()) {
    *parent = NULL;
    ldout(cct, 8) << "ino is root, no parent" << dendl;
    return -EINVAL;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
  filepath path(ino->ino);
  req->set_filepath(path);

  InodeRef target;
  int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
  // Give caller a reference to the parent ino if they provided a pointer.
  if (parent != NULL) {
    if (r == 0) {
      *parent = target.get();
      _ll_get(*parent);
      ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
    } else {
      *parent = NULL;
    }
  }
  ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}

int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  std::lock_guard lock(client_lock);
  return _lookup_parent(ino, perms, parent);
}

/**
 * Populate the parent dentry for `ino`, provided it is
 * a child of `parent`.
 */
int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  ceph_assert(parent->is_dir());
  ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;

  if (unmounting)
    return -ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}

int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  return _lookup_name(ino, parent, perms);
}
Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
{
  ceph_assert(in);
  Fh *f = new Fh(in, flags, cmode, perms);

  ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    in->snap_cap_refs++;
    ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
                  << ccap_string(in->caps_issued()) << dendl;
  }

  const auto& conf = cct->_conf;
  f->readahead.set_trigger_requests(1);
  f->readahead.set_min_readahead_size(conf->client_readahead_min);
  uint64_t max_readahead = Readahead::NO_LIMIT;
  if (conf->client_readahead_max_bytes) {
    max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
  }
  if (conf->client_readahead_max_periods) {
    max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
  }
  f->readahead.set_max_readahead_size(max_readahead);
  vector<uint64_t> alignments;
  alignments.push_back(in->layout.get_period());
  alignments.push_back(in->layout.stripe_unit);
  f->readahead.set_alignments(alignments);

  return f;
}

int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;

  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    if (in->put_open_ref(f->mode)) {
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    ceph_assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
                  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}

void Client::_put_fh(Fh *f)
{
  int left = f->put();
  if (!left) {
    delete f;
  }
}
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);
  }

  /*
   * NFS expects that delegations will be broken on a conflicting open,
   * not just when there is actual conflicting access to the file. SMB leases
   * and oplocks also have similar semantics.
   *
   * Ensure that clients that have delegations enabled will wait on minimal
   * caps during open, just to ensure that other clients holding delegations
   * return theirs first.
   */
  if (deleg_timeout && result == 0) {
    int need = 0, have;

    if (cmode & CEPH_FILE_MODE_WR)
      need |= CEPH_CAP_FILE_WR;
    if (cmode & CEPH_FILE_MODE_RD)
      need |= CEPH_CAP_FILE_RD;

    result = get_caps(in, need, want, &have, -1);
    if (result < 0) {
      ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
                       " . Denying open: " <<
                       cpp_strerror(result) << dendl;
      in->put_open_ref(cmode);
    } else {
      put_cap_ref(in, need);
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    in->put_open_ref(cmode);
  }

  return result;
}
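/*
 * Illustrative sketch (not part of the original source): path-based opens
 * reach _open() via Client::open() roughly as
 *
 *   Fh *fh = NULL;
 *   int r = _open(in.get(), flags, mode, &fh, perms);  // hypothetical call site
 *
 * The cmode/want pair computed above is what translates the POSIX open
 * flags into the MDS capability wanted-set for this inode.
 */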
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
int Client::close(int fd)
{
  ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
  std::lock_guard lock(client_lock);
  tout(cct) << "close" << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
  int err = _release_fh(fh);
  fd_map.erase(fd);
  put_fd(fd);
  ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
  return err;
}
loff_t Client::lseek(int fd, loff_t offset, int whence)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "lseek" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  return _lseek(f, offset, whence);
}
loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
{
  Inode *in = f->inode.get();
  int r;
  loff_t pos = -1;

  switch (whence) {
  case SEEK_SET:
    pos = offset;
    break;

  case SEEK_CUR:
    pos = f->pos + offset;
    break;

  case SEEK_END:
    r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
    pos = in->size + offset;
    break;

  default:
    ceph_abort();
  }

  if (pos < 0) {
    return -EINVAL;
  } else {
    f->pos = pos;
  }

  ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
  return f->pos;
}
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << __func__ << " " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    Cond cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
    while (f->pos_locked || f->pos_waiters.front() != &cond)
      cond.Wait(client_lock);
    ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
    ceph_assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
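/*
 * Illustrative note (not part of the original source): pos_waiters is a
 * FIFO of Cond pointers, so file-position locks are granted in arrival
 * order. A waiter only proceeds once pos_locked is clear *and* its Cond
 * is at the front of the queue, which prevents a late arrival from
 * starving earlier ones.
 */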
void Client::unlock_fh_pos(Fh *f)
{
  ldout(cct, 10) << __func__ << " " << f << dendl;
  f->pos_locked = false;
}
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  encode(in->inline_version, inline_version_bl);

  ObjectOperation uninline_ops;
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
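/*
 * Illustrative note (not part of the original source): the two mutates
 * above form a guarded two-phase update. The first (create(false)) ensures
 * the backing RADOS object exists; the second only applies if the
 * "inline_version" xattr on the object is older than the inode's
 * inline_version (CEPH_OSD_CMPXATTR_OP_GT), so concurrent uninline
 * attempts from other clients cannot clobber newer data. A losing racer
 * sees -ECANCELED from the guard, which the read/write callers below
 * treat the same as success.
 */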
// blocking osd interface
int Client::read(int fd, char *buf, loff_t size, loff_t offset)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "read" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  bufferlist bl;
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _read(f, offset, size, &bl);
  ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
  if (r >= 0) {
    bl.copy(0, bl.length(), buf);
    r = bl.length();
  }
  return r;
}
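/*
 * Illustrative note (not part of the original source): the INT_MAX clamp
 * above exists because read()/write() report the transferred byte count
 * as a signed int. E.g. a caller passing size = 3 GiB would otherwise get
 * a count that cannot be represented; clamping to INT_MAX (2^31 - 1)
 * preserves the usual short-read/short-write contract and the caller
 * simply retries for the remainder.
 */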
int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
{
  if (iovcnt < 0)
    return -EINVAL;
  return _preadv_pwritev(fd, iov, iovcnt, offset, false);
}
int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  int want, have = 0;
  bool movepos = false;
  std::unique_ptr<C_SaferCond> onuninline;
  int64_t r = 0;
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();
  utime_t lat;
  utime_t start = ceph_clock_now();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  if (offset < 0) {
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  if (in->inline_version == 0) {
    r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      goto done;
    ceph_assert(in->inline_version > 0);
  }

retry:
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_CACHE;
  r = get_caps(in, CEPH_CAP_FILE_RD, want, &have, -1);
  if (r < 0)
    goto done;
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      uint32_t len = in->inline_data.length();
      uint64_t endoff = offset + size;
      if (endoff > in->size)
	endoff = in->size;

      if ((uint64_t)offset < len) {
	if (endoff <= len) {
	  bl->substr_of(in->inline_data, offset, endoff - offset);
	} else {
	  bl->substr_of(in->inline_data, offset, len - offset);
	  bl->append_zero(endoff - len);
	}
	r = endoff - offset;
      } else if ((uint64_t)offset < endoff) {
	bl->append_zero(endoff - offset);
	r = endoff - offset;
      } else {
	r = 0;
      }
      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      conf->client_oc &&
      (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    // we will populate the cache here
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
	goto done;

      // eof?  short read.
      if ((uint64_t)offset < in->size)
	goto retry;
    }
  }

success:
  ceph_assert(r >= 0);
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + r;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_read, lat);

done:
  // done!

  if (onuninline) {
    client_lock.Unlock();
    int ret = onuninline->wait();
    client_lock.Lock();
    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }
  if (have)
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  if (movepos)
    unlock_fh_pos(f);
  return r;
}
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}

Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}

void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
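/*
 * Illustrative note (not part of the original source): the readahead
 * completion pins CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE for the whole
 * life of the speculative read — the reference is taken in _read_async()
 * below when the readahead is issued and dropped here in finish() — so
 * cap revocation cannot race with an in-flight readahead.
 */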
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, &onfinish);
  if (r == 0) {
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.Unlock();
    r = onfinish.wait();
    client_lock.Lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
  }

  if (f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  Mutex flock("Client::_read_sync flock");
  while (left > 0) {
    C_SaferCond onfinish("Client::_read_sync flock");
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      &onfinish);
    client_lock.Unlock();
    int r = onfinish.wait();
    client_lock.Lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	auto z = buffer::ptr_node::create(some);
	z->zero();
	bl->push_back(std::move(z));
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return read;
      }

      *checkeof = true;
      return read;
    }
  }
  return read;
}
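/*
 * Worked example (not part of the original source): suppose in->size is
 * 100, pos is 40 and the OSD returns only 20 of the 60 requested bytes.
 * The short-read branch above zero-fills min(in->size - pos, left) bytes,
 * i.e. the gap up to the known EOF, because objects backing sparse file
 * ranges may simply not exist; holes read back as zeros rather than as
 * errors.
 */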
/*
 * we keep count of uncommitted sync writes on the inode, so that
 * fsync and the unmount path can wait for them to commit.
 */
void Client::_sync_write_commit(Inode *in)
{
  ceph_assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << __func__ << " unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    ldout(cct, 10) << __func__ << " -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.Signal();
  }
}
int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "write" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _write(fh, offset, size, buf, NULL, false);
  ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
  return r;
}
int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
{
  if (iovcnt < 0)
    return -EINVAL;
  return _preadv_pwritev(fd, iov, iovcnt, offset, true);
}
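/*
 * Illustrative usage sketch (not part of the original source):
 * scatter/gather writes land here from the libcephfs layer, e.g.
 *
 *   struct iovec iov[2] = {
 *     { hdr,  sizeof(hdr) },   // hypothetical buffers
 *     { body, body_len    },
 *   };
 *   client->pwritev(fd, iov, 2, offset);   // hypothetical call site
 *
 * _preadv_pwritev_locked() below flattens the iovecs into one bufferlist
 * on the write path, and splits a single bufferlist back across the
 * iovecs on the read path.
 */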
int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
				       unsigned iovcnt, int64_t offset, bool write,
				       bool clamp_to_int)
{
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }

  /*
   * Some of the API functions take 64-bit size values, but only return
   * 32-bit signed integers. Clamp the I/O sizes in those functions so that
   * we don't do I/Os larger than the values we can return.
   */
  if (clamp_to_int) {
    totallen = std::min(totallen, (loff_t)INT_MAX);
  }

  if (write) {
    int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int64_t r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    int bufoff = 0;
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      if (resid < iov[j].iov_len) {
	bl.copy(bufoff, resid, (char *)iov[j].iov_base);
	break;
      } else {
	bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
      }
      resid -= iov[j].iov_len;
      bufoff += iov[j].iov_len;
    }
    return r;
  }
}
int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
{
  std::lock_guard lock(client_lock);
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
  return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
}
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
		       const struct iovec *iov, int iovcnt)
{
  uint64_t fpos = 0;

  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  ceph_assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      int r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
	unlock_fh_pos(f);
	return r;
      }
    }
    offset = f->pos;
    fpos = offset + size;
    unlock_fh_pos(f);
  }

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
						   f->actor_perms)) {
    return -EDQUOT;
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    ceph_assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov) {
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
	bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int want, have;
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_BUFFER;
  int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
	in->inline_data.copy(endoff, len - endoff, bl);

      if (offset < len)
	in->inline_data.splice(offset, len - offset);
      else if (offset > len)
	in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc &&
      (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    C_SaferCond onfinish("Client::_write flock");
    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       &onfinish);
    client_lock.Unlock();
    onfinish.wait();
    client_lock.Lock();
    _sync_write_commit(in);
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  if (fpos) {
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int64_t)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (nullptr != onuninline) {
    client_lock.Unlock();
    int uninline_ret = onuninline->wait();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
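/*
 * Illustrative note (not part of the original source): _write picks one of
 * two data paths. With client_oc enabled and Fb (CEPH_CAP_FILE_BUFFER)
 * caps held, the write lands in the ObjectCacher and returns immediately;
 * O_SYNC/O_DSYNC then force a flush of just the written range. Otherwise
 * it falls back to a blocking write_trunc through the Filer, tracked via
 * unsafe_sync_write so that fsync and unmount can wait for commit.
 */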
int Client::_flush(Fh *f)
{
  Inode *in = f->inode.get();
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
                  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  return err;
}
int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
{
  struct ceph_statx stx;
  stx.stx_size = length;
  return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
}
int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << length << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  attr.st_size = length;
  return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
}
int Client::fsync(int fd, bool syncdataonly)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "fsync" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << syncdataonly << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  int r = _fsync(f, syncdataonly);
  if (r == 0) {
    // The IOs in this fsync were okay, but maybe something happened
    // in the background that we should be reporting?
    r = f->take_async_err();
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
                  << ") = 0, async_err = " << r << dendl;
  } else {
    // Assume that an error we encountered during fsync, even reported
    // synchronously, would also have applied the error to the Fh, and we
    // should clear it here to avoid returning the same error again on next
    // call.
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
                  << r << dendl;
    f->take_async_err();
  }
  return r;
}
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    flush_mdlog_sync();

    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.Unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.Lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
                     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
                  << cpp_strerror(-r) << dendl;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_fsync, lat);

  return r;
}
int Client::_fsync(Fh *f, bool syncdataonly)
{
  ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
  return _fsync(f->inode.get(), syncdataonly);
}
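/*
 * Illustrative note (not part of the original source): syncdataonly mirrors
 * the fdatasync(2)/fsync(2) split. _fsync(in, true) waits only for dirty
 * file data (ObjectCacher flush or unsafe sync writes); _fsync(in, false)
 * additionally flushes dirty caps (metadata) and waits on still-unsafe MDS
 * requests touching the inode, e.g.
 *
 *   _fsync(f, true);    // fdatasync-like: data only
 *   _fsync(f, false);   // fsync-like: data + metadata
 */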
int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  int r = _getattr(f->inode, mask, perms);
  if (r < 0)
    return r;
  fill_stat(f->inode, stbuf, NULL);
  ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
  return r;
}
int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
		   unsigned int want, unsigned int flags)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  unsigned mask = statx_to_mask(flags, want);

  int r = 0;
  if (mask && !f->inode->caps_issued_mask(mask, true)) {
    r = _getattr(f->inode, mask, perms);
    if (r < 0) {
      ldout(cct, 3) << "fstatx exit on error!" << dendl;
      return r;
    }
  }

  fill_statx(f->inode, mask, stx);
  ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
  return r;
}
// not written yet, but i want to link!
int Client::chdir(const char *relpath, std::string &new_cwd,
		  const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "chdir" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  if (cwd != in)
    cwd.swap(in);
  ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;

  _getcwd(new_cwd, perms);
  return 0;
}
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << __func__ << " " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked

    // A cwd or ancester is unlinked
    if (in->dentries.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();

    if (!dn) {
      // look it up
      ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over
      path = filepath();
      in = cwd.get();
      continue;
    }
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
void Client::getcwd(string& dir, const UserPerm& perms)
{
  std::lock_guard l(client_lock);
  if (!unmounting)
    _getcwd(dir, perms);
}
int Client::statfs(const char *path, struct statvfs *stbuf,
		   const UserPerm& perms)
{
  std::lock_guard l(client_lock);
  tout(cct) << __func__ << std::endl;
  unsigned long int total_files_on_fs;

  if (unmounting)
    return -ENOTCONN;

  ceph_statfs stats;
  C_SaferCond cond;

  const vector<int64_t> &data_pools = mdsmap->get_data_pools();
  if (data_pools.size() == 1) {
    objecter->get_fs_stats(stats, data_pools[0], &cond);
  } else {
    objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
  }

  client_lock.Unlock();
  int rval = cond.wait();
  client_lock.Lock();

  total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;

  if (rval < 0) {
    ldout(cct, 1) << "underlying call to statfs returned error: "
                  << cpp_strerror(rval)
                  << dendl;
    return rval;
  }

  memset(stbuf, 0, sizeof(*stbuf));

  /*
   * we're going to set a block size of 4MB so we can represent larger
   * FSes without overflowing. Additionally convert the space
   * measurements from KB to bytes while making them in terms of
   * blocks. We use 4MB only because it is big enough, and because it
   * actually *is* the (ceph) default block size.
   */
  const int CEPH_BLOCK_SHIFT = 22;
  stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_files = total_files_on_fs;
  stbuf->f_favail = -1;
  stbuf->f_fsid = -1;       // ??
  stbuf->f_flag = 0;        // ??
  stbuf->f_namemax = NAME_MAX;

  // Usually quota_root will == root_ancestor, but if the mount root has no
  // quota but we can see a parent of it that does have a quota, we'll
  // respect that one instead.
  ceph_assert(root != nullptr);
  Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);

  // get_quota_root should always give us something
  // because client quotas are always enabled
  ceph_assert(quota_root != nullptr);

  if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {

    // Skip the getattr if any sessions are stale, as we don't want to
    // block `df` if this client has e.g. been evicted, or if the MDS cluster
    // is unavailable.
    if (!_any_stale_sessions()) {
      int r = _getattr(quota_root, 0, perms, true);
      if (r != 0) {
        // Ignore return value: error getting latest inode metadata is not a good
        // reason to break "df".
        lderr(cct) << "Error in getattr on quota root 0x"
                   << std::hex << quota_root->ino << std::dec
                   << " statfs result may be outdated" << dendl;
      }
    }

    // Special case: if there is a size quota set on the Inode acting
    // as the root for this client mount, then report the quota status
    // as the filesystem statistics.
    const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
    const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
    // It is possible for a quota to be exceeded: arithmetic here must
    // handle case where used > total.
    const fsblkcnt_t free = total > used ? total - used : 0;

    stbuf->f_blocks = total;
    stbuf->f_bfree = free;
    stbuf->f_bavail = free;
  } else {
    // General case: report the cluster statistics returned from RADOS. Because
    // multiple pools may be used without one filesystem namespace via
    // layouts, this is the most correct thing we can do.
    stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  }

  return rval;
}
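/*
 * Worked example (not part of the original source): with CEPH_BLOCK_SHIFT
 * = 22 the reported block size is 4 MiB. RADOS returns space in KiB, so
 * shifting by (22 - 10) = 12 converts KiB into 4 MiB blocks, e.g.
 * stats.kb = 8388608 (8 GiB) gives f_blocks = 8388608 >> 12 = 2048.
 */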
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << __func__ << " ino " << in->ino
                 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
                 << " type " << fl->l_type << " owner " << owner
                 << " " << fl->l_start << "~" << fl->l_len << dendl;

  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      ceph_filelock filelock;
      auto p = bl.cbegin();
      decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	lock_state = in->fcntl_locks.get();
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	lock_state = in->flock_locks.get();
      } else {
	ceph_abort();
	return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      if (!removing) {
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	  lock_state = fh->fcntl_locks.get();
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	  lock_state = fh->flock_locks.get();
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
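/*
 * Illustrative example (not part of the original source): tagging the
 * owner's most significant bit marks the value as self-identifying, e.g.
 *
 *   uint64_t owner = 0x1234;    // caller-supplied lock owner
 *   owner |= (1ULL << 63);      // -> 0x8000000000001234
 *
 * An MDS that sees the high bit set compares lock ownership on 'owner'
 * alone instead of the legacy (owner, pid) pair.
 */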
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -EINVAL;
  }

  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
void Client::_encode_filelocks(Inode *in, bufferlist& bl)
{
  if (!in->fcntl_locks && !in->flock_locks)
    return;

  unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
  encode(nr_fcntl_locks, bl);
  if (nr_fcntl_locks) {
    auto &lock_state = in->fcntl_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
	p != lock_state->held_locks.end();
	++p)
      encode(p->second, bl);
  }

  unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
  encode(nr_flock_locks, bl);
  if (nr_flock_locks) {
    auto &lock_state = in->flock_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
	p != lock_state->held_locks.end();
	++p)
      encode(p->second, bl);
  }

  ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
                 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
}
void Client::_release_filelocks(Fh *fh)
{
  if (!fh->fcntl_locks && !fh->flock_locks)
    return;

  Inode *in = fh->inode.get();
  ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;

  list<pair<int, ceph_filelock> > to_release;

  if (fh->fcntl_locks) {
    auto &lock_state = fh->fcntl_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
	p != lock_state->held_locks.end();
	++p)
      to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
    lock_state.reset();
  }
  if (fh->flock_locks) {
    auto &lock_state = fh->flock_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
	p != lock_state->held_locks.end();
	++p)
      to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
    lock_state.reset();
  }

  if (to_release.empty())
    return;

  // mds has already released filelocks if session was closed.
  if (in->caps.empty())
    return;

  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_whence = SEEK_SET;
  fl.l_type = F_UNLCK;

  for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
       p != to_release.end();
       ++p) {
    fl.l_start = p->second.start;
    fl.l_len = p->second.length;
    fl.l_pid = p->second.pid;
    _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
		 p->second.owner, true);
  }
}
void Client::_update_lock_state(struct flock *fl, uint64_t owner,
				ceph_lock_state_t *lock_state)
{
  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else
    lock_cmd = CEPH_LOCK_UNLOCK;

  ceph_filelock filelock;
  filelock.start = fl->l_start;
  filelock.length = fl->l_len;
  filelock.client = 0;
  // see comment in _do_filelock()
  filelock.owner = owner | (1ULL << 63);
  filelock.pid = fl->l_pid;
  filelock.type = lock_cmd;

  if (filelock.type == CEPH_LOCK_UNLOCK) {
    list<ceph_filelock> activated_locks;
    lock_state->remove_lock(filelock, activated_locks);
  } else {
    bool r = lock_state->add_lock(filelock, false, false, NULL);
    ceph_assert(r);
  }
}
int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
{
  Inode *in = fh->inode.get();
  ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
  int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
  return ret;
}
int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
{
  Inode *in = fh->inode.get();
  ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
  int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
  ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
  return ret;
}
int Client::_flock(Fh *fh, int cmd, uint64_t owner)
{
  Inode *in = fh->inode.get();
  ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;

  int sleep = !(cmd & LOCK_NB);
  cmd &= ~LOCK_NB;

  int type;
  switch (cmd) {
  case LOCK_SH:
    type = F_RDLCK;
    break;
  case LOCK_EX:
    type = F_WRLCK;
    break;
  case LOCK_UN:
    type = F_UNLCK;
    break;
  default:
    return -EINVAL;
  }

  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_type = type;
  fl.l_whence = SEEK_SET;

  int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
  ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
  return ret;
}
int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
{
  /* Since the only thing this does is wrap a call to statfs, and
     statfs takes a lock, it doesn't seem we have a need to split it
     out. */
  return statfs(0, stbuf, perms);
}
void Client::ll_register_callbacks(struct client_callback_args *args)
{
  if (!args)
    return;
  std::lock_guard l(client_lock);
  ldout(cct, 10) << __func__ << " cb " << args->handle
                 << " invalidate_ino_cb " << args->ino_cb
                 << " invalidate_dentry_cb " << args->dentry_cb
                 << " switch_interrupt_cb " << args->switch_intr_cb
                 << " remount_cb " << args->remount_cb
                 << dendl;
  callback_handle = args->handle;
  if (args->ino_cb) {
    ino_invalidate_cb = args->ino_cb;
    async_ino_invalidator.start();
  }
  if (args->dentry_cb) {
    dentry_invalidate_cb = args->dentry_cb;
    async_dentry_invalidator.start();
  }
  if (args->switch_intr_cb) {
    switch_interrupt_cb = args->switch_intr_cb;
    interrupt_finisher.start();
  }
  if (args->remount_cb) {
    remount_cb = args->remount_cb;
    remount_finisher.start();
  }
  umask_cb = args->umask_cb;
}
int Client::test_dentry_handling(bool can_invalidate)
{
  int r = 0;

  can_invalidate_dentries = can_invalidate;

  if (can_invalidate_dentries) {
    ceph_assert(dentry_invalidate_cb);
    ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
  } else {
    ceph_assert(remount_cb);
    ldout(cct, 1) << "using remount_cb" << dendl;
    r = _do_remount(false);
  }

  return r;
}
int Client::_sync_fs()
{
  ldout(cct, 10) << __func__ << dendl;

  // flush file data
  std::unique_ptr<C_SaferCond> cond = nullptr;
  if (cct->_conf->client_oc) {
    cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
    objectcacher->flush_all(cond.get());
  }

  // flush caps
  flush_caps_sync();
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  if (nullptr != cond) {
    client_lock.Unlock();
    ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
    cond->wait();
    ldout(cct, 15) << __func__ << " flush finished" << dendl;
    client_lock.Lock();
  }

  return 0;
}
int Client::sync_fs()
{
  std::lock_guard l(client_lock);

  if (unmounting)
    return -ENOTCONN;

  return _sync_fs();
}
int64_t Client::drop_caches()
{
  std::lock_guard l(client_lock);
  return objectcacher->release_all();
}
int Client::_lazyio(Fh *fh, int enable)
{
  Inode *in = fh->inode.get();
  ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;

  if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
    return 0;

  int orig_mode = fh->mode;
  if (enable) {
    fh->mode |= CEPH_FILE_MODE_LAZY;
    in->get_open_ref(fh->mode);
    in->put_open_ref(orig_mode);
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {
    fh->mode &= ~CEPH_FILE_MODE_LAZY;
    in->get_open_ref(fh->mode);
    in->put_open_ref(orig_mode);
    check_caps(in, 0);
  }

  return 0;
}
int Client::lazyio(int fd, int enable)
{
  std::lock_guard l(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  return _lazyio(f, enable);
}
int Client::ll_lazyio(Fh *fh, int enable)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
  tout(cct) << __func__ << std::endl;

  return _lazyio(fh, enable);
}
int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
{
  std::lock_guard l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
                << ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  // for now
  _fsync(f, true);

  return 0;
}
int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
{
  std::lock_guard l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
                << ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  _fsync(f, true);
  if (_release(in))
    check_caps(in, 0);
  return 0;
}
// =============================

int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
{
  std::lock_guard l(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perm);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    r = may_create(in.get(), perm);
    if (r < 0)
      return r;
  }
  Inode *snapdir = open_snapdir(in.get());
  return _mkdir(snapdir, name, 0, perm);
}
int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
{
  std::lock_guard l(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    r = may_delete(in.get(), NULL, perms);
    if (r < 0)
      return r;
  }
  Inode *snapdir = open_snapdir(in.get());
  return _rmdir(snapdir, name, perms);
}
// =============================

int Client::get_caps_issued(int fd)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  return f->inode->caps_issued();
}
int Client::get_caps_issued(const char *path, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath p(path);
  InodeRef in;
  int r = path_walk(p, &in, perms, true);
  if (r < 0)
    return r;
  return in->caps_issued();
}
// =========================================

Inode *Client::open_snapdir(Inode *diri)
{
  Inode *in;
  vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
  if (!inode_map.count(vino)) {
    in = new Inode(this, vino, &diri->layout);

    in->ino = diri->ino;
    in->snapid = CEPH_SNAPDIR;
    in->mode = diri->mode;
    in->uid = diri->uid;
    in->gid = diri->gid;
    in->mtime = diri->mtime;
    in->ctime = diri->ctime;
    in->btime = diri->btime;
    in->size = diri->size;
    in->change_attr = diri->change_attr;

    in->dirfragtree.clear();
    in->snapdir_parent = diri;
    diri->flags |= I_SNAPDIR_OPEN;
    inode_map[vino] = in;
    if (use_faked_inos())
      _assign_faked_ino(in);
    ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
  } else {
    in = inode_map[vino];
    ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
  }
  return in;
}
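/*
 * Illustrative note (not part of the original source): the ".snap"
 * directory is synthesized client-side. Its vinodeno reuses the parent
 * directory's ino with snapid CEPH_SNAPDIR — e.g. for ino 0x10000000000
 * the snapdir is (0x10000000000, CEPH_SNAPDIR) — so opening it needs no
 * MDS round-trip and most attributes are simply mirrored from the live
 * directory inode.
 */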
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    if (strcmp(name, ".") && strcmp(name, "..")) {
      r = may_lookup(parent, perms);
      if (r < 0)
	return r;
    }
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  ceph_assert(in);
  fill_stat(in, attr);
  _ll_get(in.get());

 out:
  ldout(cct, 3) << __func__ << " " << vparent << " " << name
                << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  *out = in.get();
  return r;
}
int Client::ll_lookup_inode(
    struct inodeno_t ino,
    const UserPerm& perms,
    Inode **inode)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;

  // Num1: get inode and *inode
  int r = _lookup_ino(ino, perms, inode);
  if (r)
    return r;
  ceph_assert(inode != NULL);
  ceph_assert(*inode != NULL);

  // Num2: Request the parent inode, so that we can look up the name
  Inode *parent;
  r = _lookup_parent(*inode, perms, &parent);
  if (r && r != -EINVAL) {
    // Unexpected error
    _ll_forget(*inode, 1);
    return r;
  } else if (r == -EINVAL) {
    // EINVAL indicates node without parents (root), drop out now
    // and don't try to look up the non-existent dentry.
    return 0;
  }
  // FIXME: I don't think this works; lookup_parent() returns 0 if the parent
  // is already in cache
  ceph_assert(parent != NULL);

  // Num3: Finally, get the name (dentry) of the requested inode
  r = _lookup_name(*inode, parent, perms);
  if (r) {
    // Unexpected error
    _ll_forget(parent, 1);
    _ll_forget(*inode, 1);
    return r;
  }

  _ll_forget(parent, 1);
  return 0;
}
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r < 0) {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  } else {
    ceph_assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());
  }

  ldout(cct, 3) << __func__ << " " << vparent << " " << name
                << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  *out = in.get();
  return r;
}
int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
		    unsigned int want, unsigned int flags, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath fp(name, 0);
  InodeRef in;
  int rc;
  unsigned mask = statx_to_mask(flags, want);

  ldout(cct, 3) << __func__ << " " << name << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << name << std::endl;

  rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
  if (rc < 0) {
    /* zero out mask, just in case... */
    stx->stx_mask = 0;
    stx->stx_ino = 0;
    *out = NULL;
    return rc;
  } else {
    ceph_assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());
    *out = in.get();
    return 0;
  }
}
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->get();
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
    if (in->snapid != CEPH_NOSNAP)
      ll_snap_ref[in->snapid]++;
  }
  in->ll_get();
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
int Client::_ll_put(Inode *in, int num)
{
  in->ll_put(num);
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    if (in->snapid != CEPH_NOSNAP) {
      auto p = ll_snap_ref.find(in->snapid);
      ceph_assert(p != ll_snap_ref.end());
      ceph_assert(p->second > 0);
      if (--p->second == 0)
	ll_snap_ref.erase(p);
    }
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << __func__ << dendl;
  std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    next = it;
    ++next;
    if (in->ll_ref) {
      to_be_put.insert(in);
      _ll_put(in, in->ll_ref);
    }
  }
}
bool Client::_ll_forget(Inode *in, int count)
{
  inodeno_t ino = in->ino;

  ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  if (unmounting)
    return true;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
                  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
bool Client::ll_forget(Inode *in, int count)
{
  std::lock_guard lock(client_lock);
  return _ll_forget(in, count);
}

bool Client::ll_put(Inode *in)
{
  /* ll_forget already takes the lock */
  return ll_forget(in, 1);
}
int Client::ll_get_snap_ref(snapid_t snap)
{
  std::lock_guard lock(client_lock);
  auto p = ll_snap_ref.find(snap);
  if (p != ll_snap_ref.end())
    return p->second;
  return 0;
}

snapid_t Client::ll_get_snapid(Inode *in)
{
  std::lock_guard lock(client_lock);
  return in->snapid;
}
Inode *Client::ll_get_inode(ino_t ino)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return NULL;

  vinodeno_t vino = _map_faked_ino(ino);
  unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
  if (p == inode_map.end())
    return NULL;
  Inode *in = p->second;
  _ll_get(in);
  return in;
}

Inode *Client::ll_get_inode(vinodeno_t vino)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return NULL;

  unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
  if (p == inode_map.end())
    return NULL;
  Inode *in = p->second;
  _ll_get(in);
  return in;
}
int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << __func__ << " " << vino << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  if (vino.snapid < CEPH_NOSNAP)
    return 0;
  else
    return _getattr(in, caps, perms);
}
int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);

  if (res == 0)
    fill_stat(in, attr);
  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
  return res;
}
int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
			unsigned int flags, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  int res = 0;
  unsigned mask = statx_to_mask(flags, want);

  if (mask && !in->caps_issued_mask(mask, true))
    res = _ll_getattr(in, mask, perms);

  if (res == 0)
    fill_statx(in, mask, stx);
  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
  return res;
}
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
                << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef target(in);
  int res = _ll_setattrx(in, stx, mask, perms, &target);
  if (res == 0) {
    ceph_assert(in == target.get());
    fill_statx(in, in->caps_issued(), stx);
  }

  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
  return res;
}
int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
		       const UserPerm& perms)
{
  struct ceph_statx stx;
  stat_to_statx(attr, &stx);

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef target(in);
  int res = _ll_setattrx(in, &stx, mask, perms, &target);
  if (res == 0) {
    ceph_assert(in == target.get());
    fill_stat(in, attr);
  }

  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
  return res;
}
int Client::getxattr(const char *path, const char *name, void *value, size_t size,
		     const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
  if (r < 0)
    return r;
  return _getxattr(in, name, value, size, perms);
}

int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
		      const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
  if (r < 0)
    return r;
  return _getxattr(in, name, value, size, perms);
}

int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
		      const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  return _getxattr(f->inode, name, value, size, perms);
}

int Client::listxattr(const char *path, char *list, size_t size,
		      const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
  if (r < 0)
    return r;
  return Client::_listxattr(in.get(), list, size, perms);
}

int Client::llistxattr(const char *path, char *list, size_t size,
		       const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
  if (r < 0)
    return r;
  return Client::_listxattr(in.get(), list, size, perms);
}

int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  return Client::_listxattr(f->inode.get(), list, size, perms);
}

int Client::removexattr(const char *path, const char *name,
			const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  return _removexattr(in, name, perms);
}

int Client::lremovexattr(const char *path, const char *name,
			 const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  return _removexattr(in, name, perms);
}

int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  return _removexattr(f->inode, name, perms);
}

int Client::setxattr(const char *path, const char *name, const void *value,
		     size_t size, int flags, const UserPerm& perms)
{
  _setxattr_maybe_wait_for_osdmap(name, value, size);

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  return _setxattr(in, name, value, size, flags, perms);
}

int Client::lsetxattr(const char *path, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  _setxattr_maybe_wait_for_osdmap(name, value, size);

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  return _setxattr(in, name, value, size, flags, perms);
}

int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
		      int flags, const UserPerm& perms)
{
  _setxattr_maybe_wait_for_osdmap(name, value, size);

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  return _setxattr(f->inode, name, value, size, flags, perms);
}
int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
                      const UserPerm& perms)
{
  int r;

  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr) {
    r = -ENODATA;

    // Do a force getattr to get the latest quota before returning
    // a value to userspace.
    int flags = 0;
    if (vxattr->flags & VXATTR_RSTAT) {
      flags |= CEPH_STAT_RSTAT;
    }
    r = _getattr(in, flags, perms, true);
    if (r != 0) {
      // Error from getattr!
      return r;
    }

    // call pointer-to-member function
    char buf[256];
    if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
      r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
    } else {
      r = -ENODATA;
    }

    if (size != 0) {
      if (r > (int)size) {
        r = -ERANGE;
      } else if (r > 0) {
        memcpy(value, buf, r);
      }
    }
    goto out;
  }

  if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
    r = -EOPNOTSUPP;
    goto out;
  }

  r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    string n(name);
    r = -ENODATA;
    if (in->xattrs.count(n)) {
      r = in->xattrs[n].length();
      if (r > 0 && size != 0) {
        if (size >= (unsigned)r)
          memcpy(value, in->xattrs[n].c_str(), r);
        else
          r = -ERANGE;
      }
    }
  }
 out:
  ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
  return r;
}
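
// Note on precedence: a "ceph."-prefixed name that matches a virtual
// xattr is answered entirely from the callback table above and never
// consults the inode's real xattr map; only non-virtual names fall
// through to the CEPH_STAT_CAP_XATTR getattr path. A minimal sketch of
// the buffer contract (illustrative only, not an API defined here):
//
//   char small[4];
//   int n = _getxattr(in, "ceph.dir.entries", small, sizeof(small), perms);
//   // n > 0: value copied; n == -ERANGE: buffer too small;
//   // calling with size == 0 probes the required length without copying.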
int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
                      const UserPerm& perms)
{
  if (cct->_conf->client_permissions) {
    int r = xattr_permission(in.get(), name, MAY_READ, perms);
    if (r < 0)
      return r;
  }
  return _getxattr(in.get(), name, value, size, perms);
}

int Client::ll_getxattr(Inode *in, const char *name, void *value,
                        size_t size, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_READ, perms);
    if (r < 0)
      return r;
  }

  return _getxattr(in, name, value, size, perms);
}
int Client::_listxattr(Inode *in, char *name, size_t size,
                       const UserPerm& perms)
{
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    for (map<string,bufferptr>::iterator p = in->xattrs.begin();
         p != in->xattrs.end();
         ++p)
      r += p->first.length() + 1;

    const VXattr *vxattrs = _get_vxattrs(in);
    r += _vxattrs_name_size(vxattrs);

    if (size != 0) {
      if (size >= (unsigned)r) {
        for (map<string,bufferptr>::iterator p = in->xattrs.begin();
             p != in->xattrs.end();
             ++p) {
          memcpy(name, p->first.c_str(), p->first.length());
          name += p->first.length();
          *name = '\0';
          name++;
        }
        if (vxattrs) {
          for (int i = 0; !vxattrs[i].name.empty(); i++) {
            const VXattr& vxattr = vxattrs[i];
            if (vxattr.hidden)
              continue;
            // call pointer-to-member function
            if (vxattr.exists_cb && !(this->*(vxattr.exists_cb))(in))
              continue;
            memcpy(name, vxattr.name.c_str(), vxattr.name.length());
            name += vxattr.name.length();
            *name = '\0';
            name++;
          }
        }
      } else
        r = -ERANGE;
    }
  }
  ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
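
// _listxattr() follows the standard listxattr(2) probe protocol: callers
// first pass size == 0 to learn the total length of the NUL-separated
// name list, then call again with a buffer at least that large.
// Illustrative sketch via the public libcephfs wrapper (hedged; the
// wrapper lives outside this file):
//
//   int len = ceph_listxattr(cmount, path, NULL, 0);        // probe length
//   std::vector<char> names(len);
//   len = ceph_listxattr(cmount, path, names.data(), len);  // fill buffer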
int Client::ll_listxattr(Inode *in, char *names, size_t size,
                         const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << size << std::endl;

  return _listxattr(in, names, size, perms);
}
int Client::_do_setxattr(Inode *in, const char *name, const void *value,
                         size_t size, int flags, const UserPerm& perms)
{
  int xattr_flags = 0;
  if (!value)
    xattr_flags |= CEPH_XATTR_REMOVE;
  if (flags & XATTR_CREATE)
    xattr_flags |= CEPH_XATTR_CREATE;
  if (flags & XATTR_REPLACE)
    xattr_flags |= CEPH_XATTR_REPLACE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_string2(name);
  req->set_inode(in);
  req->head.args.setxattr.flags = xattr_flags;

  bufferlist bl;
  assert (value || size == 0);
  bl.append((const char*)value, size);
  req->set_data(bl);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
    res << dendl;
  return res;
}
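
// The wire flags mirror Linux setxattr(2) semantics: XATTR_CREATE makes
// the MDS fail with -EEXIST if the name already exists, XATTR_REPLACE
// fails with -ENODATA if it does not, and a null value turns the request
// into a removal. For example (illustrative calls):
//
//   _do_setxattr(in, "user.k", "v", 1, XATTR_CREATE, perms);   // create-only
//   _do_setxattr(in, "user.k", "w", 1, XATTR_REPLACE, perms);  // replace-only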
int Client::_setxattr(Inode *in, const char *name, const void *value,
                      size_t size, int flags, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  bool check_realm = false;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
        int ret = posix_acl_equiv_mode(value, size, &new_mode);
        if (ret < 0)
          return ret;
        if (ret == 0) {
          value = NULL;
          size = 0;
        }
        if (new_mode != in->mode) {
          struct ceph_statx stx;
          stx.stx_mode = new_mode;
          ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
          if (ret < 0)
            return ret;
        }
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
        if (!S_ISDIR(in->mode))
          return -EACCES;
        int ret = posix_acl_check(value, size);
        if (ret < 0)
          return -EINVAL;
        if (ret == 0) {
          value = NULL;
          size = 0;
        }
      }
    } else {
      return -EOPNOTSUPP;
    }
  } else {
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr) {
      if (vxattr->readonly)
        return -EOPNOTSUPP;
      if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
        check_realm = true;
    }
  }

  int ret = _do_setxattr(in, name, value, size, flags, perms);
  if (ret >= 0 && check_realm) {
    // check if snaprealm was created for quota inode
    if (in->quota.is_enable() &&
        !(in->snaprealm && in->snaprealm->ino == in->ino))
      ret = -EOPNOTSUPP;
  }

  return ret;
}
int Client::_setxattr(InodeRef &in, const char *name, const void *value,
                      size_t size, int flags, const UserPerm& perms)
{
  if (cct->_conf->client_permissions) {
    int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }
  return _setxattr(in.get(), name, value, size, flags, perms);
}
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p;    // create instance of parser
    std::map<string, string> m;             // map to receive results
    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
      return -EINVAL;
    }
    if (begin != end)
      return -EINVAL;
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
        tmp = q->second;
        break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    int64_t pool;
    try {
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
        return -ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
        return -ENOENT;
      }
    }
  }

  return 0;
}
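
// Two forms are accepted here (values illustrative): for name == "layout",
// value is a key=value string such as
//   "stripe_unit=65536 stripe_count=1 object_size=4194304 pool=mydata"
// and only the "pool" key is validated; for name == "layout.pool", value
// is the bare pool, either a numeric id ("3") or a pool name ("mydata").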
void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
{
  // Setting the data pool in a layout requires an osdmap epoch in the
  // MetaRequest. There is a race where a new data pool has been created
  // but neither the client nor the MDS has seen it yet; fetching the
  // latest osdmap lets the MDS quickly judge whether it needs a newer one.
  if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
      strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
    string rest(strstr(name, "layout"));
    string v((const char*)value, size);
    int r = objecter->with_osdmap([&](const OSDMap& o) {
      return _setxattr_check_data_pool(rest, v, &o);
    });

    if (r == -ENOENT) {
      C_SaferCond ctx;
      objecter->wait_for_latest_osdmap(&ctx);
      ctx.wait();
    }
  }
}
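
// Example (shell, illustrative): `setfattr -n ceph.file.layout.pool -v
// newpool somefile` takes this path. If "newpool" was created moments ago
// and is absent from the cached osdmap, the client waits for the latest
// osdmap here instead of letting the MDS reject the request outright.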
int Client::ll_setxattr(Inode *in, const char *name, const void *value,
                        size_t size, int flags, const UserPerm& perms)
{
  _setxattr_maybe_wait_for_osdmap(name, value, size);

  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }
  return _setxattr(in, name, value, size, flags, perms);
}
int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // same xattrs supported by kernel client
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "system.", 7) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5))
    return -EOPNOTSUPP;

  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr && vxattr->readonly)
    return -EOPNOTSUPP;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_filepath2(name);
  req->set_inode(in);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
  return res;
}

int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
{
  if (cct->_conf->client_permissions) {
    int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }
  return _removexattr(in.get(), name, perms);
}

int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
  tout(cct) << "ll_removexattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }

  return _removexattr(in, name, perms);
}
bool Client::_vxattrcb_quota_exists(Inode *in)
{
  return in->quota.is_enable() &&
         in->snaprealm && in->snaprealm->ino == in->ino;
}

size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
{
  return snprintf(val, size,
                  "max_bytes=%lld max_files=%lld",
                  (long long int)in->quota.max_bytes,
                  (long long int)in->quota.max_files);
}

size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
}

size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
}

bool Client::_vxattrcb_layout_exists(Inode *in)
{
  return in->layout != file_layout_t();
}
size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
{
  int r = snprintf(val, size,
                   "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
                   (unsigned long long)in->layout.stripe_unit,
                   (unsigned long long)in->layout.stripe_count,
                   (unsigned long long)in->layout.object_size);
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
        r += snprintf(val + r, size - r, "%s",
                      o.get_pool_name(in->layout.pool_id).c_str());
      else
        r += snprintf(val + r, size - r, "%" PRIu64,
                      (uint64_t)in->layout.pool_id);
    });
  if (in->layout.pool_ns.length())
    r += snprintf(val + r, size - r, " pool_namespace=%s",
                  in->layout.pool_ns.c_str());
  return r;
}
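
// Reading "ceph.file.layout" therefore yields a single line such as
// (illustrative values):
//   stripe_unit=4194304 stripe_count=1 object_size=4194304 pool=cephfs_data
// with " pool_namespace=<ns>" appended only when a namespace is set.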
size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
}

size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
}

size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
}

size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
{
  size_t r;
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
        r = snprintf(val, size, "%s", o.get_pool_name(
                       in->layout.pool_id).c_str());
      else
        r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
    });
  return r;
}

size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
}
size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
}

size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
}

size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
}

size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
}

size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
}

size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
}

size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
}
size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
{
  // zero-pad the nanoseconds field; "%ld.09%ld" printed a literal "09"
  return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
                  (long)in->rstat.rctime.nsec());
}
bool Client::_vxattrcb_dir_pin_exists(Inode *in)
{
  return in->dir_pin != -ENODATA;
}

size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%ld", (long)in->dir_pin);
}
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

#define XATTR_NAME_CEPH(_type, _name)                           \
{                                                               \
  name: CEPH_XATTR_NAME(_type, _name),                          \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,      \
  readonly: true,                                               \
  hidden: false,                                                \
  exists_cb: NULL,                                              \
  flags: 0,                                                     \
}
#define XATTR_NAME_CEPH2(_type, _name, _flags)                  \
{                                                               \
  name: CEPH_XATTR_NAME(_type, _name),                          \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,      \
  readonly: true,                                               \
  hidden: false,                                                \
  exists_cb: NULL,                                              \
  flags: _flags,                                                \
}
#define XATTR_LAYOUT_FIELD(_type, _name, _field)                \
{                                                               \
  name: CEPH_XATTR_NAME2(_type, _name, _field),                 \
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,     \
  readonly: false,                                              \
  hidden: true,                                                 \
  exists_cb: &Client::_vxattrcb_layout_exists,                  \
  flags: 0,                                                     \
}
#define XATTR_QUOTA_FIELD(_type, _name)                         \
{                                                               \
  name: CEPH_XATTR_NAME(_type, _name),                          \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,      \
  readonly: false,                                              \
  hidden: true,                                                 \
  exists_cb: &Client::_vxattrcb_quota_exists,                   \
  flags: 0,                                                     \
}
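
// For reference, XATTR_NAME_CEPH(dir, entries) expands (roughly) to the
// designated initializer below, so every table entry carries its full
// "ceph.<type>.<name>" string plus its callback:
//
//   { name: "ceph.dir.entries",
//     getxattr_cb: &Client::_vxattrcb_dir_entries,
//     readonly: true, hidden: false, exists_cb: NULL, flags: 0, }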
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries),
  XATTR_NAME_CEPH(dir, files),
  XATTR_NAME_CEPH(dir, subdirs),
  XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_quota_exists,
    flags: 0,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  {
    name: "ceph.dir.pin",
    getxattr_cb: &Client::_vxattrcb_dir_pin,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_dir_pin_exists,
    flags: 0,
  },
  { name: "" }     /* Required table terminator */
};

const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  { name: "" }     /* Required table terminator */
};
const Client::VXattr *Client::_get_vxattrs(Inode *in)
{
  if (in->is_dir())
    return _dir_vxattrs;
  else if (in->is_file())
    return _file_vxattrs;
  return NULL;
}

const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
{
  if (strncmp(name, "ceph.", 5) == 0) {
    const VXattr *vxattr = _get_vxattrs(in);
    if (vxattr) {
      while (!vxattr->name.empty()) {
        if (vxattr->name == name)
          return vxattr;
        vxattr++;
      }
    }
  }
  return NULL;
}

size_t Client::_vxattrs_calcu_name_size(const VXattr *vxattr)
{
  size_t len = 0;
  while (!vxattr->name.empty()) {
    if (!vxattr->hidden)
      len += vxattr->name.length() + 1;
    vxattr++;
  }
  return len;
}
int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_readlink " << vino << dendl;
  tout(cct) << "ll_readlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  for (auto dn : in->dentries) {
    touch_dn(dn);
  }

  int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
  ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
  return r;
}
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
                   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
                << mode << dec << ", " << rdev << ", uid " << perms.uid()
                << ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
                     dev_t rdev, struct stat *attr, Inode **out,
                     const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknod" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mknod " << vparent << " " << name
                << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}

int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
                      dev_t rdev, Inode **out,
                      struct ceph_statx *stx, unsigned want, unsigned flags,
                      const UserPerm& perms)
{
  unsigned caps = statx_to_mask(flags, want);
  std::lock_guard lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknodx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_statx(in, caps, stx);
    _ll_get(in.get());
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
                << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
                    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
                    int object_size, const char *data_pool, bool *created,
                    const UserPerm& perms)
{
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);

  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    if (pool_id > 0xffffffffll)
      return -ERANGE;  // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = cflags | CEPH_O_CREAT;

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if (fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
                << " layout " << stripe_unit
                << ' ' << stripe_count
                << ' ' << object_size
                <<") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
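
// Note: a data pool name is resolved to a numeric pool id on the client
// so the MDS receives a numeric "pool" in the open args; ids above
// 0xffffffff are rejected because the wire field is only 32 bits wide.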
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
                   InodeRef *inp)
{
  ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
                << mode << dec << ", uid " << perm.uid()
                << ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
                                     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
                     struct stat *attr, Inode **out, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdir" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    int r = may_create(parent, perm);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perm, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
                << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}

int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
                      struct ceph_statx *stx, unsigned want, unsigned flags,
                      const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdirx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());
  } else {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
                << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
int Client::_symlink(Inode *dir, const char *name, const char *target,
                     const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
                << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
                << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->set_string2(target);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::ll_symlink(Inode *parent, const char *name, const char *value,
                       struct stat *attr, Inode **out, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
                << dendl;
  tout(cct) << "ll_symlink" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_symlink " << vparent << " " << name
                << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}

int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
                        Inode **out, struct ceph_statx *stx, unsigned want,
                        unsigned flags, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
                << dendl;
  tout(cct) << "ll_symlinkx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
                << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
                << " uid " << perm.uid() << " gid " << perm.gid()
                << ")" << dendl;

  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  InodeRef otherin;
  Inode *in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;

  in = otherin.get();
  req->set_other_inode(in);
  in->break_all_delegs();
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
  tout(cct) << "ll_unlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    int r = may_delete(in, name, perm);
    if (r < 0)
      return r;
  }
  return _unlink(in, name, perm);
}
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
  ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
                << perms.uid() << " gid " << perms.gid() << ")" << dendl;

  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }

  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);

  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  InodeRef in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  if (op == CEPH_MDS_OP_RMDIR)
    req->set_dentry(de);

  res = _lookup(dir, name, 0, &in, perms);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RMSNAP) {
    unlink(de, true, true);
  }
  req->set_other_inode(in.get());

  res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
  tout(cct) << "ll_rmdir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    int r = may_delete(in, name, perms);
    if (r < 0)
      return r;
  }

  return _rmdir(in, name, perms);
}
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
{
  ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
                << todir->ino << " " << toname
                << " uid " << perm.uid() << " gid " << perm.gid() << ")"
                << dendl;

  if (fromdir->snapid != todir->snapid)
    return -EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -EROFS;
  }
  if (fromdir != todir) {
    Inode *fromdir_root =
      fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
    Inode *todir_root =
      todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
    if (fromdir_root != todir_root) {
      return -EXDEV;
    }
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    res = _lookup(fromdir, fromname, 0, &oldin, perm);
    if (res < 0)
      goto fail;

    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    res = _lookup(todir, toname, 0, &otherin, perm);
    switch (res) {
    case 0:
      {
        Inode *in = otherin.get();
        req->set_other_inode(in);
        in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);

    req->set_inode(todir);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // trim the now-renamed item from our cache
  trim_cache();
  ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
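
// Cross-directory renames are refused with -EXDEV when source and target
// resolve to different quota roots: moving a subtree between quota
// domains would make the rstat-based accounting wrong, so callers (e.g.
// `mv`) fall back to copy-plus-unlink in that case.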
int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
                      const char *newname, const UserPerm& perm)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);
  vinodeno_t vnewparent = _get_vino(newparent);

  ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
                << vnewparent << " " << newname << dendl;
  tout(cct) << "ll_rename" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << vnewparent.ino.val << std::endl;
  tout(cct) << newname << std::endl;

  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    int r = may_delete(parent, name, perm);
    if (r < 0)
      return r;
    r = may_delete(newparent, newname, perm);
    if (r < 0 && r != -ENOENT)
      return r;
  }

  return _rename(parent, name, newparent, newname, perm);
}
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
                << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);
  req->set_filepath(path);
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
                    const UserPerm& perm)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);
  vinodeno_t vnewparent = _get_vino(newparent);

  ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
    newname << dendl;
  tout(cct) << "ll_link" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << vnewparent << std::endl;
  tout(cct) << newname << std::endl;

  InodeRef target;

  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    if (S_ISDIR(in->mode))
      return -EPERM;

    int r = may_hardlink(in, perm);
    if (r < 0)
      return r;

    r = may_create(newparent, perm);
    if (r < 0)
      return r;
  }

  return _link(in, newparent, newname, perm, &target);
}
int Client::ll_num_osds(void)
{
  std::lock_guard lock(client_lock);
  return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
}

int Client::ll_osdaddr(int osd, uint32_t *addr)
{
  std::lock_guard lock(client_lock);

  entity_addr_t g;
  bool exists = objecter->with_osdmap([&](const OSDMap& o) {
      if (!o.exists(osd))
        return false;
      g = o.get_addrs(osd).front();
      return true;
    });
  if (!exists)
    return -1;
  uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
  *addr = ntohl(nb_addr);
  return 0;
}
uint32_t Client::ll_stripe_unit(Inode *in)
{
  std::lock_guard lock(client_lock);
  return in->layout.stripe_unit;
}

uint64_t Client::ll_snap_seq(Inode *in)
{
  std::lock_guard lock(client_lock);
  return in->snaprealm->seq;
}

int Client::ll_file_layout(Inode *in, file_layout_t *layout)
{
  std::lock_guard lock(client_lock);
  *layout = in->layout;
  return 0;
}

int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
{
  return ll_file_layout(fh->inode.get(), layout);
}
/* Currently we cannot take advantage of redundancy in reads, since we
   would have to go through all possible placement groups (a
   potentially quite large number determined by a hash), and use CRUSH
   to calculate the appropriate set of OSDs for each placement group,
   then index into that.  An array with one entry per OSD is much more
   tractable and works for demonstration purposes. */

int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
                              file_layout_t* layout)
{
  std::lock_guard lock(client_lock);

  inodeno_t ino = in->ino;
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint32_t stripe_count = layout->stripe_count;
  uint64_t stripes_per_object = object_size / su;
  uint64_t stripeno = 0, stripepos = 0;

  if (stripe_count) {
    stripeno = blockno / stripe_count;    // which horizontal stripe        (Y)
    stripepos = blockno % stripe_count;   // which object in the object set (X)
  }
  uint64_t objectsetno = stripeno / stripes_per_object;        // which object set
  uint64_t objectno = objectsetno * stripe_count + stripepos;  // object id

  object_t oid = file_object_t(ino, objectno);
  return objecter->with_osdmap([&](const OSDMap& o) {
      ceph_object_layout olayout =
        o.file_to_object_layout(oid, *layout);
      pg_t pg = (pg_t)olayout.ol_pgid;

      vector<int> osds;
      int primary;
      o.pg_to_acting_osds(pg, &osds, &primary);
      return primary;
    });
}
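
// Worked example (illustrative numbers): with object_size = 4 MiB,
// stripe_unit = 1 MiB and stripe_count = 4, stripes_per_object = 4.
// Block 9 maps to stripeno = 9 / 4 = 2, stripepos = 9 % 4 = 1,
// objectsetno = 2 / 4 = 0, so objectno = 0 * 4 + 1 = 1: the second
// object of the first object set, whose PG's primary OSD is returned.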
/* Return the offset of the block, internal to the object */

uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
{
  std::lock_guard lock(client_lock);
  file_layout_t *layout = &(in->layout);
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint64_t stripes_per_object = object_size / su;

  return (blockno % stripes_per_object) * su;
}
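
// Continuing the example above: with stripes_per_object = 4 and
// su = 1 MiB, block 9 lives at (9 % 4) * 1 MiB = 1 MiB from the start
// of its object.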
int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
                       const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_opendir " << vino << dendl;
  tout(cct) << "ll_opendir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    int r = may_open(in, flags, perms);
    if (r < 0)
      return r;
  }

  int r = _opendir(in, dirpp, perms);
  tout(cct) << (unsigned long)*dirpp << std::endl;

  ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
                << dendl;
  return r;
}
int Client::ll_releasedir(dir_result_t *dirp)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
  tout(cct) << "ll_releasedir" << std::endl;
  tout(cct) << (unsigned long)dirp << std::endl;

  _closedir(dirp);
  return 0;
}

int Client::ll_fsyncdir(dir_result_t *dirp)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
  tout(cct) << "ll_fsyncdir" << std::endl;
  tout(cct) << (unsigned long)dirp << std::endl;

  return _fsync(dirp->inode.get(), false);
}
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  ceph_assert(!(flags & O_CREAT));

  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  int r;
  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

 out:
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (unsigned long)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
    " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
                       int flags, InodeRef *in, int caps, Fh **fhp,
                       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
                << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    auto fuse_default_permissions = cct->_conf.get_val<bool>(
      "fuse_default_permissions");
    if (!fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
        goto out;
    }
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
                perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  ceph_assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    auto fuse_default_permissions = cct->_conf.get_val<bool>(
      "fuse_default_permissions");
    if (!fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
        if (*fhp) {
          int release_r = _release_fh(*fhp);
          ceph_assert(release_r == 0);  // during create, no async data ops should have happened
        }
        goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
        goto out;
    }
  }

 out:
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
int Client::ll_create(Inode *parent, const char *name, mode_t mode,
                      int flags, struct stat *attr, Inode **outp, Fh **fhp,
                      const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  InodeRef in;
  int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
                     fhp, perms);
  if (r >= 0) {
    ceph_assert(in);

    // passing an Inode in outp requires an additional ref
    if (outp) {
      _ll_get(in.get());
      *outp = in.get();
    }
    fill_stat(in, attr);
  } else {
    attr->st_ino = 0;
  }

  return r;
}

int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
                       int oflags, Inode **outp, Fh **fhp,
                       struct ceph_statx *stx, unsigned want, unsigned lflags,
                       const UserPerm& perms)
{
  unsigned caps = statx_to_mask(lflags, want);
  std::lock_guard lock(client_lock);

  InodeRef in;
  int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
  if (r >= 0) {
    ceph_assert(in);

    // passing an Inode in outp requires an additional ref
    if (outp) {
      _ll_get(in.get());
      *outp = in.get();
    }
    fill_statx(in, caps, stx);
  } else {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }

  return r;
}
loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "ll_lseek" << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  return _lseek(fh, offset, whence);
}

int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << off << "~" << len << dendl;
  tout(cct) << "ll_read" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;
  tout(cct) << off << std::endl;
  tout(cct) << len << std::endl;

  /* We can't return more bytes than INT_MAX, clamp len to that */
  len = std::min(len, (loff_t)INT_MAX);
  return _read(fh, off, len, bl);
}
int Client::ll_read_block(Inode *in, uint64_t blockid,
                          char *buf,
                          uint64_t offset,
                          uint64_t length,
                          file_layout_t* layout)
{
  std::lock_guard lock(client_lock);

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
                 object_locator_t(layout->pool_id),
                 offset,
                 length,
                 vino.snapid,
                 &bl,
                 CEPH_OSD_FLAG_READ,
                 &onfinish);

  client_lock.Unlock();
  int r = onfinish.wait();
  client_lock.Lock();

  if (r >= 0) {
    bl.copy(0, bl.length(), buf);
    r = bl.length();
  }

  return r;
}
/* It appears that the OSD doesn't return success unless the entire
   buffer was written, return the write length on success. */

int Client::ll_write_block(Inode *in, uint64_t blockid,
                           char* buf, uint64_t offset,
                           uint64_t length, file_layout_t* layout,
                           uint64_t snapseq, uint32_t sync)
{
  vinodeno_t vino = ll_get_vino(in);
  int r = 0;
  std::unique_ptr<C_SaferCond> onsafe = nullptr;

  if (true || sync) {
    /* if write is stable, the epilogue is waiting on
     * flock */
    onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
  }
  object_t oid = file_object_t(vino.ino, blockid);
  SnapContext fakesnap;
  ceph::bufferlist bl;

  bl.push_back(buffer::copy(buf, length));

  ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
                << dendl;

  fakesnap.seq = snapseq;

  /* lock just in time */
  client_lock.Lock();
  if (unmounting) {
    client_lock.Unlock();
    return -ENOTCONN;
  }

  objecter->write(oid,
                  object_locator_t(layout->pool_id),
                  offset,
                  length,
                  fakesnap,
                  bl,
                  ceph::real_clock::now(),
                  0,
                  onsafe.get());

  client_lock.Unlock();
  if (nullptr != onsafe) {
    r = onsafe->wait();
  }

  if (r < 0) {
    return r;
  } else {
    return length;
  }
}
int Client::ll_commit_blocks(Inode *in,
                             uint64_t offset,
                             uint64_t length)
{
    std::lock_guard lock(client_lock);
    /*
    BarrierContext *bctx;
    vinodeno_t vino = _get_vino(in);
    uint64_t ino = vino.ino;

    ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
                  << offset << " to " << length << dendl;

    map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
    if (p != barriers.end()) {
      barrier_interval civ(offset, offset + length);
      p->second->commit_barrier(civ);
    }
    */
    return 0;
}
int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
    "~" << len << dendl;
  tout(cct) << "ll_write" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;
  tout(cct) << off << std::endl;
  tout(cct) << len << std::endl;

  /* We can't return bytes written larger than INT_MAX, clamp len to that */
  len = std::min(len, (loff_t)INT_MAX);
  int r = _write(fh, off, len, data, NULL, 0);
  ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
                << dendl;
  return r;
}

int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
{
  std::lock_guard lock(client_lock);
  return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
}

int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
{
  std::lock_guard lock(client_lock);
  return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
}
int Client::ll_flush(Fh *fh)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_flush" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  return _flush(fh);
}

int Client::ll_fsync(Fh *fh, bool syncdataonly)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_fsync" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  int r = _fsync(fh, syncdataonly);
  if (r) {
    // If we're returning an error, clear it from the FH
    fh->take_async_err();
  }
  return r;
}

int Client::ll_sync_inode(Inode *in, bool syncdataonly)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
  tout(cct) << "ll_sync_inode" << std::endl;
  tout(cct) << (unsigned long)in << std::endl;

  return _fsync(in, syncdataonly);
}
#ifdef FALLOC_FL_PUNCH_HOLE

int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  if (offset < 0 || length <= 0)
    return -EINVAL;

  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;
  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      bufferlist bl;
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          in->inline_data.copy(0, offset, bl);
        int size = length;
        if (offset + size > len)
          size = len - offset;
        if (size > 0)
          bl.append_zero(size);
        if (offset + size < len)
          in->inline_data.copy(offset + size, len - offset - size, bl);
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
        uninline_data(in, onuninline.get());
      }

      C_SaferCond onfinish("Client::_punch_hole flock");

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
                  in->snaprealm->get_snap_context(),
                  offset, length,
                  ceph::real_clock::now(),
                  0, true, &onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      client_lock.Unlock();
      onfinish.wait();
      client_lock.Lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
        check_caps(in, 0);
      }
    }
  }

  if (nullptr != onuninline) {
    client_lock.Unlock();
    int ret = onuninline->wait();
    client_lock.Lock();

    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else
      r = ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}

#else
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  return -EOPNOTSUPP;
}

#endif
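
// Summary of the fallocate(2) modes accepted above (behaviour sketched
// from the checks in _fallocate): plain allocation (mode == 0) may grow
// i_size; FALLOC_FL_KEEP_SIZE allocates without growing the file; hole
// punching requires FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE and is
// implemented by invalidating the cached range and zeroing the backing
// objects via filer->zero(). Any other flag combination returns
// -EOPNOTSUPP, as does building without FALLOC_FL_PUNCH_HOLE.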
int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _fallocate(fh, mode, offset, length);
}
int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << " " << fd << " " << mode << " " << offset << " " << length << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  return _fallocate(fh, mode, offset, length);
}
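
/*
 * Caller's-eye sketch (illustrative, not compiled): punching a 1 MiB hole
 * at the start of a file via the fd-based entry point above. 'fd' is a
 * hypothetical descriptor obtained from Client::open().
 *
 *   int r = client->fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
 *                             0, 1 << 20);
 */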
int Client::ll_release(Fh *fh)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << __func__ << " (fh)" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (ll_unclosed_fh_set.count(fh))
    ll_unclosed_fh_set.erase(fh);
  return _release_fh(fh);
}
int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
  tout(cct) << "ll_getlk (fh)" << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _getlk(fh, fl, owner);
}
int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _setlk(fh, fl, owner, sleep);
}
int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;

  if (unmounting)
    return -ENOTCONN;

  return _flock(fh, cmd, owner);
}
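
/*
 * Illustrative sketch (not compiled): the three entry points above share
 * the caller-supplied 'owner' token, which identifies the lock owner;
 * locks taken under the same owner do not conflict with one another.
 * Testing whether a write lock could be taken ('owner_token' is a
 * hypothetical caller-chosen value):
 *
 *   struct flock fl = {};
 *   fl.l_type = F_WRLCK;
 *   fl.l_whence = SEEK_SET;
 *   int r = client->ll_getlk(fh, &fl, owner_token);
 */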
int Client::set_deleg_timeout(uint32_t timeout)
{
  std::lock_guard lock(client_lock);

  /*
   * The whole point is to prevent blacklisting so we must time out the
   * delegation before the session autoclose timeout kicks in.
   */
  if (timeout >= mdsmap->get_session_autoclose())
    return -EINVAL;

  deleg_timeout = timeout;
  return 0;
}
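
/*
 * Numeric example of the constraint above (illustrative): assuming the
 * usual session_autoclose of 300 seconds, set_deleg_timeout(300) returns
 * -EINVAL, while set_deleg_timeout(60) succeeds and leaves ample margin
 * before the MDS would autoclose (and potentially blacklist) the session.
 */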
int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
{
  int ret = -EINVAL;

  std::lock_guard lock(client_lock);

  if (!mounted)
    return -ENOTCONN;

  Inode *inode = fh->inode.get();

  switch(cmd) {
  case CEPH_DELEGATION_NONE:
    inode->unset_deleg(fh);
    ret = 0;
    break;
  default:
    try {
      ret = inode->set_deleg(fh, cmd, cb, priv);
    } catch (std::bad_alloc&) {
      ret = -ENOMEM;
    }
    break;
  }
  return ret;
}
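
/*
 * Illustrative sketch (not compiled): acquiring and later dropping a read
 * delegation through the entry point above. 'recall_cb' and 'recall_priv'
 * are hypothetical caller-supplied names; CEPH_DELEGATION_RD is the read
 * counterpart of the CEPH_DELEGATION_NONE case handled explicitly above.
 *
 *   int r = client->ll_delegation(fh, CEPH_DELEGATION_RD, recall_cb, recall_priv);
 *   ...
 *   client->ll_delegation(fh, CEPH_DELEGATION_NONE, nullptr, nullptr);
 */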
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();
  }
  void finish(int r) override {
    std::lock_guard l(client->client_lock);
    ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);
  }
};

void Client::ll_interrupt(void *d)
{
  MetaRequest *req = static_cast<MetaRequest*>(d);
  ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
  tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
  interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
}
// =========================================
// expose file layouts

int Client::describe_layout(const char *relpath, file_layout_t *lp,
			    const UserPerm& perms)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;

  *lp = in->layout;

  ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
  return 0;
}
int Client::fdescribe_layout(int fd, file_layout_t *lp)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  *lp = in->layout;

  ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
  return 0;
}
int64_t Client::get_default_pool_id()
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  /* first data pool is the default */
  return mdsmap->get_first_data_pool();
}

int64_t Client::get_pool_id(const char *pool_name)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
			       pool_name);
}

string Client::get_pool_name(int64_t pool)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return string();

  return objecter->with_osdmap([pool](const OSDMap& o) {
      return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
    });
}

int Client::get_pool_replication(int64_t pool)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  return objecter->with_osdmap([pool](const OSDMap& o) {
      return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
    });
}
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  if (len) {
    uint64_t su = in->layout.stripe_unit;
    *len = su - (off % su);
  }

  return 0;
}
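
/*
 * Worked example of the remainder computation above (illustrative): with
 * the default stripe_unit of 4 MiB (0x400000) and off = 5 MiB (0x500000),
 *
 *   *len = 0x400000 - (0x500000 % 0x400000) = 0x300000   // 3 MiB
 *
 * i.e. 3 MiB remain before the next stripe-unit boundary.
 */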
int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  if (id < 0)
    return -EINVAL;
  return objecter->with_osdmap([&](const OSDMap& o) {
      return o.crush->get_full_location_ordered(id, path);
    });
}
int Client::get_file_stripe_address(int fd, loff_t offset,
				    vector<entity_addr_t>& address)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // which object?
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
			   in->truncate_size, extents);
  ceph_assert(extents.size() == 1);

  // now we have the object and its 'layout'
  return objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      vector<int> osds;
      o.pg_to_acting_osds(pg, osds);
      if (osds.empty())
	return -EINVAL;
      for (unsigned i = 0; i < osds.size(); i++) {
	entity_addr_t addr = o.get_addrs(osds[i]).front();
	address.push_back(addr);
      }
      return 0;
    });
}
int Client::get_osd_addr(int osd, entity_addr_t& addr)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  return objecter->with_osdmap([&](const OSDMap& o) {
      if (!o.exists(osd))
	return -ENOENT;

      addr = o.get_addrs(osd).front();
      return 0;
    });
}
int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
			     loff_t length, loff_t offset)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // map to a list of extents
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);

  ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
  return 0;
}
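
/*
 * Illustrative sketch (not compiled): with the default file layout
 * (stripe_unit = object_size = 4 MiB, stripe_count = 1), enumerating
 * 10 MiB from offset 0 yields three ObjectExtents: 4 MiB in each of the
 * file's first two RADOS objects and 2 MiB in the third.
 *
 *   vector<ObjectExtent> extents;
 *   int r = client->enumerate_layout(fd, extents, 10 << 20, 0);
 */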
/* find an osd with the same ip.  -ENXIO if none. */
int Client::get_local_osd()
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  objecter->with_osdmap([this](const OSDMap& o) {
      if (o.get_epoch() != local_osd_epoch) {
	local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
	local_osd_epoch = o.get_epoch();
      }
    });
  return local_osd;
}
// ===============================

void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
}

bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}

void Client::ms_handle_remote_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  std::lock_guard l(client_lock);
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSession *s = NULL;
      for (auto &p : mds_sessions) {
	if (mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
	  mds = p.first;
	  s = &p.second;
	}
      }
      if (mds >= 0) {
	assert (s != NULL);
	switch (s->state) {
	case MetaSession::STATE_CLOSING:
	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	  _closed_mds_session(s);
	  break;

	case MetaSession::STATE_OPENING:
	  {
	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	    list<Context*> waiters;
	    waiters.swap(s->waiting_for_open);
	    _closed_mds_session(s);
	    MetaSession *news = _get_or_open_mds_session(mds);
	    news->waiting_for_open.swap(waiters);
	  }
	  break;

	case MetaSession::STATE_OPEN:
	  {
	    objecter->maybe_request_map(); /* to check if we are blacklisted */
	    const auto& conf = cct->_conf;
	    if (conf->client_reconnect_stale) {
	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	      _closed_mds_session(s);
	    } else {
	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	      s->state = MetaSession::STATE_STALE;
	    }
	  }
	  break;

	case MetaSession::STATE_NEW:
	case MetaSession::STATE_CLOSED:
	default:
	  break;
	}
      }
    }
    break;
  }
}

bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
  return false;
}

bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer)
{
  if (dest_type == CEPH_ENTITY_TYPE_MON)
    return true;
  *authorizer = monclient->build_authorizer(dest_type);
  return true;
}
*Client::get_quota_root(Inode
*in
, const UserPerm
& perms
)
14001 Inode
*quota_in
= root_ancestor
;
14002 SnapRealm
*realm
= in
->snaprealm
;
14004 ldout(cct
, 10) << __func__
<< " realm " << realm
->ino
<< dendl
;
14005 if (realm
->ino
!= in
->ino
) {
14006 auto p
= inode_map
.find(vinodeno_t(realm
->ino
, CEPH_NOSNAP
));
14007 if (p
== inode_map
.end())
14010 if (p
->second
->quota
.is_enable()) {
14011 quota_in
= p
->second
;
14015 realm
= realm
->pparent
;
14017 ldout(cct
, 10) << __func__
<< " " << in
->vino() << " -> " << quota_in
->vino() << dendl
;
/**
 * Traverse quota ancestors of the Inode, return true
 * if any of them passes the passed function
 */
bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
				   std::function<bool (const Inode &in)> test)
{
  while (true) {
    ceph_assert(in != NULL);
    if (test(*in)) {
      return true;
    }

    if (in == root_ancestor) {
      // We're done traversing, drop out
      return false;
    } else {
      // Continue up the tree
      in = get_quota_root(in, perms);
    }
  }
}
bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
{
  return check_quota_condition(in, perms,
      [](const Inode &in) {
        return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
      });
}

bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
				     const UserPerm& perms)
{
  return check_quota_condition(in, perms,
      [&new_bytes](const Inode &in) {
	return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
	       > in.quota.max_bytes;
      });
}

bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
{
  return check_quota_condition(in, perms,
      [](const Inode &in) {
	if (in.quota.max_bytes) {
	  if (in.rstat.rbytes >= in.quota.max_bytes) {
	    return true;
	  }

	  ceph_assert(in.size >= in.reported_size);
	  const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
	  const uint64_t size = in.size - in.reported_size;
	  return (space >> 4) < size;
	} else {
	  return false;
	}
      });
}
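
/*
 * Worked example of the headroom test above (illustrative): the predicate
 * (space >> 4) < size reports "approaching" once the unreported dirty
 * bytes exceed 1/16 of the remaining quota. With quota.max_bytes = 1 GiB
 * and rstat.rbytes = 960 MiB, space is 64 MiB, so more than 4 MiB of
 * growth not yet reported to the MDS triggers an early check_caps()
 * flush in callers such as _fallocate().
 */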
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checks
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have already been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    client_lock.Unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.Lock();

    bool errored = false;

    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
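
/*
 * Summary of the probe outcomes above (illustrative): the read probe is a
 * stat and the write probe an exclusive create on the file's first object,
 * so neither -ENOENT (nothing written yet) nor -EEXIST (object already
 * present) indicates a permission problem:
 *
 *   rd_ret == 0 || rd_ret == -ENOENT  ->  POOL_READ granted
 *   wr_ret == 0 || wr_ret == -EEXIST  ->  POOL_WRITE granted
 *   -EPERM                            ->  OSD cap genuinely missing
 *   anything else                     ->  indeterminate; cached state
 *                                         dropped, -EIO returned
 */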
int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
{
  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];

      return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
    }
  }
  return -EAGAIN;
}

int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
	goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  if (S_ISLNK(*mode))
    return 0;

  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      if (xattrs.size() > 0)
	encode(xattrs, xattrs_bl);
    } else {
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
void Client::set_filer_flags(int flags)
{
  std::lock_guard l(client_lock);
  ceph_assert(flags == 0 ||
	      flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->add_global_op_flags(flags);
}

void Client::clear_filer_flags(int flags)
{
  std::lock_guard l(client_lock);
  ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->clear_global_op_flag(flags);
}

// called before mount
void Client::set_uuid(const std::string& uuid)
{
  std::lock_guard l(client_lock);
  assert(initialized);
  assert(!uuid.empty());

  metadata["uuid"] = uuid;
  _close_sessions();
}

// called before mount. 0 means infinite
void Client::set_session_timeout(unsigned timeout)
{
  std::lock_guard l(client_lock);
  assert(initialized);

  metadata["timeout"] = stringify(timeout);
}
// called before mount
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  std::lock_guard l(client_lock);
  if (!initialized)
    return -ENOTCONN;

  if (uuid.empty())
    return -EINVAL;

  {
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSession *session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state != MetaSession::STATE_OPENING) {
	// umounting?
	return -EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      if (rejected_by_mds.count(mds))
	return -EPERM;
      continue;
    }

    session = &mds_sessions.at(mds);
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = MClientReclaim::create(uuid, flags);
      session->con->send_message2(std::move(m));
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      return reclaim_errno ? : -ENOTRECOVERABLE;
    } else {
      mds++;
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -ENOENT;
    return -ENOTRECOVERABLE;
  }

  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blacklist to check if target session was killed
  // (config option mds_session_blacklist_on_evict needs to be true)
  C_SaferCond cond;
  if (!objecter->wait_for_map(reclaim_osd_epoch, &cond)) {
    ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
    client_lock.Unlock();
    cond.wait();
    client_lock.Lock();
  }

  bool blacklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blacklisted(reclaim_target_addrs);
      });
  if (blacklisted)
    return -ENOTRECOVERABLE;

  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
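
/*
 * Illustrative reclaim flow (not compiled), pieced together from the
 * functions above and below: a replacement client instance announces its
 * own uuid before mount, reclaims the state of a dead instance, and on
 * success adopts the reclaimed uuid via finish_reclaim().
 *
 *   client->set_uuid("new-instance-uuid");          // hypothetical uuids
 *   int r = client->start_reclaim("dead-instance-uuid", flags, fs_name);
 *   if (r == 0)
 *     client->finish_reclaim();  // metadata["uuid"] = reclaimed uuid
 */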
void Client::finish_reclaim()
{
  auto it = metadata.find("reclaiming_uuid");
  if (it == metadata.end()) {
    for (auto &p : mds_sessions)
      p.second.reclaim_state = MetaSession::RECLAIM_NULL;
    return;
  }

  for (auto &p : mds_sessions) {
    p.second.reclaim_state = MetaSession::RECLAIM_NULL;
    auto m = MClientReclaim::create("", MClientReclaim::FLAG_FINISH);
    p.second.con->send_message2(std::move(m));
  }

  metadata["uuid"] = it->second;
  metadata.erase(it);
}
void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
{
  mds_rank_t from = mds_rank_t(reply->get_source().num());
  ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, reply->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
    return;
  }

  if (reply->get_result() >= 0) {
    session->reclaim_state = MetaSession::RECLAIM_OK;
    if (reply->get_epoch() > reclaim_osd_epoch)
      reclaim_osd_epoch = reply->get_epoch();
    if (!reply->get_addrs().empty())
      reclaim_target_addrs = reply->get_addrs();
  } else {
    session->reclaim_state = MetaSession::RECLAIM_FAIL;
    reclaim_errno = reply->get_result();
  }

  signal_cond_list(waiting_for_reclaim);
}
/**
 * This is included in cap release messages, to cause
 * the MDS to wait until this OSD map epoch.  It is necessary
 * in corner cases where we cancel RADOS ops, so that
 * nobody else tries to do IO to the same objects in
 * the same epoch as the cancelled ops.
 */
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  cap_epoch_barrier = e;
}
const char** Client::get_tracked_conf_keys() const
{
  static const char* keys[] = {
    "client_cache_size",
    "client_cache_mid",
    "client_acl_type",
    "client_deleg_timeout",
    "client_deleg_break_on_open",
    NULL
  };
  return keys;
}
void Client::handle_conf_change(const ConfigProxy& conf,
				const std::set<std::string> &changed)
{
  std::lock_guard lock(client_lock);

  if (changed.count("client_cache_mid")) {
    lru.lru_set_midpoint(cct->_conf->client_cache_mid);
  }
  if (changed.count("client_acl_type")) {
    acl_type = NO_ACL;
    if (cct->_conf->client_acl_type == "posix_acl")
      acl_type = POSIX_ACL;
  }
}

void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}

void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
mds_rank_t Client::_get_random_up_mds() const
{
  ceph_assert(client_lock.is_locked_by_me());

  std::set<mds_rank_t> up;
  mdsmap->get_up_mds_set(up);

  if (up.empty())
    return MDS_RANK_NONE;
  std::set<mds_rank_t>::const_iterator p = up.begin();
  for (int n = rand() % up.size(); n; n--)
    ++p;
  return *p;
}
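
/*
 * Example (illustrative): with up = {0, 3, 5} and rand() % 3 == 2, the
 * iterator advances twice from up.begin() and mds.5 is selected.
 */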
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
    : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}

StandaloneClient::~StandaloneClient()
{
  delete objecter;
  objecter = nullptr;
}

int StandaloneClient::init()
{
  objectcacher->start();
  objecter->init();

  client_lock.Lock();
  ceph_assert(!is_initialized());

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    client_lock.Unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.Unlock();
  _finish_init();

  return 0;
}

void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}