1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
18 #include <sys/types.h>
23 #include <sys/param.h>
26 #include <sys/utsname.h>
29 #include <boost/lexical_cast.hpp>
30 #include <boost/fusion/include/std_pair.hpp>
32 #if defined(__FreeBSD__)
33 #define XATTR_CREATE 0x1
34 #define XATTR_REPLACE 0x2
36 #include <sys/xattr.h>
39 #if defined(__linux__)
40 #include <linux/falloc.h>
43 #include <sys/statvfs.h>
45 #include "common/config.h"
46 #include "common/version.h"
48 #include "mon/MonClient.h"
50 #include "messages/MClientCaps.h"
51 #include "messages/MClientLease.h"
52 #include "messages/MClientQuota.h"
53 #include "messages/MClientReclaim.h"
54 #include "messages/MClientReclaimReply.h"
55 #include "messages/MClientReconnect.h"
56 #include "messages/MClientReply.h"
57 #include "messages/MClientRequest.h"
58 #include "messages/MClientRequestForward.h"
59 #include "messages/MClientSession.h"
60 #include "messages/MClientSnap.h"
61 #include "messages/MCommandReply.h"
62 #include "messages/MFSMap.h"
63 #include "messages/MFSMapUser.h"
64 #include "messages/MMDSMap.h"
65 #include "messages/MOSDMap.h"
67 #include "mds/flock.h"
68 #include "mds/cephfs_features.h"
69 #include "osd/OSDMap.h"
70 #include "osdc/Filer.h"
72 #include "common/Cond.h"
73 #include "common/perf_counters.h"
74 #include "common/admin_socket.h"
75 #include "common/errno.h"
76 #include "include/str_list.h"
78 #define dout_subsys ceph_subsys_client
80 #include "include/lru.h"
81 #include "include/compat.h"
82 #include "include/stringify.h"
87 #include "Delegation.h"
89 #include "ClientSnapRealm.h"
91 #include "MetaSession.h"
92 #include "MetaRequest.h"
93 #include "ObjecterWriteback.h"
94 #include "posix_acl.h"
96 #include "include/ceph_assert.h"
97 #include "include/stat.h"
99 #include "include/cephfs/ceph_ll_client.h"
101 #if HAVE_GETGROUPLIST
108 #define dout_prefix *_dout << "client." << whoami << " "
110 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
112 // FreeBSD fails to define this
116 // Darwin fails to define this
125 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
// C-style trampoline handed to the ObjectCacher as its flush callback:
// recovers the owning Client from the opaque context pointer and forwards
// the flushed ObjectSet to Client::flush_set_callback().
127 void client_flush_set_callback(void *p
, ObjectCacher::ObjectSet
*oset
)
// p is the Client* registered at ObjectCacher construction time.
129 Client
*client
= static_cast<Client
*>(p
);
130 client
->flush_set_callback(oset
);
136 Client::CommandHook::CommandHook(Client
*client
) :
141 int Client::CommandHook::call(
142 std::string_view command
,
143 const cmdmap_t
& cmdmap
,
148 f
->open_object_section("result");
150 std::lock_guard l
{m_client
->client_lock
};
151 if (command
== "mds_requests")
152 m_client
->dump_mds_requests(f
);
153 else if (command
== "mds_sessions")
154 m_client
->dump_mds_sessions(f
);
155 else if (command
== "dump_cache")
156 m_client
->dump_cache(f
);
157 else if (command
== "kick_stale_sessions")
158 m_client
->_kick_stale_sessions();
159 else if (command
== "status")
160 m_client
->dump_status(f
);
162 ceph_abort_msg("bad command registered");
171 dir_result_t::dir_result_t(Inode
*in
, const UserPerm
& perms
)
172 : inode(in
), offset(0), next_offset(2),
173 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
// Reinitialize the fake-inode allocator: mark the whole range
// [start, 2^32-1] as free, reset both allocation cursors, and decide
// whether faked inos are needed at all (32-bit ino_t platforms, or
// forced via the client_use_faked_inos config option).
177 void Client::_reset_faked_inos()
180 free_faked_inos
.clear();
// Whole 32-bit range from `start` upward is initially free.
// NOTE(review): the declaration of `start` is elided in this view —
// presumably the low ids below it are reserved; confirm in full source.
181 free_faked_inos
.insert(start
, (uint32_t)-1 - start
+ 1);
182 last_used_faked_ino
= 0;
183 last_used_faked_root
= 0;
// Faked inos are required when ino_t cannot hold a 64-bit ceph ino,
// or when explicitly enabled by configuration.
184 _use_faked_inos
= sizeof(ino_t
) < 8 || cct
->_conf
->client_use_faked_inos
;
// Allocate the next free fake inode number for `in`, record the
// faked_ino -> vinodeno mapping, and remove the id from the free pool.
// Ids below 2048 are reserved for _assign_faked_root (mount-point roots).
187 void Client::_assign_faked_ino(Inode
*in
)
// First allocation: skip past the reserved root range.
189 if (0 == last_used_faked_ino
)
190 last_used_faked_ino
= last_used_faked_ino
+ 2048; // start(1024)~2048 reserved for _assign_faked_root
// Find the first free interval at or above the cursor.
191 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
// Wrapped past the top of the space: restart scanning just above the
// reserved root range.
192 if (it
== free_faked_inos
.end() && last_used_faked_ino
> 0) {
193 last_used_faked_ino
= 2048;
194 it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
// The pool must never be exhausted.
196 ceph_assert(it
!= free_faked_inos
.end());
// Jump the cursor to the start of the free interval if it lies ahead.
197 if (last_used_faked_ino
< it
.get_start()) {
198 ceph_assert(it
.get_len() > 0);
199 last_used_faked_ino
= it
.get_start();
// Take the next id inside this free interval.
201 ++last_used_faked_ino
;
202 ceph_assert(it
.get_start() + it
.get_len() > last_used_faked_ino
);
// Publish the assignment and remove the id from the free pool.
204 in
->faked_ino
= last_used_faked_ino
;
205 free_faked_inos
.erase(in
->faked_ino
);
206 faked_ino_map
[in
->faked_ino
] = in
->vino();
210 * In faked mode, if you export multiple subdirectories,
211 * you will see that the inode numbers of the exported subdirectories
212 * are the same. So we distinguish the mount points by reserving
213 * the "fake ids" between 1024 and 2048, and by combining the last
214 * 10 bits (0x3ff) of the "root inodes".
// Allocate a fake inode number for a mount-point root inode out of the
// reserved 1024~2048 range, mixing in the low 10 bits of the real root
// ino so different exported subdirectories get distinct fake roots.
216 void Client::_assign_faked_root(Inode
*in
)
// Find a free interval at or above the root cursor.
218 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
// Wrapped: restart the scan from the bottom of the space.
219 if (it
== free_faked_inos
.end() && last_used_faked_root
> 0) {
220 last_used_faked_root
= 0;
221 it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
// NOTE(review): sibling _assign_faked_ino uses ceph_assert() here;
// the bare assert() (compiled out under NDEBUG) looks inconsistent —
// consider ceph_assert for uniform always-on checking.
223 assert(it
!= free_faked_inos
.end());
// Derive the fake root id from the real root inode number.
224 vinodeno_t inode_info
= in
->vino();
225 uint64_t inode_num
= (uint64_t)inode_info
.ino
;
226 ldout(cct
, 10) << "inode_num " << inode_num
<< "inode_num & 0x3ff=" << (inode_num
& 0x3ff)<< dendl
;
227 last_used_faked_root
= it
.get_start() + (inode_num
& 0x3ff); // 0x3ff mask and get_start will not exceed 2048
228 assert(it
.get_start() + it
.get_len() > last_used_faked_root
);
// Publish the assignment and remove the id from the free pool.
230 in
->faked_ino
= last_used_faked_root
;
231 free_faked_inos
.erase(in
->faked_ino
);
232 faked_ino_map
[in
->faked_ino
] = in
->vino();
// Return an inode's fake inode number to the free pool and drop its
// faked_ino -> vinodeno mapping (inverse of _assign_faked_ino/_root).
235 void Client::_release_faked_ino(Inode
*in
)
237 free_faked_inos
.insert(in
->faked_ino
);
238 faked_ino_map
.erase(in
->faked_ino
);
// Translate a (possibly fake) inode number back to the real vinodeno.
// Falls back to vinodeno_t(0, CEPH_NOSNAP) when the id is unknown.
// Caller must hold client_lock (see the locking wrapper map_faked_ino).
241 vinodeno_t
Client::_map_faked_ino(ino_t ino
)
// Known fake id: look up the recorded real vinodeno.
246 else if (faked_ino_map
.count(ino
))
247 vino
= faked_ino_map
[ino
];
// Unknown id: return a null-ish sentinel.
249 vino
= vinodeno_t(0, CEPH_NOSNAP
);
250 ldout(cct
, 10) << __func__
<< " " << ino
<< " -> " << vino
<< dendl
;
// Public, locking wrapper around _map_faked_ino(): takes client_lock
// and delegates the fake-ino -> vinodeno translation.
254 vinodeno_t
Client::map_faked_ino(ino_t ino
)
256 std::lock_guard
lock(client_lock
);
257 return _map_faked_ino(ino
);
262 Client::Client(Messenger
*m
, MonClient
*mc
, Objecter
*objecter_
)
263 : Dispatcher(m
->cct
),
264 timer(m
->cct
, client_lock
),
268 whoami(mc
->get_global_id()),
269 async_ino_invalidator(m
->cct
),
270 async_dentry_invalidator(m
->cct
),
271 interrupt_finisher(m
->cct
),
272 remount_finisher(m
->cct
),
273 async_ino_releasor(m
->cct
),
274 objecter_finisher(m
->cct
),
275 m_command_hook(this),
280 user_id
= cct
->_conf
->client_mount_uid
;
281 group_id
= cct
->_conf
->client_mount_gid
;
282 fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
283 "fuse_default_permissions");
285 if (cct
->_conf
->client_acl_type
== "posix_acl")
286 acl_type
= POSIX_ACL
;
288 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
291 free_fd_set
.insert(10, 1<<30);
293 mdsmap
.reset(new MDSMap
);
296 writeback_handler
.reset(new ObjecterWriteback(objecter
, &objecter_finisher
,
298 objectcacher
.reset(new ObjectCacher(cct
, "libcephfs", *writeback_handler
, client_lock
,
299 client_flush_set_callback
, // all commit callback
301 cct
->_conf
->client_oc_size
,
302 cct
->_conf
->client_oc_max_objects
,
303 cct
->_conf
->client_oc_max_dirty
,
304 cct
->_conf
->client_oc_target_dirty
,
305 cct
->_conf
->client_oc_max_dirty_age
,
312 ceph_assert(ceph_mutex_is_not_locked(client_lock
));
314 // It is necessary to hold client_lock, because any inode destruction
315 // may call into ObjectCacher, which asserts that it's lock (which is
316 // client_lock) is held.
317 std::lock_guard l
{client_lock
};
321 void Client::tear_down_cache()
324 for (ceph::unordered_map
<int, Fh
*>::iterator it
= fd_map
.begin();
328 ldout(cct
, 1) << __func__
<< " forcing close of fh " << it
->first
<< " ino " << fh
->inode
->ino
<< dendl
;
333 while (!opened_dirs
.empty()) {
334 dir_result_t
*dirp
= *opened_dirs
.begin();
335 ldout(cct
, 1) << __func__
<< " forcing close of dir " << dirp
<< " ino " << dirp
->inode
->ino
<< dendl
;
344 ceph_assert(lru
.lru_get_size() == 0);
347 ceph_assert(inode_map
.size() <= 1 + root_parents
.size());
348 if (root
&& inode_map
.size() == 1 + root_parents
.size()) {
352 while (!root_parents
.empty())
353 root_parents
.erase(root_parents
.begin());
358 ceph_assert(inode_map
.empty());
// Return the root inode's number under client_lock; when faked inos are
// in use, the faked id is returned so callers see a consistent namespace.
361 inodeno_t
Client::get_root_ino()
363 std::lock_guard
l(client_lock
);
364 if (use_faked_inos())
365 return root
->faked_ino
;
370 Inode
*Client::get_root()
372 std::lock_guard
l(client_lock
);
380 void Client::dump_inode(Formatter
*f
, Inode
*in
, set
<Inode
*>& did
, bool disconnected
)
383 in
->make_long_path(path
);
384 ldout(cct
, 1) << "dump_inode: "
385 << (disconnected
? "DISCONNECTED ":"")
386 << "inode " << in
->ino
388 << " ref " << in
->get_num_ref()
392 f
->open_object_section("inode");
393 f
->dump_stream("path") << path
;
395 f
->dump_int("disconnected", 1);
402 ldout(cct
, 1) << " dir " << in
->dir
<< " size " << in
->dir
->dentries
.size() << dendl
;
403 for (ceph::unordered_map
<string
, Dentry
*>::iterator it
= in
->dir
->dentries
.begin();
404 it
!= in
->dir
->dentries
.end();
406 ldout(cct
, 1) << " " << in
->ino
<< " dn " << it
->first
<< " " << it
->second
<< " ref " << it
->second
->ref
<< dendl
;
408 f
->open_object_section("dentry");
412 if (it
->second
->inode
)
413 dump_inode(f
, it
->second
->inode
.get(), did
, false);
418 void Client::dump_cache(Formatter
*f
)
422 ldout(cct
, 1) << __func__
<< dendl
;
425 f
->open_array_section("cache");
428 dump_inode(f
, root
, did
, true);
430 // make a second pass to catch anything disconnected
431 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
432 it
!= inode_map
.end();
434 if (did
.count(it
->second
))
436 dump_inode(f
, it
->second
, did
, true);
443 void Client::dump_status(Formatter
*f
)
445 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
447 ldout(cct
, 1) << __func__
<< dendl
;
449 const epoch_t osd_epoch
450 = objecter
->with_osdmap(std::mem_fn(&OSDMap::get_epoch
));
453 f
->open_object_section("metadata");
454 for (const auto& kv
: metadata
)
455 f
->dump_string(kv
.first
.c_str(), kv
.second
);
458 f
->dump_int("dentry_count", lru
.lru_get_size());
459 f
->dump_int("dentry_pinned_count", lru
.lru_get_num_pinned());
460 f
->dump_int("id", get_nodeid().v
);
461 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr_legacy());
462 f
->dump_object("inst", inst
);
463 f
->dump_object("addr", inst
.addr
);
464 f
->dump_stream("inst_str") << inst
.name
<< " " << inst
.addr
.get_legacy_str();
465 f
->dump_string("addr_str", inst
.addr
.get_legacy_str());
466 f
->dump_int("inode_count", inode_map
.size());
467 f
->dump_int("mds_epoch", mdsmap
->get_epoch());
468 f
->dump_int("osd_epoch", osd_epoch
);
469 f
->dump_int("osd_epoch_barrier", cap_epoch_barrier
);
470 f
->dump_bool("blacklisted", blacklisted
);
// One-time pre-initialization: start the objecter finisher thread,
// create the Filer on top of the Objecter, enable blacklist event
// delivery, and start the object cacher thread.
474 void Client::_pre_init()
478 objecter_finisher
.start();
479 filer
.reset(new Filer(objecter
, &objecter_finisher
));
// So we get notified if this client gets blacklisted by the OSDs.
480 objecter
->enable_blacklist_events();
482 objectcacher
->start();
489 std::lock_guard l
{client_lock
};
490 ceph_assert(!initialized
);
491 messenger
->add_dispatcher_tail(this);
497 void Client::_finish_init()
500 std::lock_guard l
{client_lock
};
502 PerfCountersBuilder
plb(cct
, "client", l_c_first
, l_c_last
);
503 plb
.add_time_avg(l_c_reply
, "reply", "Latency of receiving a reply on metadata request");
504 plb
.add_time_avg(l_c_lat
, "lat", "Latency of processing a metadata request");
505 plb
.add_time_avg(l_c_wrlat
, "wrlat", "Latency of a file data write operation");
506 plb
.add_time_avg(l_c_read
, "rdlat", "Latency of a file data read operation");
507 plb
.add_time_avg(l_c_fsync
, "fsync", "Latency of a file sync operation");
508 logger
.reset(plb
.create_perf_counters());
509 cct
->get_perfcounters_collection()->add(logger
.get());
512 cct
->_conf
.add_observer(this);
514 AdminSocket
* admin_socket
= cct
->get_admin_socket();
515 int ret
= admin_socket
->register_command("mds_requests",
517 "show in-progress mds requests");
519 lderr(cct
) << "error registering admin socket command: "
520 << cpp_strerror(-ret
) << dendl
;
522 ret
= admin_socket
->register_command("mds_sessions",
524 "show mds session state");
526 lderr(cct
) << "error registering admin socket command: "
527 << cpp_strerror(-ret
) << dendl
;
529 ret
= admin_socket
->register_command("dump_cache",
531 "show in-memory metadata cache contents");
533 lderr(cct
) << "error registering admin socket command: "
534 << cpp_strerror(-ret
) << dendl
;
536 ret
= admin_socket
->register_command("kick_stale_sessions",
538 "kick sessions that were remote reset");
540 lderr(cct
) << "error registering admin socket command: "
541 << cpp_strerror(-ret
) << dendl
;
543 ret
= admin_socket
->register_command("status",
545 "show overall client status");
547 lderr(cct
) << "error registering admin socket command: "
548 << cpp_strerror(-ret
) << dendl
;
551 std::lock_guard l
{client_lock
};
555 void Client::shutdown()
557 ldout(cct
, 1) << __func__
<< dendl
;
559 // If we were not mounted, but were being used for sending
560 // MDS commands, we may have sessions that need closing.
562 std::lock_guard l
{client_lock
};
565 cct
->_conf
.remove_observer(this);
567 cct
->get_admin_socket()->unregister_commands(&m_command_hook
);
569 if (ino_invalidate_cb
) {
570 ldout(cct
, 10) << "shutdown stopping cache invalidator finisher" << dendl
;
571 async_ino_invalidator
.wait_for_empty();
572 async_ino_invalidator
.stop();
575 if (dentry_invalidate_cb
) {
576 ldout(cct
, 10) << "shutdown stopping dentry invalidator finisher" << dendl
;
577 async_dentry_invalidator
.wait_for_empty();
578 async_dentry_invalidator
.stop();
581 if (switch_interrupt_cb
) {
582 ldout(cct
, 10) << "shutdown stopping interrupt finisher" << dendl
;
583 interrupt_finisher
.wait_for_empty();
584 interrupt_finisher
.stop();
588 ldout(cct
, 10) << "shutdown stopping remount finisher" << dendl
;
589 remount_finisher
.wait_for_empty();
590 remount_finisher
.stop();
593 if (ino_release_cb
) {
594 ldout(cct
, 10) << "shutdown stopping inode release finisher" << dendl
;
595 async_ino_releasor
.wait_for_empty();
596 async_ino_releasor
.stop();
599 objectcacher
->stop(); // outside of client_lock! this does a join.
601 std::lock_guard l
{client_lock
};
602 ceph_assert(initialized
);
606 objecter_finisher
.wait_for_empty();
607 objecter_finisher
.stop();
610 cct
->get_perfcounters_collection()->remove(logger
.get());
616 // ===================
617 // metadata cache stuff
619 void Client::trim_cache(bool trim_kernel_dcache
)
621 uint64_t max
= cct
->_conf
->client_cache_size
;
622 ldout(cct
, 20) << "trim_cache size " << lru
.lru_get_size() << " max " << max
<< dendl
;
624 while (lru
.lru_get_size() != last
) {
625 last
= lru
.lru_get_size();
627 if (!unmounting
&& lru
.lru_get_size() <= max
) break;
630 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_get_next_expire());
637 if (trim_kernel_dcache
&& lru
.lru_get_size() > max
)
638 _invalidate_kernel_dcache();
641 if (lru
.lru_get_size() == 0 && root
&& root
->get_num_ref() == 0 && inode_map
.size() == 1 + root_parents
.size()) {
642 ldout(cct
, 15) << "trim_cache trimmed root " << root
<< dendl
;
646 while (!root_parents
.empty())
647 root_parents
.erase(root_parents
.begin());
653 void Client::trim_cache_for_reconnect(MetaSession
*s
)
655 mds_rank_t mds
= s
->mds_num
;
656 ldout(cct
, 20) << __func__
<< " mds." << mds
<< dendl
;
659 list
<Dentry
*> skipped
;
660 while (lru
.lru_get_size() > 0) {
661 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_expire());
665 if ((dn
->inode
&& dn
->inode
->caps
.count(mds
)) ||
666 dn
->dir
->parent_inode
->caps
.count(mds
)) {
670 skipped
.push_back(dn
);
673 for(list
<Dentry
*>::iterator p
= skipped
.begin(); p
!= skipped
.end(); ++p
)
674 lru
.lru_insert_mid(*p
);
676 ldout(cct
, 20) << __func__
<< " mds." << mds
677 << " trimmed " << trimmed
<< " dentries" << dendl
;
679 if (s
->caps
.size() > 0)
680 _invalidate_kernel_dcache();
// Drop a single dentry from the cache: invalidate the parent
// directory's completeness state, then unlink the dentry (dropping
// both the dir ref and the dentry itself).
683 void Client::trim_dentry(Dentry
*dn
)
685 ldout(cct
, 15) << "trim_dentry unlinking dn " << dn
->name
687 << std::hex
<< dn
->dir
->parent_inode
->ino
<< std::dec
// Removing an entry means the parent dir is no longer known-complete.
690 Inode
*diri
= dn
->dir
->parent_inode
;
691 diri
->dir_release_count
++;
692 clear_dir_complete_and_ordered(diri
, true);
694 unlink(dn
, false, false); // drop dir, drop dentry
698 void Client::update_inode_file_size(Inode
*in
, int issued
, uint64_t size
,
699 uint64_t truncate_seq
, uint64_t truncate_size
)
701 uint64_t prior_size
= in
->size
;
703 if (truncate_seq
> in
->truncate_seq
||
704 (truncate_seq
== in
->truncate_seq
&& size
> in
->size
)) {
705 ldout(cct
, 10) << "size " << in
->size
<< " -> " << size
<< dendl
;
707 in
->reported_size
= size
;
708 if (truncate_seq
!= in
->truncate_seq
) {
709 ldout(cct
, 10) << "truncate_seq " << in
->truncate_seq
<< " -> "
710 << truncate_seq
<< dendl
;
711 in
->truncate_seq
= truncate_seq
;
712 in
->oset
.truncate_seq
= truncate_seq
;
714 // truncate cached file data
715 if (prior_size
> size
) {
716 _invalidate_inode_cache(in
, truncate_size
, prior_size
- truncate_size
);
720 // truncate inline data
721 if (in
->inline_version
< CEPH_INLINE_NONE
) {
722 uint32_t len
= in
->inline_data
.length();
724 in
->inline_data
.splice(size
, len
- size
);
727 if (truncate_seq
>= in
->truncate_seq
&&
728 in
->truncate_size
!= truncate_size
) {
730 ldout(cct
, 10) << "truncate_size " << in
->truncate_size
<< " -> "
731 << truncate_size
<< dendl
;
732 in
->truncate_size
= truncate_size
;
733 in
->oset
.truncate_size
= truncate_size
;
735 ldout(cct
, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl
;
740 void Client::update_inode_file_time(Inode
*in
, int issued
, uint64_t time_warp_seq
,
741 utime_t ctime
, utime_t mtime
, utime_t atime
)
743 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << ccap_string(issued
)
744 << " ctime " << ctime
<< " mtime " << mtime
<< dendl
;
746 if (time_warp_seq
> in
->time_warp_seq
)
747 ldout(cct
, 10) << " mds time_warp_seq " << time_warp_seq
748 << " is higher than local time_warp_seq "
749 << in
->time_warp_seq
<< dendl
;
752 // be careful with size, mtime, atime
753 if (issued
& (CEPH_CAP_FILE_EXCL
|
755 CEPH_CAP_FILE_BUFFER
|
757 CEPH_CAP_XATTR_EXCL
)) {
758 ldout(cct
, 30) << "Yay have enough caps to look at our times" << dendl
;
759 if (ctime
> in
->ctime
)
761 if (time_warp_seq
> in
->time_warp_seq
) {
762 //the mds updated times, so take those!
765 in
->time_warp_seq
= time_warp_seq
;
766 } else if (time_warp_seq
== in
->time_warp_seq
) {
768 if (mtime
> in
->mtime
)
770 if (atime
> in
->atime
)
772 } else if (issued
& CEPH_CAP_FILE_EXCL
) {
773 //ignore mds values as we have a higher seq
776 ldout(cct
, 30) << "Don't have enough caps, just taking mds' time values" << dendl
;
777 if (time_warp_seq
>= in
->time_warp_seq
) {
781 in
->time_warp_seq
= time_warp_seq
;
785 ldout(cct
, 0) << "WARNING: " << *in
<< " mds time_warp_seq "
786 << time_warp_seq
<< " is lower than local time_warp_seq "
// Prune fragmap entries whose frag is no longer a leaf of the inode's
// dirfragtree (stale after a fragtree change). Uses the classic
// erase(p++) idiom so iteration survives map mutation.
792 void Client::_fragmap_remove_non_leaves(Inode
*in
)
794 for (map
<frag_t
,int>::iterator p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
795 if (!in
->dirfragtree
.is_leaf(p
->first
))
796 in
->fragmap
.erase(p
++);
// Remove every fragmap entry that points at the given (stopped) MDS
// rank, so future lookups are re-routed. Same erase(p++) idiom as
// _fragmap_remove_non_leaves.
801 void Client::_fragmap_remove_stopped_mds(Inode
*in
, mds_rank_t mds
)
803 for (auto p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
804 if (p
->second
== mds
)
805 in
->fragmap
.erase(p
++);
810 Inode
* Client::add_update_inode(InodeStat
*st
, utime_t from
,
811 MetaSession
*session
,
812 const UserPerm
& request_perms
)
815 bool was_new
= false;
816 if (inode_map
.count(st
->vino
)) {
817 in
= inode_map
[st
->vino
];
818 ldout(cct
, 12) << __func__
<< " had " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
820 in
= new Inode(this, st
->vino
, &st
->layout
);
821 inode_map
[st
->vino
] = in
;
823 if (use_faked_inos())
824 _assign_faked_ino(in
);
828 if (use_faked_inos())
829 _assign_faked_root(root
);
832 } else if (!mounted
) {
833 root_parents
[root_ancestor
] = in
;
838 in
->ino
= st
->vino
.ino
;
839 in
->snapid
= st
->vino
.snapid
;
840 in
->mode
= st
->mode
& S_IFMT
;
845 if (in
->is_symlink())
846 in
->symlink
= st
->symlink
;
848 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
849 bool new_version
= false;
850 if (in
->version
== 0 ||
851 ((st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
) &&
852 (in
->version
& ~1) < st
->version
))
856 in
->caps_issued(&issued
);
857 issued
|= in
->caps_dirty();
858 int new_issued
= ~issued
& (int)st
->cap
.caps
;
860 if ((new_version
|| (new_issued
& CEPH_CAP_AUTH_SHARED
)) &&
861 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
865 in
->btime
= st
->btime
;
866 in
->snap_btime
= st
->snap_btime
;
869 if ((new_version
|| (new_issued
& CEPH_CAP_LINK_SHARED
)) &&
870 !(issued
& CEPH_CAP_LINK_EXCL
)) {
871 in
->nlink
= st
->nlink
;
874 if (new_version
|| (new_issued
& CEPH_CAP_ANY_RD
)) {
875 update_inode_file_time(in
, issued
, st
->time_warp_seq
,
876 st
->ctime
, st
->mtime
, st
->atime
);
880 (new_issued
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
))) {
881 in
->layout
= st
->layout
;
882 update_inode_file_size(in
, issued
, st
->size
, st
->truncate_seq
, st
->truncate_size
);
886 if (new_version
|| (new_issued
& CEPH_CAP_FILE_SHARED
)) {
887 in
->dirstat
= st
->dirstat
;
889 // dir_layout/rstat/quota are not tracked by capability, update them only if
890 // the inode stat is from auth mds
891 if (new_version
|| (st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
)) {
892 in
->dir_layout
= st
->dir_layout
;
893 ldout(cct
, 20) << " dir hash is " << (int)in
->dir_layout
.dl_dir_hash
<< dendl
;
894 in
->rstat
= st
->rstat
;
895 in
->quota
= st
->quota
;
896 in
->dir_pin
= st
->dir_pin
;
898 // move me if/when version reflects fragtree changes.
899 if (in
->dirfragtree
!= st
->dirfragtree
) {
900 in
->dirfragtree
= st
->dirfragtree
;
901 _fragmap_remove_non_leaves(in
);
905 if ((in
->xattr_version
== 0 || !(issued
& CEPH_CAP_XATTR_EXCL
)) &&
906 st
->xattrbl
.length() &&
907 st
->xattr_version
> in
->xattr_version
) {
908 auto p
= st
->xattrbl
.cbegin();
909 decode(in
->xattrs
, p
);
910 in
->xattr_version
= st
->xattr_version
;
913 if (st
->inline_version
> in
->inline_version
) {
914 in
->inline_data
= st
->inline_data
;
915 in
->inline_version
= st
->inline_version
;
918 /* always take a newer change attr */
919 if (st
->change_attr
> in
->change_attr
)
920 in
->change_attr
= st
->change_attr
;
922 if (st
->version
> in
->version
)
923 in
->version
= st
->version
;
926 ldout(cct
, 12) << __func__
<< " adding " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
929 return in
; // as with readdir returning indoes in different snaprealms (no caps!)
931 if (in
->snapid
== CEPH_NOSNAP
) {
932 add_update_cap(in
, session
, st
->cap
.cap_id
, st
->cap
.caps
, st
->cap
.wanted
,
933 st
->cap
.seq
, st
->cap
.mseq
, inodeno_t(st
->cap
.realm
),
934 st
->cap
.flags
, request_perms
);
935 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
936 in
->max_size
= st
->max_size
;
937 in
->rstat
= st
->rstat
;
940 // setting I_COMPLETE needs to happen after adding the cap
942 (st
->cap
.caps
& CEPH_CAP_FILE_SHARED
) &&
943 (issued
& CEPH_CAP_FILE_EXCL
) == 0 &&
944 in
->dirstat
.nfiles
== 0 &&
945 in
->dirstat
.nsubdirs
== 0) {
946 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in
<< dendl
;
947 in
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
949 ldout(cct
, 10) << " dir is open on empty dir " << in
->ino
<< " with "
950 << in
->dir
->dentries
.size() << " entries, marking all dentries null" << dendl
;
951 in
->dir
->readdir_cache
.clear();
952 for (const auto& p
: in
->dir
->dentries
) {
953 unlink(p
.second
, true, true); // keep dir, keep dentry
955 if (in
->dir
->dentries
.empty())
960 in
->snap_caps
|= st
->cap
.caps
;
968 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
970 Dentry
*Client::insert_dentry_inode(Dir
*dir
, const string
& dname
, LeaseStat
*dlease
,
971 Inode
*in
, utime_t from
, MetaSession
*session
,
975 if (dir
->dentries
.count(dname
))
976 dn
= dir
->dentries
[dname
];
978 ldout(cct
, 12) << __func__
<< " '" << dname
<< "' vino " << in
->vino()
979 << " in dir " << dir
->parent_inode
->vino() << " dn " << dn
982 if (dn
&& dn
->inode
) {
983 if (dn
->inode
->vino() == in
->vino()) {
985 ldout(cct
, 12) << " had dentry " << dname
986 << " with correct vino " << dn
->inode
->vino()
989 ldout(cct
, 12) << " had dentry " << dname
990 << " with WRONG vino " << dn
->inode
->vino()
992 unlink(dn
, true, true); // keep dir, keep dentry
996 if (!dn
|| !dn
->inode
) {
997 InodeRef
tmp_ref(in
);
999 if (old_dentry
->dir
!= dir
) {
1000 Inode
*old_diri
= old_dentry
->dir
->parent_inode
;
1001 old_diri
->dir_ordered_count
++;
1002 clear_dir_complete_and_ordered(old_diri
, false);
1004 unlink(old_dentry
, dir
== old_dentry
->dir
, false); // drop dentry, keep dir open if its the same dir
1006 Inode
*diri
= dir
->parent_inode
;
1007 diri
->dir_ordered_count
++;
1008 clear_dir_complete_and_ordered(diri
, false);
1009 dn
= link(dir
, dname
, in
, dn
);
1012 update_dentry_lease(dn
, dlease
, from
, session
);
// Apply a dentry lease from an MDS reply: if the lease is valid and
// extends the current ttl, record the new expiry plus the issuing MDS,
// lease seq and cap generation on the dentry; also refresh the
// dentry's cached shared_gen from its parent directory.
1016 void Client::update_dentry_lease(Dentry
*dn
, LeaseStat
*dlease
, utime_t from
, MetaSession
*session
)
// Lease expiry = request send time + duration (ms -> seconds).
1018 utime_t dttl
= from
;
1019 dttl
+= (float)dlease
->duration_ms
/ 1000.0;
1023 if (dlease
->mask
& CEPH_LEASE_VALID
) {
// Only ever extend the lease, never shorten it.
1024 if (dttl
> dn
->lease_ttl
) {
1025 ldout(cct
, 10) << "got dentry lease on " << dn
->name
1026 << " dur " << dlease
->duration_ms
<< "ms ttl " << dttl
<< dendl
;
1027 dn
->lease_ttl
= dttl
;
1028 dn
->lease_mds
= session
->mds_num
;
1029 dn
->lease_seq
= dlease
->seq
;
// Tie validity to the session's cap generation: a session reset
// implicitly invalidates the lease.
1030 dn
->lease_gen
= session
->cap_gen
;
1033 dn
->cap_shared_gen
= dn
->dir
->parent_inode
->shared_gen
;
1038 * update MDS location cache for a single inode
1040 void Client::update_dir_dist(Inode
*in
, DirStat
*dst
)
1043 ldout(cct
, 20) << "got dirfrag map for " << in
->ino
<< " frag " << dst
->frag
<< " to mds " << dst
->auth
<< dendl
;
// auth >= 0: record which MDS rank is authoritative for this frag;
// otherwise forget any stale mapping.
1044 if (dst
->auth
>= 0) {
1045 in
->fragmap
[dst
->frag
] = dst
->auth
;
1047 in
->fragmap
.erase(dst
->frag
);
// The MDS reported this frag, so it must be a leaf: force the local
// dirfragtree to agree and prune fragmap entries made stale by that.
1049 if (!in
->dirfragtree
.is_leaf(dst
->frag
)) {
1050 in
->dirfragtree
.force_to_leaf(cct
, dst
->frag
);
1051 _fragmap_remove_non_leaves(in
);
1055 in
->dir_replicated
= !dst
->dist
.empty(); // FIXME that's just one frag!
// Invalidate cached directory-listing state on `diri`.
// complete == true clears both I_COMPLETE and I_DIR_ORDERED (contents
// changed); complete == false clears only I_DIR_ORDERED (ordering
// changed). The readdir cache is dropped alongside the flags.
1058 void Client::clear_dir_complete_and_ordered(Inode
*diri
, bool complete
)
1060 if (diri
->flags
& I_COMPLETE
) {
1062 ldout(cct
, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
1063 diri
->flags
&= ~(I_COMPLETE
| I_DIR_ORDERED
);
1065 if (diri
->flags
& I_DIR_ORDERED
) {
1066 ldout(cct
, 10) << " clearing I_DIR_ORDERED on " << *diri
<< dendl
;
1067 diri
->flags
&= ~I_DIR_ORDERED
;
// Cached readdir results are no longer trustworthy either.
1071 diri
->dir
->readdir_cache
.clear();
1076 * insert results from readdir or lssnap into the metadata cache.
1078 void Client::insert_readdir_results(MetaRequest
*request
, MetaSession
*session
, Inode
*diri
) {
1080 auto& reply
= request
->reply
;
1081 ConnectionRef con
= request
->reply
->get_connection();
1083 if(session
->mds_features
.test(CEPHFS_FEATURE_REPLY_ENCODING
)) {
1084 features
= (uint64_t)-1;
1087 features
= con
->get_features();
1090 dir_result_t
*dirp
= request
->dirp
;
1093 // the extra buffer list is only set for readdir and lssnap replies
1094 auto p
= reply
->get_extra_bl().cbegin();
1097 if (request
->head
.op
== CEPH_MDS_OP_LSSNAP
) {
1099 diri
= open_snapdir(diri
);
1102 // only open dir if we're actually adding stuff to it!
1103 Dir
*dir
= diri
->open_dir();
1107 DirStat
dst(p
, features
);
1113 bool end
= ((unsigned)flags
& CEPH_READDIR_FRAG_END
);
1114 bool hash_order
= ((unsigned)flags
& CEPH_READDIR_HASH_ORDER
);
1116 frag_t fg
= (unsigned)request
->head
.args
.readdir
.frag
;
1117 unsigned readdir_offset
= dirp
->next_offset
;
1118 string readdir_start
= dirp
->last_name
;
1119 ceph_assert(!readdir_start
.empty() || readdir_offset
== 2);
1121 unsigned last_hash
= 0;
1123 if (!readdir_start
.empty()) {
1124 last_hash
= ceph_frag_value(diri
->hash_dentry_name(readdir_start
));
1125 } else if (flags
& CEPH_READDIR_OFFSET_HASH
) {
1126 /* mds understands offset_hash */
1127 last_hash
= (unsigned)request
->head
.args
.readdir
.offset_hash
;
1131 if (fg
!= dst
.frag
) {
1132 ldout(cct
, 10) << "insert_trace got new frag " << fg
<< " -> " << dst
.frag
<< dendl
;
1136 readdir_start
.clear();
1137 dirp
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
, false);
1141 ldout(cct
, 10) << __func__
<< " " << numdn
<< " readdir items, end=" << end
1142 << ", hash_order=" << hash_order
1143 << ", readdir_start " << readdir_start
1144 << ", last_hash " << last_hash
1145 << ", next_offset " << readdir_offset
<< dendl
;
1147 if (diri
->snapid
!= CEPH_SNAPDIR
&&
1148 fg
.is_leftmost() && readdir_offset
== 2 &&
1149 !(hash_order
&& last_hash
)) {
1150 dirp
->release_count
= diri
->dir_release_count
;
1151 dirp
->ordered_count
= diri
->dir_ordered_count
;
1152 dirp
->start_shared_gen
= diri
->shared_gen
;
1153 dirp
->cache_index
= 0;
1156 dirp
->buffer_frag
= fg
;
1158 _readdir_drop_dirp_buffer(dirp
);
1159 dirp
->buffer
.reserve(numdn
);
1163 for (unsigned i
=0; i
<numdn
; i
++) {
1165 dlease
.decode(p
, features
);
1166 InodeStat
ist(p
, features
);
1168 ldout(cct
, 15) << "" << i
<< ": '" << dname
<< "'" << dendl
;
1170 Inode
*in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1173 if (diri
->dir
->dentries
.count(dname
)) {
1174 Dentry
*olddn
= diri
->dir
->dentries
[dname
];
1175 if (olddn
->inode
!= in
) {
1176 // replace incorrect dentry
1177 unlink(olddn
, true, true); // keep dir, dentry
1178 dn
= link(dir
, dname
, in
, olddn
);
1179 ceph_assert(dn
== olddn
);
1187 dn
= link(dir
, dname
, in
, NULL
);
1190 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
1192 unsigned hash
= ceph_frag_value(diri
->hash_dentry_name(dname
));
1193 if (hash
!= last_hash
)
1196 dn
->offset
= dir_result_t::make_fpos(hash
, readdir_offset
++, true);
1198 dn
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
++, false);
1200 // add to readdir cache
1201 if (dirp
->release_count
== diri
->dir_release_count
&&
1202 dirp
->ordered_count
== diri
->dir_ordered_count
&&
1203 dirp
->start_shared_gen
== diri
->shared_gen
) {
1204 if (dirp
->cache_index
== dir
->readdir_cache
.size()) {
1206 ceph_assert(!dirp
->inode
->is_complete_and_ordered());
1207 dir
->readdir_cache
.reserve(dirp
->cache_index
+ numdn
);
1209 dir
->readdir_cache
.push_back(dn
);
1210 } else if (dirp
->cache_index
< dir
->readdir_cache
.size()) {
1211 if (dirp
->inode
->is_complete_and_ordered())
1212 ceph_assert(dir
->readdir_cache
[dirp
->cache_index
] == dn
);
1214 dir
->readdir_cache
[dirp
->cache_index
] = dn
;
1216 ceph_abort_msg("unexpected readdir buffer idx");
1218 dirp
->cache_index
++;
1220 // add to cached result list
1221 dirp
->buffer
.push_back(dir_result_t::dentry(dn
->offset
, dname
, in
));
1222 ldout(cct
, 15) << __func__
<< " " << hex
<< dn
->offset
<< dec
<< ": '" << dname
<< "' -> " << in
->ino
<< dendl
;
1226 dirp
->last_name
= dname
;
1228 dirp
->next_offset
= 2;
1230 dirp
->next_offset
= readdir_offset
;
1232 if (dir
->is_empty())
1239 * insert a trace from a MDS reply into the cache.
1241 Inode
* Client::insert_trace(MetaRequest
*request
, MetaSession
*session
)
1243 auto& reply
= request
->reply
;
1244 int op
= request
->get_op();
1246 ldout(cct
, 10) << "insert_trace from " << request
->sent_stamp
<< " mds." << session
->mds_num
1247 << " is_target=" << (int)reply
->head
.is_target
1248 << " is_dentry=" << (int)reply
->head
.is_dentry
1251 auto p
= reply
->get_trace_bl().cbegin();
1252 if (request
->got_unsafe
) {
1253 ldout(cct
, 10) << "insert_trace -- already got unsafe; ignoring" << dendl
;
1254 ceph_assert(p
.end());
1259 ldout(cct
, 10) << "insert_trace -- no trace" << dendl
;
1261 Dentry
*d
= request
->dentry();
1263 Inode
*diri
= d
->dir
->parent_inode
;
1264 diri
->dir_release_count
++;
1265 clear_dir_complete_and_ordered(diri
, true);
1268 if (d
&& reply
->get_result() == 0) {
1269 if (op
== CEPH_MDS_OP_RENAME
) {
1271 Dentry
*od
= request
->old_dentry();
1272 ldout(cct
, 10) << " unlinking rename src dn " << od
<< " for traceless reply" << dendl
;
1274 unlink(od
, true, true); // keep dir, dentry
1275 } else if (op
== CEPH_MDS_OP_RMDIR
||
1276 op
== CEPH_MDS_OP_UNLINK
) {
1278 ldout(cct
, 10) << " unlinking unlink/rmdir dn " << d
<< " for traceless reply" << dendl
;
1279 unlink(d
, true, true); // keep dir, dentry
1285 ConnectionRef con
= request
->reply
->get_connection();
1287 if (session
->mds_features
.test(CEPHFS_FEATURE_REPLY_ENCODING
)) {
1288 features
= (uint64_t)-1;
1291 features
= con
->get_features();
1293 ldout(cct
, 10) << " features 0x" << hex
<< features
<< dec
<< dendl
;
1296 SnapRealm
*realm
= NULL
;
1297 if (reply
->snapbl
.length())
1298 update_snap_trace(reply
->snapbl
, &realm
);
1300 ldout(cct
, 10) << " hrm "
1301 << " is_target=" << (int)reply
->head
.is_target
1302 << " is_dentry=" << (int)reply
->head
.is_dentry
1311 if (reply
->head
.is_dentry
) {
1312 dirst
.decode(p
, features
);
1313 dst
.decode(p
, features
);
1315 dlease
.decode(p
, features
);
1319 if (reply
->head
.is_target
) {
1320 ist
.decode(p
, features
);
1321 if (cct
->_conf
->client_debug_getattr_caps
) {
1322 unsigned wanted
= 0;
1323 if (op
== CEPH_MDS_OP_GETATTR
|| op
== CEPH_MDS_OP_LOOKUP
)
1324 wanted
= request
->head
.args
.getattr
.mask
;
1325 else if (op
== CEPH_MDS_OP_OPEN
|| op
== CEPH_MDS_OP_CREATE
)
1326 wanted
= request
->head
.args
.open
.mask
;
1328 if ((wanted
& CEPH_CAP_XATTR_SHARED
) &&
1329 !(ist
.xattr_version
> 0 && ist
.xattrbl
.length() > 0))
1330 ceph_abort_msg("MDS reply does not contain xattrs");
1333 in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1338 if (reply
->head
.is_dentry
) {
1339 diri
= add_update_inode(&dirst
, request
->sent_stamp
, session
,
1341 update_dir_dist(diri
, &dst
); // dir stat info is attached to ..
1344 Dir
*dir
= diri
->open_dir();
1345 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
,
1346 (op
== CEPH_MDS_OP_RENAME
) ? request
->old_dentry() : NULL
);
1349 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1350 dn
= diri
->dir
->dentries
[dname
];
1352 diri
->dir_ordered_count
++;
1353 clear_dir_complete_and_ordered(diri
, false);
1354 unlink(dn
, true, true); // keep dir, dentry
1357 if (dlease
.duration_ms
> 0) {
1359 Dir
*dir
= diri
->open_dir();
1360 dn
= link(dir
, dname
, NULL
, NULL
);
1362 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
1365 } else if (op
== CEPH_MDS_OP_LOOKUPSNAP
||
1366 op
== CEPH_MDS_OP_MKSNAP
) {
1367 ldout(cct
, 10) << " faking snap lookup weirdness" << dendl
;
1368 // fake it for snap lookup
1369 vinodeno_t vino
= ist
.vino
;
1370 vino
.snapid
= CEPH_SNAPDIR
;
1371 ceph_assert(inode_map
.count(vino
));
1372 diri
= inode_map
[vino
];
1374 string dname
= request
->path
.last_dentry();
1377 dlease
.duration_ms
= 0;
1380 Dir
*dir
= diri
->open_dir();
1381 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
);
1383 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1384 Dentry
*dn
= diri
->dir
->dentries
[dname
];
1386 unlink(dn
, true, true); // keep dir, dentry
1392 if (op
== CEPH_MDS_OP_READDIR
||
1393 op
== CEPH_MDS_OP_LSSNAP
) {
1394 insert_readdir_results(request
, session
, in
);
1395 } else if (op
== CEPH_MDS_OP_LOOKUPNAME
) {
1396 // hack: return parent inode instead
1400 if (request
->dentry() == NULL
&& in
!= request
->inode()) {
1401 // pin the target inode if its parent dentry is not pinned
1402 request
->set_other_inode(in
);
1407 put_snap_realm(realm
);
1409 request
->target
= in
;
1415 mds_rank_t
Client::choose_target_mds(MetaRequest
*req
, Inode
** phash_diri
)
1417 mds_rank_t mds
= MDS_RANK_NONE
;
1419 bool is_hash
= false;
1424 if (req
->resend_mds
>= 0) {
1425 mds
= req
->resend_mds
;
1426 req
->resend_mds
= -1;
1427 ldout(cct
, 10) << __func__
<< " resend_mds specified as mds." << mds
<< dendl
;
1431 if (cct
->_conf
->client_use_random_mds
)
1437 ldout(cct
, 20) << __func__
<< " starting with req->inode " << *in
<< dendl
;
1438 if (req
->path
.depth()) {
1439 hash
= in
->hash_dentry_name(req
->path
[0]);
1440 ldout(cct
, 20) << __func__
<< " inode dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1441 << " on " << req
->path
[0]
1442 << " => " << hash
<< dendl
;
1447 in
= de
->inode
.get();
1448 ldout(cct
, 20) << __func__
<< " starting with req->dentry inode " << *in
<< dendl
;
1450 in
= de
->dir
->parent_inode
;
1451 hash
= in
->hash_dentry_name(de
->name
);
1452 ldout(cct
, 20) << __func__
<< " dentry dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1453 << " on " << de
->name
1454 << " => " << hash
<< dendl
;
1459 if (in
->snapid
!= CEPH_NOSNAP
) {
1460 ldout(cct
, 10) << __func__
<< " " << *in
<< " is snapped, using nonsnap parent" << dendl
;
1461 while (in
->snapid
!= CEPH_NOSNAP
) {
1462 if (in
->snapid
== CEPH_SNAPDIR
)
1463 in
= in
->snapdir_parent
.get();
1464 else if (!in
->dentries
.empty())
1465 /* In most cases there will only be one dentry, so getting it
1466 * will be the correct action. If there are multiple hard links,
1467 * I think the MDS should be able to redirect as needed*/
1468 in
= in
->get_first_parent()->dir
->parent_inode
;
1470 ldout(cct
, 10) << "got unlinked inode, can't look at parent" << dendl
;
1477 ldout(cct
, 20) << __func__
<< " " << *in
<< " is_hash=" << is_hash
1478 << " hash=" << hash
<< dendl
;
1480 if (is_hash
&& S_ISDIR(in
->mode
) && !in
->fragmap
.empty()) {
1481 frag_t fg
= in
->dirfragtree
[hash
];
1482 if (in
->fragmap
.count(fg
)) {
1483 mds
= in
->fragmap
[fg
];
1486 } else if (in
->auth_cap
) {
1487 mds
= in
->auth_cap
->session
->mds_num
;
1490 ldout(cct
, 10) << __func__
<< " from dirfragtree hash" << dendl
;
1495 if (in
->auth_cap
&& req
->auth_is_best()) {
1496 mds
= in
->auth_cap
->session
->mds_num
;
1497 } else if (!in
->caps
.empty()) {
1498 mds
= in
->caps
.begin()->second
.session
->mds_num
;
1502 ldout(cct
, 10) << __func__
<< " from caps on inode " << *in
<< dendl
;
1509 mds
= _get_random_up_mds();
1510 ldout(cct
, 10) << "did not get mds through better means, so chose random mds " << mds
<< dendl
;
1514 ldout(cct
, 20) << "mds is " << mds
<< dendl
;
1519 void Client::connect_mds_targets(mds_rank_t mds
)
1521 ldout(cct
, 10) << __func__
<< " for mds." << mds
<< dendl
;
1522 ceph_assert(mds_sessions
.count(mds
));
1523 const MDSMap::mds_info_t
& info
= mdsmap
->get_mds_info(mds
);
1524 for (set
<mds_rank_t
>::const_iterator q
= info
.export_targets
.begin();
1525 q
!= info
.export_targets
.end();
1527 if (mds_sessions
.count(*q
) == 0 &&
1528 mdsmap
->is_clientreplay_or_active_or_stopping(*q
)) {
1529 ldout(cct
, 10) << "check_mds_sessions opening mds." << mds
1530 << " export target mds." << *q
<< dendl
;
1531 _open_mds_session(*q
);
1536 void Client::dump_mds_sessions(Formatter
*f
)
1538 f
->dump_int("id", get_nodeid().v
);
1539 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr_legacy());
1540 f
->dump_object("inst", inst
);
1541 f
->dump_stream("inst_str") << inst
;
1542 f
->dump_stream("addr_str") << inst
.addr
;
1543 f
->open_array_section("sessions");
1544 for (const auto &p
: mds_sessions
) {
1545 f
->open_object_section("session");
1550 f
->dump_int("mdsmap_epoch", mdsmap
->get_epoch());
1552 void Client::dump_mds_requests(Formatter
*f
)
1554 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
1555 p
!= mds_requests
.end();
1557 f
->open_object_section("request");
1563 int Client::verify_reply_trace(int r
, MetaSession
*session
,
1564 MetaRequest
*request
, const MConstRef
<MClientReply
>& reply
,
1565 InodeRef
*ptarget
, bool *pcreated
,
1566 const UserPerm
& perms
)
1568 // check whether this request actually did the create, and set created flag
1569 bufferlist extra_bl
;
1570 inodeno_t created_ino
;
1571 bool got_created_ino
= false;
1572 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
;
1574 extra_bl
= reply
->get_extra_bl();
1575 if (extra_bl
.length() >= 8) {
1576 if (session
->mds_features
.test(CEPHFS_FEATURE_DELEG_INO
)) {
1577 struct openc_response_t ocres
;
1579 decode(ocres
, extra_bl
);
1580 created_ino
= ocres
.created_ino
;
1582 * The userland cephfs client doesn't have a way to do an async create
1583 * (yet), so just discard delegated_inos for now. Eventually we should
1584 * store them and use them in create calls, even if they are synchronous,
1585 * if only for testing purposes.
1587 ldout(cct
, 10) << "delegated_inos: " << ocres
.delegated_inos
<< dendl
;
1589 // u64 containing number of created ino
1590 decode(created_ino
, extra_bl
);
1592 ldout(cct
, 10) << "make_request created ino " << created_ino
<< dendl
;
1593 got_created_ino
= true;
1597 *pcreated
= got_created_ino
;
1599 if (request
->target
) {
1600 *ptarget
= request
->target
;
1601 ldout(cct
, 20) << "make_request target is " << *ptarget
->get() << dendl
;
1603 if (got_created_ino
&& (p
= inode_map
.find(vinodeno_t(created_ino
, CEPH_NOSNAP
))) != inode_map
.end()) {
1604 (*ptarget
) = p
->second
;
1605 ldout(cct
, 20) << "make_request created, target is " << *ptarget
->get() << dendl
;
1607 // we got a traceless reply, and need to look up what we just
1608 // created. for now, do this by name. someday, do this by the
1609 // ino... which we know! FIXME.
1611 Dentry
*d
= request
->dentry();
1614 ldout(cct
, 10) << "make_request got traceless reply, looking up #"
1615 << d
->dir
->parent_inode
->ino
<< "/" << d
->name
1616 << " got_ino " << got_created_ino
1617 << " ino " << created_ino
1619 r
= _do_lookup(d
->dir
->parent_inode
, d
->name
, request
->regetattr_mask
,
1622 // if the dentry is not linked, just do our best. see #5021.
1623 ceph_abort_msg("how did this happen? i want logs!");
1626 Inode
*in
= request
->inode();
1627 ldout(cct
, 10) << "make_request got traceless reply, forcing getattr on #"
1628 << in
->ino
<< dendl
;
1629 r
= _getattr(in
, request
->regetattr_mask
, perms
, true);
1633 // verify ino returned in reply and trace_dist are the same
1634 if (got_created_ino
&&
1635 created_ino
.val
!= target
->ino
.val
) {
1636 ldout(cct
, 5) << "create got ino " << created_ino
<< " but then failed on lookup; EINTR?" << dendl
;
1640 ptarget
->swap(target
);
1652 * Blocking helper to make an MDS request.
1654 * If the ptarget flag is set, behavior changes slightly: the caller
1655 * expects to get a pointer to the inode we are creating or operating
1656 * on. As a result, we will follow up any traceless mutation reply
1657 * with a getattr or lookup to transparently handle a traceless reply
1658 * from the MDS (as when the MDS restarts and the client has to replay
1661 * @param request the MetaRequest to execute
1662 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1663 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1664 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1665 * @param use_mds [optional] prefer a specific mds (-1 for default)
1666 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1668 int Client::make_request(MetaRequest
*request
,
1669 const UserPerm
& perms
,
1670 InodeRef
*ptarget
, bool *pcreated
,
1676 // assign a unique tid
1677 ceph_tid_t tid
= ++last_tid
;
1678 request
->set_tid(tid
);
1681 request
->op_stamp
= ceph_clock_now();
1684 mds_requests
[tid
] = request
->get();
1685 if (oldest_tid
== 0 && request
->get_op() != CEPH_MDS_OP_SETFILELOCK
)
1688 request
->set_caller_perms(perms
);
1690 if (cct
->_conf
->client_inject_fixed_oldest_tid
) {
1691 ldout(cct
, 20) << __func__
<< " injecting fixed oldest_client_tid(1)" << dendl
;
1692 request
->set_oldest_client_tid(1);
1694 request
->set_oldest_client_tid(oldest_tid
);
1699 request
->resend_mds
= use_mds
;
1701 MetaSession
*session
= NULL
;
1703 if (request
->aborted())
1707 request
->abort(-EBLACKLISTED
);
1712 ceph::condition_variable caller_cond
;
1713 request
->caller_cond
= &caller_cond
;
1716 Inode
*hash_diri
= NULL
;
1717 mds_rank_t mds
= choose_target_mds(request
, &hash_diri
);
1718 int mds_state
= (mds
== MDS_RANK_NONE
) ? MDSMap::STATE_NULL
: mdsmap
->get_state(mds
);
1719 if (mds_state
!= MDSMap::STATE_ACTIVE
&& mds_state
!= MDSMap::STATE_STOPPING
) {
1720 if (mds_state
== MDSMap::STATE_NULL
&& mds
>= mdsmap
->get_max_mds()) {
1722 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, remove it from fragmap" << dendl
;
1723 _fragmap_remove_stopped_mds(hash_diri
, mds
);
1725 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, trying a random mds" << dendl
;
1726 request
->resend_mds
= _get_random_up_mds();
1729 ldout(cct
, 10) << " target mds." << mds
<< " not active, waiting for new mdsmap" << dendl
;
1730 wait_on_list(waiting_for_mdsmap
);
1736 if (!have_open_session(mds
)) {
1737 session
= _get_or_open_mds_session(mds
);
1740 if (session
->state
== MetaSession::STATE_OPENING
) {
1741 ldout(cct
, 10) << "waiting for session to mds." << mds
<< " to open" << dendl
;
1742 wait_on_context_list(session
->waiting_for_open
);
1743 // Abort requests on REJECT from MDS
1744 if (rejected_by_mds
.count(mds
)) {
1745 request
->abort(-EPERM
);
1751 if (!have_open_session(mds
))
1754 session
= &mds_sessions
.at(mds
);
1758 send_request(request
, session
);
1761 ldout(cct
, 20) << "awaiting reply|forward|kick on " << &caller_cond
<< dendl
;
1762 request
->kick
= false;
1763 std::unique_lock l
{client_lock
, std::adopt_lock
};
1764 caller_cond
.wait(l
, [request
] {
1765 return (request
->reply
|| // reply
1766 request
->resend_mds
>= 0 || // forward
1770 request
->caller_cond
= nullptr;
1772 // did we get a reply?
1777 if (!request
->reply
) {
1778 ceph_assert(request
->aborted());
1779 ceph_assert(!request
->got_unsafe
);
1780 r
= request
->get_abort_code();
1781 request
->item
.remove_myself();
1782 unregister_request(request
);
1783 put_request(request
);
1788 auto reply
= std::move(request
->reply
);
1789 r
= reply
->get_result();
1791 request
->success
= true;
1793 // kick dispatcher (we've got it!)
1794 ceph_assert(request
->dispatch_cond
);
1795 request
->dispatch_cond
->notify_all();
1796 ldout(cct
, 20) << "sendrecv kickback on tid " << tid
<< " " << request
->dispatch_cond
<< dendl
;
1797 request
->dispatch_cond
= 0;
1799 if (r
>= 0 && ptarget
)
1800 r
= verify_reply_trace(r
, session
, request
, reply
, ptarget
, pcreated
, perms
);
1803 *pdirbl
= reply
->get_extra_bl();
1806 utime_t lat
= ceph_clock_now();
1807 lat
-= request
->sent_stamp
;
1808 ldout(cct
, 20) << "lat " << lat
<< dendl
;
1809 logger
->tinc(l_c_lat
, lat
);
1810 logger
->tinc(l_c_reply
, lat
);
1812 put_request(request
);
1816 void Client::unregister_request(MetaRequest
*req
)
1818 mds_requests
.erase(req
->tid
);
1819 if (req
->tid
== oldest_tid
) {
1820 map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.upper_bound(oldest_tid
);
1822 if (p
== mds_requests
.end()) {
1826 if (p
->second
->get_op() != CEPH_MDS_OP_SETFILELOCK
) {
1827 oldest_tid
= p
->first
;
1836 void Client::put_request(MetaRequest
*request
)
1838 if (request
->_put()) {
1840 if (request
->success
)
1841 op
= request
->get_op();
1843 request
->take_other_inode(&other_in
);
1847 (op
== CEPH_MDS_OP_RMDIR
||
1848 op
== CEPH_MDS_OP_RENAME
||
1849 op
== CEPH_MDS_OP_RMSNAP
)) {
1850 _try_to_trim_inode(other_in
.get(), false);
1855 int Client::encode_inode_release(Inode
*in
, MetaRequest
*req
,
1856 mds_rank_t mds
, int drop
,
1857 int unless
, int force
)
1859 ldout(cct
, 20) << __func__
<< " enter(in:" << *in
<< ", req:" << req
1860 << " mds:" << mds
<< ", drop:" << drop
<< ", unless:" << unless
1861 << ", force:" << force
<< ")" << dendl
;
1863 auto it
= in
->caps
.find(mds
);
1864 if (it
!= in
->caps
.end()) {
1865 Cap
&cap
= it
->second
;
1866 drop
&= ~(in
->dirty_caps
| get_caps_used(in
));
1867 if ((drop
& cap
.issued
) &&
1868 !(unless
& cap
.issued
)) {
1869 ldout(cct
, 25) << "dropping caps " << ccap_string(drop
) << dendl
;
1870 cap
.issued
&= ~drop
;
1871 cap
.implemented
&= ~drop
;
1877 cap
.wanted
= in
->caps_wanted();
1878 if (&cap
== in
->auth_cap
&&
1879 !(cap
.wanted
& CEPH_CAP_ANY_FILE_WR
)) {
1880 in
->requested_max_size
= 0;
1881 ldout(cct
, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl
;
1883 ceph_mds_request_release rel
;
1885 rel
.cap_id
= cap
.cap_id
;
1887 rel
.issue_seq
= cap
.issue_seq
;
1888 rel
.mseq
= cap
.mseq
;
1889 rel
.caps
= cap
.implemented
;
1890 rel
.wanted
= cap
.wanted
;
1893 req
->cap_releases
.push_back(MClientRequest::Release(rel
,""));
1896 ldout(cct
, 25) << __func__
<< " exit(in:" << *in
<< ") released:"
1897 << released
<< dendl
;
1901 void Client::encode_dentry_release(Dentry
*dn
, MetaRequest
*req
,
1902 mds_rank_t mds
, int drop
, int unless
)
1904 ldout(cct
, 20) << __func__
<< " enter(dn:"
1905 << dn
<< ")" << dendl
;
1908 released
= encode_inode_release(dn
->dir
->parent_inode
, req
,
1909 mds
, drop
, unless
, 1);
1910 if (released
&& dn
->lease_mds
== mds
) {
1911 ldout(cct
, 25) << "preemptively releasing dn to mds" << dendl
;
1912 auto& rel
= req
->cap_releases
.back();
1913 rel
.item
.dname_len
= dn
->name
.length();
1914 rel
.item
.dname_seq
= dn
->lease_seq
;
1915 rel
.dname
= dn
->name
;
1917 ldout(cct
, 25) << __func__
<< " exit(dn:"
1918 << dn
<< ")" << dendl
;
1923 * This requires the MClientRequest *request member to be set.
1924 * It will error out horribly without one.
1925 * Additionally, if you set any *drop member, you'd better have
1926 * set the corresponding dentry!
1928 void Client::encode_cap_releases(MetaRequest
*req
, mds_rank_t mds
)
1930 ldout(cct
, 20) << __func__
<< " enter (req: "
1931 << req
<< ", mds: " << mds
<< ")" << dendl
;
1932 if (req
->inode_drop
&& req
->inode())
1933 encode_inode_release(req
->inode(), req
,
1934 mds
, req
->inode_drop
,
1937 if (req
->old_inode_drop
&& req
->old_inode())
1938 encode_inode_release(req
->old_inode(), req
,
1939 mds
, req
->old_inode_drop
,
1940 req
->old_inode_unless
);
1941 if (req
->other_inode_drop
&& req
->other_inode())
1942 encode_inode_release(req
->other_inode(), req
,
1943 mds
, req
->other_inode_drop
,
1944 req
->other_inode_unless
);
1946 if (req
->dentry_drop
&& req
->dentry())
1947 encode_dentry_release(req
->dentry(), req
,
1948 mds
, req
->dentry_drop
,
1949 req
->dentry_unless
);
1951 if (req
->old_dentry_drop
&& req
->old_dentry())
1952 encode_dentry_release(req
->old_dentry(), req
,
1953 mds
, req
->old_dentry_drop
,
1954 req
->old_dentry_unless
);
1955 ldout(cct
, 25) << __func__
<< " exit (req: "
1956 << req
<< ", mds " << mds
<<dendl
;
1959 bool Client::have_open_session(mds_rank_t mds
)
1961 const auto &it
= mds_sessions
.find(mds
);
1962 return it
!= mds_sessions
.end() &&
1963 (it
->second
.state
== MetaSession::STATE_OPEN
||
1964 it
->second
.state
== MetaSession::STATE_STALE
);
1967 MetaSession
*Client::_get_mds_session(mds_rank_t mds
, Connection
*con
)
1969 const auto &it
= mds_sessions
.find(mds
);
1970 if (it
== mds_sessions
.end() || it
->second
.con
!= con
) {
1977 MetaSession
*Client::_get_or_open_mds_session(mds_rank_t mds
)
1979 auto it
= mds_sessions
.find(mds
);
1980 return it
== mds_sessions
.end() ? _open_mds_session(mds
) : &it
->second
;
1984 * Populate a map of strings with client-identifying metadata,
1985 * such as the hostname. Call this once at initialization.
1987 void Client::populate_metadata(const std::string
&mount_root
)
1993 metadata
["hostname"] = u
.nodename
;
1994 ldout(cct
, 20) << __func__
<< " read hostname '" << u
.nodename
<< "'" << dendl
;
1996 ldout(cct
, 1) << __func__
<< " failed to read hostname (" << cpp_strerror(r
) << ")" << dendl
;
1999 metadata
["pid"] = stringify(getpid());
2001 // Ceph entity id (the '0' in "client.0")
2002 metadata
["entity_id"] = cct
->_conf
->name
.get_id();
2004 // Our mount position
2005 if (!mount_root
.empty()) {
2006 metadata
["root"] = mount_root
;
2010 metadata
["ceph_version"] = pretty_version_to_str();
2011 metadata
["ceph_sha1"] = git_version_to_str();
2013 // Apply any metadata from the user's configured overrides
2014 std::vector
<std::string
> tokens
;
2015 get_str_vec(cct
->_conf
->client_metadata
, ",", tokens
);
2016 for (const auto &i
: tokens
) {
2017 auto eqpos
= i
.find("=");
2018 // Throw out anything that isn't of the form "<str>=<str>"
2019 if (eqpos
== 0 || eqpos
== std::string::npos
|| eqpos
== i
.size()) {
2020 lderr(cct
) << "Invalid metadata keyval pair: '" << i
<< "'" << dendl
;
2023 metadata
[i
.substr(0, eqpos
)] = i
.substr(eqpos
+ 1);
2028 * Optionally add or override client metadata fields.
2030 void Client::update_metadata(std::string
const &k
, std::string
const &v
)
2032 std::lock_guard
l(client_lock
);
2033 ceph_assert(initialized
);
2035 auto it
= metadata
.find(k
);
2036 if (it
!= metadata
.end()) {
2037 ldout(cct
, 1) << __func__
<< " warning, overriding metadata field '" << k
2038 << "' from '" << it
->second
<< "' to '" << v
<< "'" << dendl
;
2044 MetaSession
*Client::_open_mds_session(mds_rank_t mds
)
2046 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
2047 auto addrs
= mdsmap
->get_addrs(mds
);
2048 auto em
= mds_sessions
.emplace(std::piecewise_construct
,
2049 std::forward_as_tuple(mds
),
2050 std::forward_as_tuple(mds
, messenger
->connect_to_mds(addrs
), addrs
));
2051 ceph_assert(em
.second
); /* not already present */
2052 MetaSession
*session
= &em
.first
->second
;
2054 // Maybe skip sending a request to open if this MDS daemon
2055 // has previously sent us a REJECT.
2056 if (rejected_by_mds
.count(mds
)) {
2057 if (rejected_by_mds
[mds
] == session
->addrs
) {
2058 ldout(cct
, 4) << __func__
<< " mds." << mds
<< " skipping "
2059 "because we were rejected" << dendl
;
2062 ldout(cct
, 4) << __func__
<< " mds." << mds
<< " old inst "
2063 "rejected us, trying with new inst" << dendl
;
2064 rejected_by_mds
.erase(mds
);
2068 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REQUEST_OPEN
);
2069 m
->metadata
= metadata
;
2070 m
->supported_features
= feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED
);
2071 session
->con
->send_message2(std::move(m
));
2075 void Client::_close_mds_session(MetaSession
*s
)
2077 ldout(cct
, 2) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2078 s
->state
= MetaSession::STATE_CLOSING
;
2079 s
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
2082 void Client::_closed_mds_session(MetaSession
*s
)
2084 ldout(cct
, 5) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2085 s
->state
= MetaSession::STATE_CLOSED
;
2086 s
->con
->mark_down();
2087 signal_context_list(s
->waiting_for_open
);
2088 mount_cond
.notify_all();
2089 remove_session_caps(s
);
2090 kick_requests_closed(s
);
2091 mds_sessions
.erase(s
->mds_num
);
2094 void Client::handle_client_session(const MConstRef
<MClientSession
>& m
)
2096 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2097 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << from
<< dendl
;
2099 MetaSession
*session
= _get_mds_session(from
, m
->get_connection().get());
2101 ldout(cct
, 10) << " discarding session message from sessionless mds " << m
->get_source_inst() << dendl
;
2105 switch (m
->get_op()) {
2106 case CEPH_SESSION_OPEN
:
2108 feature_bitset_t
missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED
);
2109 missing_features
-= m
->supported_features
;
2110 if (!missing_features
.empty()) {
2111 lderr(cct
) << "mds." << from
<< " lacks required features '"
2112 << missing_features
<< "', closing session " << dendl
;
2113 rejected_by_mds
[session
->mds_num
] = session
->addrs
;
2114 _close_mds_session(session
);
2115 _closed_mds_session(session
);
2118 session
->mds_features
= std::move(m
->supported_features
);
2120 renew_caps(session
);
2121 session
->state
= MetaSession::STATE_OPEN
;
2123 mount_cond
.notify_all();
2125 connect_mds_targets(from
);
2126 signal_context_list(session
->waiting_for_open
);
2130 case CEPH_SESSION_CLOSE
:
2131 _closed_mds_session(session
);
2134 case CEPH_SESSION_RENEWCAPS
:
2135 if (session
->cap_renew_seq
== m
->get_seq()) {
2136 bool was_stale
= ceph_clock_now() >= session
->cap_ttl
;
2138 session
->last_cap_renew_request
+ mdsmap
->get_session_timeout();
2140 wake_up_session_caps(session
, false);
2144 case CEPH_SESSION_STALE
:
2145 // invalidate session caps/leases
2147 session
->cap_ttl
= ceph_clock_now();
2148 session
->cap_ttl
-= 1;
2149 renew_caps(session
);
2152 case CEPH_SESSION_RECALL_STATE
:
2153 trim_caps(session
, m
->get_max_caps());
2156 case CEPH_SESSION_FLUSHMSG
:
2157 /* flush cap release */
2158 if (auto& m
= session
->release
; m
) {
2159 session
->con
->send_message2(std::move(m
));
2161 session
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_FLUSHMSG_ACK
, m
->get_seq()));
2164 case CEPH_SESSION_FORCE_RO
:
2165 force_session_readonly(session
);
2168 case CEPH_SESSION_REJECT
:
2170 std::string_view error_str
;
2171 auto it
= m
->metadata
.find("error_string");
2172 if (it
!= m
->metadata
.end())
2173 error_str
= it
->second
;
2175 error_str
= "unknown error";
2176 lderr(cct
) << "mds." << from
<< " rejected us (" << error_str
<< ")" << dendl
;
2178 rejected_by_mds
[session
->mds_num
] = session
->addrs
;
2179 _closed_mds_session(session
);
2188 bool Client::_any_stale_sessions() const
2190 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
2192 for (const auto &p
: mds_sessions
) {
2193 if (p
.second
.state
== MetaSession::STATE_STALE
) {
2201 void Client::_kick_stale_sessions()
2203 ldout(cct
, 1) << __func__
<< dendl
;
2205 for (auto it
= mds_sessions
.begin(); it
!= mds_sessions
.end(); ) {
2206 MetaSession
&s
= it
->second
;
2208 if (s
.state
== MetaSession::STATE_STALE
)
2209 _closed_mds_session(&s
);
2213 void Client::send_request(MetaRequest
*request
, MetaSession
*session
,
2214 bool drop_cap_releases
)
2217 mds_rank_t mds
= session
->mds_num
;
2218 ldout(cct
, 10) << __func__
<< " rebuilding request " << request
->get_tid()
2219 << " for mds." << mds
<< dendl
;
2220 auto r
= build_client_request(request
);
2221 if (request
->dentry()) {
2222 r
->set_dentry_wanted();
2224 if (request
->got_unsafe
) {
2225 r
->set_replayed_op();
2226 if (request
->target
)
2227 r
->head
.ino
= request
->target
->ino
;
2229 encode_cap_releases(request
, mds
);
2230 if (drop_cap_releases
) // we haven't send cap reconnect yet, drop cap releases
2231 request
->cap_releases
.clear();
2233 r
->releases
.swap(request
->cap_releases
);
2235 r
->set_mdsmap_epoch(mdsmap
->get_epoch());
2236 if (r
->head
.op
== CEPH_MDS_OP_SETXATTR
) {
2237 objecter
->with_osdmap([r
](const OSDMap
& o
) {
2238 r
->set_osdmap_epoch(o
.get_epoch());
2242 if (request
->mds
== -1) {
2243 request
->sent_stamp
= ceph_clock_now();
2244 ldout(cct
, 20) << __func__
<< " set sent_stamp to " << request
->sent_stamp
<< dendl
;
2248 Inode
*in
= request
->inode();
2250 auto it
= in
->caps
.find(mds
);
2251 if (it
!= in
->caps
.end()) {
2252 request
->sent_on_mseq
= it
->second
.mseq
;
2256 session
->requests
.push_back(&request
->item
);
2258 ldout(cct
, 10) << __func__
<< " " << *r
<< " to mds." << mds
<< dendl
;
2259 session
->con
->send_message2(std::move(r
));
2262 ref_t
<MClientRequest
> Client::build_client_request(MetaRequest
*request
)
2264 auto req
= make_message
<MClientRequest
>(request
->get_op());
2265 req
->set_tid(request
->tid
);
2266 req
->set_stamp(request
->op_stamp
);
2267 memcpy(&req
->head
, &request
->head
, sizeof(ceph_mds_request_head
));
2269 // if the filepath's haven't been set, set them!
2270 if (request
->path
.empty()) {
2271 Inode
*in
= request
->inode();
2272 Dentry
*de
= request
->dentry();
2274 in
->make_nosnap_relative_path(request
->path
);
2277 de
->inode
->make_nosnap_relative_path(request
->path
);
2279 de
->dir
->parent_inode
->make_nosnap_relative_path(request
->path
);
2280 request
->path
.push_dentry(de
->name
);
2282 else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2283 << " No path, inode, or appropriately-endowed dentry given!"
2285 } else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2286 << " No path, inode, or dentry given!"
2289 req
->set_filepath(request
->get_filepath());
2290 req
->set_filepath2(request
->get_filepath2());
2291 req
->set_data(request
->data
);
2292 req
->set_retry_attempt(request
->retry_attempt
++);
2293 req
->head
.num_fwd
= request
->num_fwd
;
2295 int gid_count
= request
->perms
.get_gids(&_gids
);
2296 req
->set_gid_list(gid_count
, _gids
);
2302 void Client::handle_client_request_forward(const MConstRef
<MClientRequestForward
>& fwd
)
2304 mds_rank_t mds
= mds_rank_t(fwd
->get_source().num());
2305 MetaSession
*session
= _get_mds_session(mds
, fwd
->get_connection().get());
2309 ceph_tid_t tid
= fwd
->get_tid();
2311 if (mds_requests
.count(tid
) == 0) {
2312 ldout(cct
, 10) << __func__
<< " no pending request on tid " << tid
<< dendl
;
2316 MetaRequest
*request
= mds_requests
[tid
];
2317 ceph_assert(request
);
2319 // reset retry counter
2320 request
->retry_attempt
= 0;
2322 // request not forwarded, or dest mds has no session.
2324 ldout(cct
, 10) << __func__
<< " tid " << tid
2325 << " fwd " << fwd
->get_num_fwd()
2326 << " to mds." << fwd
->get_dest_mds()
2327 << ", resending to " << fwd
->get_dest_mds()
2331 request
->item
.remove_myself();
2332 request
->num_fwd
= fwd
->get_num_fwd();
2333 request
->resend_mds
= fwd
->get_dest_mds();
2334 request
->caller_cond
->notify_all();
2337 bool Client::is_dir_operation(MetaRequest
*req
)
2339 int op
= req
->get_op();
2340 if (op
== CEPH_MDS_OP_MKNOD
|| op
== CEPH_MDS_OP_LINK
||
2341 op
== CEPH_MDS_OP_UNLINK
|| op
== CEPH_MDS_OP_RENAME
||
2342 op
== CEPH_MDS_OP_MKDIR
|| op
== CEPH_MDS_OP_RMDIR
||
2343 op
== CEPH_MDS_OP_SYMLINK
|| op
== CEPH_MDS_OP_CREATE
)
2348 void Client::handle_client_reply(const MConstRef
<MClientReply
>& reply
)
2350 mds_rank_t mds_num
= mds_rank_t(reply
->get_source().num());
2351 MetaSession
*session
= _get_mds_session(mds_num
, reply
->get_connection().get());
2356 ceph_tid_t tid
= reply
->get_tid();
2357 bool is_safe
= reply
->is_safe();
2359 if (mds_requests
.count(tid
) == 0) {
2360 lderr(cct
) << __func__
<< " no pending request on tid " << tid
2361 << " safe is:" << is_safe
<< dendl
;
2364 MetaRequest
*request
= mds_requests
.at(tid
);
2366 ldout(cct
, 20) << __func__
<< " got a reply. Safe:" << is_safe
2367 << " tid " << tid
<< dendl
;
2369 if (request
->got_unsafe
&& !is_safe
) {
2370 //duplicate response
2371 ldout(cct
, 0) << "got a duplicate reply on tid " << tid
<< " from mds "
2372 << mds_num
<< " safe:" << is_safe
<< dendl
;
2376 if (-ESTALE
== reply
->get_result()) { // see if we can get to proper MDS
2377 ldout(cct
, 20) << "got ESTALE on tid " << request
->tid
2378 << " from mds." << request
->mds
<< dendl
;
2379 request
->send_to_auth
= true;
2380 request
->resend_mds
= choose_target_mds(request
);
2381 Inode
*in
= request
->inode();
2382 std::map
<mds_rank_t
, Cap
>::const_iterator it
;
2383 if (request
->resend_mds
>= 0 &&
2384 request
->resend_mds
== request
->mds
&&
2386 (it
= in
->caps
.find(request
->resend_mds
)) != in
->caps
.end() ||
2387 request
->sent_on_mseq
== it
->second
.mseq
)) {
2388 ldout(cct
, 20) << "have to return ESTALE" << dendl
;
2390 request
->caller_cond
->notify_all();
2395 ceph_assert(!request
->reply
);
2396 request
->reply
= reply
;
2397 insert_trace(request
, session
);
2399 // Handle unsafe reply
2401 request
->got_unsafe
= true;
2402 session
->unsafe_requests
.push_back(&request
->unsafe_item
);
2403 if (is_dir_operation(request
)) {
2404 Inode
*dir
= request
->inode();
2406 dir
->unsafe_ops
.push_back(&request
->unsafe_dir_item
);
2408 if (request
->target
) {
2409 InodeRef
&in
= request
->target
;
2410 in
->unsafe_ops
.push_back(&request
->unsafe_target_item
);
2414 // Only signal the caller once (on the first reply):
2415 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2416 if (!is_safe
|| !request
->got_unsafe
) {
2417 ceph::condition_variable cond
;
2418 request
->dispatch_cond
= &cond
;
2421 ldout(cct
, 20) << __func__
<< " signalling caller " << (void*)request
->caller_cond
<< dendl
;
2422 request
->caller_cond
->notify_all();
2424 // wake for kick back
2425 std::unique_lock l
{client_lock
, std::adopt_lock
};
2426 cond
.wait(l
, [tid
, request
, &cond
, this] {
2427 if (request
->dispatch_cond
) {
2428 ldout(cct
, 20) << "handle_client_reply awaiting kickback on tid "
2429 << tid
<< " " << &cond
<< dendl
;
2431 return !request
->dispatch_cond
;
2437 // the filesystem change is committed to disk
2438 // we're done, clean up
2439 if (request
->got_unsafe
) {
2440 request
->unsafe_item
.remove_myself();
2441 request
->unsafe_dir_item
.remove_myself();
2442 request
->unsafe_target_item
.remove_myself();
2443 signal_cond_list(request
->waitfor_safe
);
2445 request
->item
.remove_myself();
2446 unregister_request(request
);
2449 mount_cond
.notify_all();
2452 void Client::_handle_full_flag(int64_t pool
)
2454 ldout(cct
, 1) << __func__
<< ": FULL: cancelling outstanding operations "
2455 << "on " << pool
<< dendl
;
2456 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2457 // to do this rather than blocking, because otherwise when we fill up we
2458 // potentially lock caps forever on files with dirty pages, and we need
2459 // to be able to release those caps to the MDS so that it can delete files
2460 // and free up space.
2461 epoch_t cancelled_epoch
= objecter
->op_cancel_writes(-ENOSPC
, pool
);
2463 // For all inodes with layouts in this pool and a pending flush write op
2464 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2465 // from ObjectCacher so that it doesn't re-issue the write in response to
2466 // the ENOSPC error.
2467 // Fortunately since we're cancelling everything in a given pool, we don't
2468 // need to know which ops belong to which ObjectSet, we can just blow all
2469 // the un-flushed cached data away and mark any dirty inodes' async_err
2470 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2471 // affecting this pool, and all the objectsets we're purging were also
2473 for (unordered_map
<vinodeno_t
,Inode
*>::iterator i
= inode_map
.begin();
2474 i
!= inode_map
.end(); ++i
)
2476 Inode
*inode
= i
->second
;
2477 if (inode
->oset
.dirty_or_tx
2478 && (pool
== -1 || inode
->layout
.pool_id
== pool
)) {
2479 ldout(cct
, 4) << __func__
<< ": FULL: inode 0x" << std::hex
<< i
->first
<< std::dec
2480 << " has dirty objects, purging and setting ENOSPC" << dendl
;
2481 objectcacher
->purge_set(&inode
->oset
);
2482 inode
->set_async_err(-ENOSPC
);
2486 if (cancelled_epoch
!= (epoch_t
)-1) {
2487 set_cap_epoch_barrier(cancelled_epoch
);
2491 void Client::handle_osd_map(const MConstRef
<MOSDMap
>& m
)
2493 std::set
<entity_addr_t
> new_blacklists
;
2494 objecter
->consume_blacklist_events(&new_blacklists
);
2496 const auto myaddrs
= messenger
->get_myaddrs();
2497 bool new_blacklist
= false;
2498 bool prenautilus
= objecter
->with_osdmap(
2499 [&](const OSDMap
& o
) {
2500 return o
.require_osd_release
< ceph_release_t::nautilus
;
2503 for (auto a
: myaddrs
.v
) {
2504 // blacklist entries are always TYPE_ANY for nautilus+
2505 a
.set_type(entity_addr_t::TYPE_ANY
);
2506 if (new_blacklists
.count(a
)) {
2507 new_blacklist
= true;
2511 // ...except pre-nautilus, they were TYPE_LEGACY
2512 a
.set_type(entity_addr_t::TYPE_LEGACY
);
2513 if (new_blacklists
.count(a
)) {
2514 new_blacklist
= true;
2520 if (new_blacklist
) {
2521 auto epoch
= objecter
->with_osdmap([](const OSDMap
&o
){
2522 return o
.get_epoch();
2524 lderr(cct
) << "I was blacklisted at osd epoch " << epoch
<< dendl
;
2527 _abort_mds_sessions(-EBLACKLISTED
);
2529 // Since we know all our OSD ops will fail, cancel them all preemtively,
2530 // so that on an unhealthy cluster we can umount promptly even if e.g.
2531 // some PGs were inaccessible.
2532 objecter
->op_cancel_writes(-EBLACKLISTED
);
2534 } else if (blacklisted
) {
2535 // Handle case where we were blacklisted but no longer are
2536 blacklisted
= objecter
->with_osdmap([myaddrs
](const OSDMap
&o
){
2537 return o
.is_blacklisted(myaddrs
);});
2540 // Always subscribe to next osdmap for blacklisted client
2541 // until this client is not blacklisted.
2543 objecter
->maybe_request_map();
2546 if (objecter
->osdmap_full_flag()) {
2547 _handle_full_flag(-1);
2549 // Accumulate local list of full pools so that I can drop
2550 // the objecter lock before re-entering objecter in
2552 std::vector
<int64_t> full_pools
;
2554 objecter
->with_osdmap([&full_pools
](const OSDMap
&o
) {
2555 for (const auto& kv
: o
.get_pools()) {
2556 if (kv
.second
.has_flag(pg_pool_t::FLAG_FULL
)) {
2557 full_pools
.push_back(kv
.first
);
2562 for (auto p
: full_pools
)
2563 _handle_full_flag(p
);
2565 // Subscribe to subsequent maps to watch for the full flag going
2566 // away. For the global full flag objecter does this for us, but
2567 // it pays no attention to the per-pool full flag so in this branch
2568 // we do it ourselves.
2569 if (!full_pools
.empty()) {
2570 objecter
->maybe_request_map();
2576 // ------------------------
2577 // incoming messages
2580 bool Client::ms_dispatch2(const MessageRef
&m
)
2582 std::lock_guard
l(client_lock
);
2584 ldout(cct
, 10) << "inactive, discarding " << *m
<< dendl
;
2588 switch (m
->get_type()) {
2589 // mounting and mds sessions
2590 case CEPH_MSG_MDS_MAP
:
2591 handle_mds_map(ref_cast
<MMDSMap
>(m
));
2593 case CEPH_MSG_FS_MAP
:
2594 handle_fs_map(ref_cast
<MFSMap
>(m
));
2596 case CEPH_MSG_FS_MAP_USER
:
2597 handle_fs_map_user(ref_cast
<MFSMapUser
>(m
));
2599 case CEPH_MSG_CLIENT_SESSION
:
2600 handle_client_session(ref_cast
<MClientSession
>(m
));
2603 case CEPH_MSG_OSD_MAP
:
2604 handle_osd_map(ref_cast
<MOSDMap
>(m
));
2608 case CEPH_MSG_CLIENT_REQUEST_FORWARD
:
2609 handle_client_request_forward(ref_cast
<MClientRequestForward
>(m
));
2611 case CEPH_MSG_CLIENT_REPLY
:
2612 handle_client_reply(ref_cast
<MClientReply
>(m
));
2616 case CEPH_MSG_CLIENT_RECLAIM_REPLY
:
2617 handle_client_reclaim_reply(ref_cast
<MClientReclaimReply
>(m
));
2620 case CEPH_MSG_CLIENT_SNAP
:
2621 handle_snap(ref_cast
<MClientSnap
>(m
));
2623 case CEPH_MSG_CLIENT_CAPS
:
2624 handle_caps(ref_cast
<MClientCaps
>(m
));
2626 case CEPH_MSG_CLIENT_LEASE
:
2627 handle_lease(ref_cast
<MClientLease
>(m
));
2629 case MSG_COMMAND_REPLY
:
2630 if (m
->get_source().type() == CEPH_ENTITY_TYPE_MDS
) {
2631 handle_command_reply(ref_cast
<MCommandReply
>(m
));
2636 case CEPH_MSG_CLIENT_QUOTA
:
2637 handle_quota(ref_cast
<MClientQuota
>(m
));
2646 ldout(cct
, 10) << "unmounting: trim pass, size was " << lru
.lru_get_size()
2647 << "+" << inode_map
.size() << dendl
;
2648 long unsigned size
= lru
.lru_get_size() + inode_map
.size();
2650 if (size
< lru
.lru_get_size() + inode_map
.size()) {
2651 ldout(cct
, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl
;
2652 mount_cond
.notify_all();
2654 ldout(cct
, 10) << "unmounting: trim pass, size still " << lru
.lru_get_size()
2655 << "+" << inode_map
.size() << dendl
;
2662 void Client::handle_fs_map(const MConstRef
<MFSMap
>& m
)
2664 fsmap
.reset(new FSMap(m
->get_fsmap()));
2666 signal_cond_list(waiting_for_fsmap
);
2668 monclient
->sub_got("fsmap", fsmap
->get_epoch());
2671 void Client::handle_fs_map_user(const MConstRef
<MFSMapUser
>& m
)
2673 fsmap_user
.reset(new FSMapUser
);
2674 *fsmap_user
= m
->get_fsmap();
2676 monclient
->sub_got("fsmap.user", fsmap_user
->get_epoch());
2677 signal_cond_list(waiting_for_fsmap
);
2680 void Client::handle_mds_map(const MConstRef
<MMDSMap
>& m
)
2682 mds_gid_t old_inc
, new_inc
;
2683 if (m
->get_epoch() <= mdsmap
->get_epoch()) {
2684 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch()
2685 << " is identical to or older than our "
2686 << mdsmap
->get_epoch() << dendl
;
2690 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch() << dendl
;
2692 std::unique_ptr
<MDSMap
> oldmap(new MDSMap
);
2693 oldmap
.swap(mdsmap
);
2695 mdsmap
->decode(m
->get_encoded());
2697 // Cancel any commands for missing or laggy GIDs
2698 std::list
<ceph_tid_t
> cancel_ops
;
2699 auto &commands
= command_table
.get_commands();
2700 for (const auto &i
: commands
) {
2701 auto &op
= i
.second
;
2702 const mds_gid_t op_mds_gid
= op
.mds_gid
;
2703 if (mdsmap
->is_dne_gid(op_mds_gid
) || mdsmap
->is_laggy_gid(op_mds_gid
)) {
2704 ldout(cct
, 1) << __func__
<< ": cancelling command op " << i
.first
<< dendl
;
2705 cancel_ops
.push_back(i
.first
);
2707 std::ostringstream ss
;
2708 ss
<< "MDS " << op_mds_gid
<< " went away";
2709 *(op
.outs
) = ss
.str();
2711 op
.con
->mark_down();
2713 op
.on_finish
->complete(-ETIMEDOUT
);
2718 for (std::list
<ceph_tid_t
>::iterator i
= cancel_ops
.begin();
2719 i
!= cancel_ops
.end(); ++i
) {
2720 command_table
.erase(*i
);
2724 for (auto p
= mds_sessions
.begin(); p
!= mds_sessions
.end(); ) {
2725 mds_rank_t mds
= p
->first
;
2726 MetaSession
*session
= &p
->second
;
2729 int oldstate
= oldmap
->get_state(mds
);
2730 int newstate
= mdsmap
->get_state(mds
);
2731 if (!mdsmap
->is_up(mds
)) {
2732 session
->con
->mark_down();
2733 } else if (mdsmap
->get_addrs(mds
) != session
->addrs
) {
2734 old_inc
= oldmap
->get_incarnation(mds
);
2735 new_inc
= mdsmap
->get_incarnation(mds
);
2736 if (old_inc
!= new_inc
) {
2737 ldout(cct
, 1) << "mds incarnation changed from "
2738 << old_inc
<< " to " << new_inc
<< dendl
;
2739 oldstate
= MDSMap::STATE_NULL
;
2741 session
->con
->mark_down();
2742 session
->addrs
= mdsmap
->get_addrs(mds
);
2743 // When new MDS starts to take over, notify kernel to trim unused entries
2744 // in its dcache/icache. Hopefully, the kernel will release some unused
2745 // inodes before the new MDS enters reconnect state.
2746 trim_cache_for_reconnect(session
);
2747 } else if (oldstate
== newstate
)
2748 continue; // no change
2750 session
->mds_state
= newstate
;
2751 if (newstate
== MDSMap::STATE_RECONNECT
) {
2752 session
->con
= messenger
->connect_to_mds(session
->addrs
);
2753 send_reconnect(session
);
2754 } else if (newstate
> MDSMap::STATE_RECONNECT
) {
2755 if (oldstate
< MDSMap::STATE_RECONNECT
) {
2756 ldout(cct
, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl
;
2757 _closed_mds_session(session
);
2760 if (newstate
>= MDSMap::STATE_ACTIVE
) {
2761 if (oldstate
< MDSMap::STATE_ACTIVE
) {
2762 // kick new requests
2763 kick_requests(session
);
2764 kick_flushing_caps(session
);
2765 signal_context_list(session
->waiting_for_open
);
2766 wake_up_session_caps(session
, true);
2768 connect_mds_targets(mds
);
2770 } else if (newstate
== MDSMap::STATE_NULL
&&
2771 mds
>= mdsmap
->get_max_mds()) {
2772 _closed_mds_session(session
);
2776 // kick any waiting threads
2777 signal_cond_list(waiting_for_mdsmap
);
2779 monclient
->sub_got("mdsmap", mdsmap
->get_epoch());
2782 void Client::send_reconnect(MetaSession
*session
)
2784 mds_rank_t mds
= session
->mds_num
;
2785 ldout(cct
, 10) << __func__
<< " to mds." << mds
<< dendl
;
2787 // trim unused caps to reduce MDS's cache rejoin time
2788 trim_cache_for_reconnect(session
);
2790 session
->readonly
= false;
2792 session
->release
.reset();
2794 // reset my cap seq number
2796 //connect to the mds' offload targets
2797 connect_mds_targets(mds
);
2798 //make sure unsafe requests get saved
2799 resend_unsafe_requests(session
);
2801 early_kick_flushing_caps(session
);
2803 auto m
= make_message
<MClientReconnect
>();
2804 bool allow_multi
= session
->mds_features
.test(CEPHFS_FEATURE_MULTI_RECONNECT
);
2806 // i have an open session.
2807 ceph::unordered_set
<inodeno_t
> did_snaprealm
;
2808 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
= inode_map
.begin();
2809 p
!= inode_map
.end();
2811 Inode
*in
= p
->second
;
2812 auto it
= in
->caps
.find(mds
);
2813 if (it
!= in
->caps
.end()) {
2815 m
->get_approx_size() >=
2816 static_cast<size_t>((std::numeric_limits
<int>::max() >> 1))) {
2818 session
->con
->send_message2(std::move(m
));
2820 m
= make_message
<MClientReconnect
>();
2823 Cap
&cap
= it
->second
;
2824 ldout(cct
, 10) << " caps on " << p
->first
2825 << " " << ccap_string(cap
.issued
)
2826 << " wants " << ccap_string(in
->caps_wanted())
2829 in
->make_long_path(path
);
2830 ldout(cct
, 10) << " path " << path
<< dendl
;
2833 _encode_filelocks(in
, flockbl
);
2835 cap
.seq
= 0; // reset seq.
2836 cap
.issue_seq
= 0; // reset seq.
2837 cap
.mseq
= 0; // reset seq.
2838 // cap gen should catch up with session cap_gen
2839 if (cap
.gen
< session
->cap_gen
) {
2840 cap
.gen
= session
->cap_gen
;
2841 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
2843 cap
.issued
= cap
.implemented
;
2845 snapid_t snap_follows
= 0;
2846 if (!in
->cap_snaps
.empty())
2847 snap_follows
= in
->cap_snaps
.begin()->first
;
2849 m
->add_cap(p
->first
.ino
,
2851 path
.get_ino(), path
.get_path(), // ino
2852 in
->caps_wanted(), // wanted
2853 cap
.issued
, // issued
2858 if (did_snaprealm
.count(in
->snaprealm
->ino
) == 0) {
2859 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
2860 m
->add_snaprealm(in
->snaprealm
->ino
, in
->snaprealm
->seq
, in
->snaprealm
->parent
);
2861 did_snaprealm
.insert(in
->snaprealm
->ino
);
2867 m
->set_encoding_version(0); // use connection features to choose encoding
2868 session
->con
->send_message2(std::move(m
));
2870 mount_cond
.notify_all();
2872 if (session
->reclaim_state
== MetaSession::RECLAIMING
)
2873 signal_cond_list(waiting_for_reclaim
);
2877 void Client::kick_requests(MetaSession
*session
)
2879 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
2880 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2881 p
!= mds_requests
.end();
2883 MetaRequest
*req
= p
->second
;
2884 if (req
->got_unsafe
)
2886 if (req
->aborted()) {
2887 if (req
->caller_cond
) {
2889 req
->caller_cond
->notify_all();
2893 if (req
->retry_attempt
> 0)
2894 continue; // new requests only
2895 if (req
->mds
== session
->mds_num
) {
2896 send_request(p
->second
, session
);
2901 void Client::resend_unsafe_requests(MetaSession
*session
)
2903 for (xlist
<MetaRequest
*>::iterator iter
= session
->unsafe_requests
.begin();
2906 send_request(*iter
, session
);
2908 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2909 // process completed requests in clientreplay stage.
2910 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2911 p
!= mds_requests
.end();
2913 MetaRequest
*req
= p
->second
;
2914 if (req
->got_unsafe
)
2918 if (req
->retry_attempt
== 0)
2919 continue; // old requests only
2920 if (req
->mds
== session
->mds_num
)
2921 send_request(req
, session
, true);
2925 void Client::wait_unsafe_requests()
2927 list
<MetaRequest
*> last_unsafe_reqs
;
2928 for (const auto &p
: mds_sessions
) {
2929 const MetaSession
&s
= p
.second
;
2930 if (!s
.unsafe_requests
.empty()) {
2931 MetaRequest
*req
= s
.unsafe_requests
.back();
2933 last_unsafe_reqs
.push_back(req
);
2937 for (list
<MetaRequest
*>::iterator p
= last_unsafe_reqs
.begin();
2938 p
!= last_unsafe_reqs
.end();
2940 MetaRequest
*req
= *p
;
2941 if (req
->unsafe_item
.is_on_list())
2942 wait_on_list(req
->waitfor_safe
);
2947 void Client::kick_requests_closed(MetaSession
*session
)
2949 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
2950 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2951 p
!= mds_requests
.end(); ) {
2952 MetaRequest
*req
= p
->second
;
2954 if (req
->mds
== session
->mds_num
) {
2955 if (req
->caller_cond
) {
2957 req
->caller_cond
->notify_all();
2959 req
->item
.remove_myself();
2960 if (req
->got_unsafe
) {
2961 lderr(cct
) << __func__
<< " removing unsafe request " << req
->get_tid() << dendl
;
2962 req
->unsafe_item
.remove_myself();
2963 if (is_dir_operation(req
)) {
2964 Inode
*dir
= req
->inode();
2966 dir
->set_async_err(-EIO
);
2967 lderr(cct
) << "kick_requests_closed drop req of inode(dir) : "
2968 << dir
->ino
<< " " << req
->get_tid() << dendl
;
2969 req
->unsafe_dir_item
.remove_myself();
2972 InodeRef
&in
= req
->target
;
2973 in
->set_async_err(-EIO
);
2974 lderr(cct
) << "kick_requests_closed drop req of inode : "
2975 << in
->ino
<< " " << req
->get_tid() << dendl
;
2976 req
->unsafe_target_item
.remove_myself();
2978 signal_cond_list(req
->waitfor_safe
);
2979 unregister_request(req
);
2983 ceph_assert(session
->requests
.empty());
2984 ceph_assert(session
->unsafe_requests
.empty());
2994 void Client::got_mds_push(MetaSession
*s
)
2997 ldout(cct
, 10) << " mds." << s
->mds_num
<< " seq now " << s
->seq
<< dendl
;
2998 if (s
->state
== MetaSession::STATE_CLOSING
) {
2999 s
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
3003 void Client::handle_lease(const MConstRef
<MClientLease
>& m
)
3005 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
3007 ceph_assert(m
->get_action() == CEPH_MDS_LEASE_REVOKE
);
3009 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
3010 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
3015 got_mds_push(session
);
3017 ceph_seq_t seq
= m
->get_seq();
3020 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
3021 if (inode_map
.count(vino
) == 0) {
3022 ldout(cct
, 10) << " don't have vino " << vino
<< dendl
;
3025 in
= inode_map
[vino
];
3027 if (m
->get_mask() & CEPH_LEASE_VALID
) {
3028 if (!in
->dir
|| in
->dir
->dentries
.count(m
->dname
) == 0) {
3029 ldout(cct
, 10) << " don't have dir|dentry " << m
->get_ino() << "/" << m
->dname
<<dendl
;
3032 Dentry
*dn
= in
->dir
->dentries
[m
->dname
];
3033 ldout(cct
, 10) << " revoked DN lease on " << dn
<< dendl
;
3039 auto reply
= make_message
<MClientLease
>(CEPH_MDS_LEASE_RELEASE
, seq
,
3040 m
->get_mask(), m
->get_ino(),
3041 m
->get_first(), m
->get_last(), m
->dname
);
3042 m
->get_connection()->send_message2(std::move(reply
));
3046 void Client::put_inode(Inode
*in
, int n
)
3048 ldout(cct
, 10) << __func__
<< " on " << *in
<< dendl
;
3049 int left
= in
->_put(n
);
3052 remove_all_caps(in
);
3054 ldout(cct
, 10) << __func__
<< " deleting " << *in
<< dendl
;
3055 bool unclean
= objectcacher
->release_set(&in
->oset
);
3056 ceph_assert(!unclean
);
3057 inode_map
.erase(in
->vino());
3058 if (use_faked_inos())
3059 _release_faked_ino(in
);
3064 while (!root_parents
.empty())
3065 root_parents
.erase(root_parents
.begin());
3072 void Client::close_dir(Dir
*dir
)
3074 Inode
*in
= dir
->parent_inode
;
3075 ldout(cct
, 15) << __func__
<< " dir " << dir
<< " on " << in
<< dendl
;
3076 ceph_assert(dir
->is_empty());
3077 ceph_assert(in
->dir
== dir
);
3078 ceph_assert(in
->dentries
.size() < 2); // dirs can't be hard-linked
3079 if (!in
->dentries
.empty())
3080 in
->get_first_parent()->put(); // unpin dentry
3084 put_inode(in
); // unpin inode
3088 * Don't call this with in==NULL, use get_or_create for that
3089 * leave dn set to default NULL unless you're trying to add
3090 * a new inode to a pre-created Dentry
3092 Dentry
* Client::link(Dir
*dir
, const string
& name
, Inode
*in
, Dentry
*dn
)
3095 // create a new Dentry
3096 dn
= new Dentry(dir
, name
);
3098 lru
.lru_insert_mid(dn
); // mid or top?
3100 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
3101 << " dn " << dn
<< " (new dn)" << dendl
;
3103 ceph_assert(!dn
->inode
);
3104 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
3105 << " dn " << dn
<< " (old dn)" << dendl
;
3108 if (in
) { // link to inode
3110 // only one parent for directories!
3111 if (in
->is_dir() && !in
->dentries
.empty()) {
3112 tmp_ref
= in
; // prevent unlink below from freeing the inode.
3113 Dentry
*olddn
= in
->get_first_parent();
3114 ceph_assert(olddn
->dir
!= dir
|| olddn
->name
!= name
);
3115 Inode
*old_diri
= olddn
->dir
->parent_inode
;
3116 old_diri
->dir_release_count
++;
3117 clear_dir_complete_and_ordered(old_diri
, true);
3118 unlink(olddn
, true, true); // keep dir, dentry
3122 ldout(cct
, 20) << "link inode " << in
<< " parents now " << in
->dentries
<< dendl
;
3128 void Client::unlink(Dentry
*dn
, bool keepdir
, bool keepdentry
)
3130 InodeRef
in(dn
->inode
);
3131 ldout(cct
, 15) << "unlink dir " << dn
->dir
->parent_inode
<< " '" << dn
->name
<< "' dn " << dn
3132 << " inode " << dn
->inode
<< dendl
;
3134 // unlink from inode
3137 ldout(cct
, 20) << "unlink inode " << in
<< " parents now " << in
->dentries
<< dendl
;
3143 ldout(cct
, 15) << "unlink removing '" << dn
->name
<< "' dn " << dn
<< dendl
;
3153 if (dir
->is_empty() && !keepdir
)
3159 * For asynchronous flushes, check for errors from the IO and
3160 * update the inode if necessary
3162 class C_Client_FlushComplete
: public Context
{
3167 C_Client_FlushComplete(Client
*c
, Inode
*in
) : client(c
), inode(in
) { }
3168 void finish(int r
) override
{
3169 ceph_assert(ceph_mutex_is_locked_by_me(client
->client_lock
));
3171 client_t
const whoami
= client
->whoami
; // For the benefit of ldout prefix
3172 ldout(client
->cct
, 1) << "I/O error from flush on inode " << inode
3173 << " 0x" << std::hex
<< inode
->ino
<< std::dec
3174 << ": " << r
<< "(" << cpp_strerror(r
) << ")" << dendl
;
3175 inode
->set_async_err(r
);
3185 void Client::get_cap_ref(Inode
*in
, int cap
)
3187 if ((cap
& CEPH_CAP_FILE_BUFFER
) &&
3188 in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] == 0) {
3189 ldout(cct
, 5) << __func__
<< " got first FILE_BUFFER ref on " << *in
<< dendl
;
3192 if ((cap
& CEPH_CAP_FILE_CACHE
) &&
3193 in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
3194 ldout(cct
, 5) << __func__
<< " got first FILE_CACHE ref on " << *in
<< dendl
;
3197 in
->get_cap_ref(cap
);
3200 void Client::put_cap_ref(Inode
*in
, int cap
)
3202 int last
= in
->put_cap_ref(cap
);
3205 int drop
= last
& ~in
->caps_issued();
3206 if (in
->snapid
== CEPH_NOSNAP
) {
3207 if ((last
& CEPH_CAP_FILE_WR
) &&
3208 !in
->cap_snaps
.empty() &&
3209 in
->cap_snaps
.rbegin()->second
.writing
) {
3210 ldout(cct
, 10) << __func__
<< " finishing pending cap_snap on " << *in
<< dendl
;
3211 in
->cap_snaps
.rbegin()->second
.writing
= 0;
3212 finish_cap_snap(in
, in
->cap_snaps
.rbegin()->second
, get_caps_used(in
));
3213 signal_cond_list(in
->waitfor_caps
); // wake up blocked sync writers
3215 if (last
& CEPH_CAP_FILE_BUFFER
) {
3216 for (auto &p
: in
->cap_snaps
)
3217 p
.second
.dirty_data
= 0;
3218 signal_cond_list(in
->waitfor_commit
);
3219 ldout(cct
, 5) << __func__
<< " dropped last FILE_BUFFER ref on " << *in
<< dendl
;
3223 if (last
& CEPH_CAP_FILE_CACHE
) {
3224 ldout(cct
, 5) << __func__
<< " dropped last FILE_CACHE ref on " << *in
<< dendl
;
3230 put_inode(in
, put_nref
);
3234 int Client::get_caps(Inode
*in
, int need
, int want
, int *phave
, loff_t endoff
)
3236 int r
= check_pool_perm(in
, need
);
3241 int file_wanted
= in
->caps_file_wanted();
3242 if ((file_wanted
& need
) != need
) {
3243 ldout(cct
, 10) << "get_caps " << *in
<< " need " << ccap_string(need
)
3244 << " file_wanted " << ccap_string(file_wanted
) << ", EBADF "
3250 int have
= in
->caps_issued(&implemented
);
3252 bool waitfor_caps
= false;
3253 bool waitfor_commit
= false;
3255 if (have
& need
& CEPH_CAP_FILE_WR
) {
3257 if ((endoff
>= (loff_t
)in
->max_size
||
3258 endoff
> (loff_t
)(in
->size
<< 1)) &&
3259 endoff
> (loff_t
)in
->wanted_max_size
) {
3260 ldout(cct
, 10) << "wanted_max_size " << in
->wanted_max_size
<< " -> " << endoff
<< dendl
;
3261 in
->wanted_max_size
= endoff
;
3263 if (in
->wanted_max_size
> in
->max_size
&&
3264 in
->wanted_max_size
> in
->requested_max_size
)
3268 if (endoff
>= 0 && endoff
> (loff_t
)in
->max_size
) {
3269 ldout(cct
, 10) << "waiting on max_size, endoff " << endoff
<< " max_size " << in
->max_size
<< " on " << *in
<< dendl
;
3270 waitfor_caps
= true;
3272 if (!in
->cap_snaps
.empty()) {
3273 if (in
->cap_snaps
.rbegin()->second
.writing
) {
3274 ldout(cct
, 10) << "waiting on cap_snap write to complete" << dendl
;
3275 waitfor_caps
= true;
3277 for (auto &p
: in
->cap_snaps
) {
3278 if (p
.second
.dirty_data
) {
3279 waitfor_commit
= true;
3283 if (waitfor_commit
) {
3284 _flush(in
, new C_Client_FlushComplete(this, in
));
3285 ldout(cct
, 10) << "waiting for WRBUFFER to get dropped" << dendl
;
3290 if (!waitfor_caps
&& !waitfor_commit
) {
3291 if ((have
& need
) == need
) {
3292 int revoking
= implemented
& ~have
;
3293 ldout(cct
, 10) << "get_caps " << *in
<< " have " << ccap_string(have
)
3294 << " need " << ccap_string(need
) << " want " << ccap_string(want
)
3295 << " revoking " << ccap_string(revoking
)
3297 if ((revoking
& want
) == 0) {
3298 *phave
= need
| (have
& want
);
3299 in
->get_cap_ref(need
);
3303 ldout(cct
, 10) << "waiting for caps " << *in
<< " need " << ccap_string(need
) << " want " << ccap_string(want
) << dendl
;
3304 waitfor_caps
= true;
3307 if ((need
& CEPH_CAP_FILE_WR
) && in
->auth_cap
&&
3308 in
->auth_cap
->session
->readonly
)
3311 if (in
->flags
& I_CAP_DROPPED
) {
3312 int mds_wanted
= in
->caps_mds_wanted();
3313 if ((mds_wanted
& need
) != need
) {
3314 int ret
= _renew_caps(in
);
3319 if (!(file_wanted
& ~mds_wanted
))
3320 in
->flags
&= ~I_CAP_DROPPED
;
3324 wait_on_list(in
->waitfor_caps
);
3325 else if (waitfor_commit
)
3326 wait_on_list(in
->waitfor_commit
);
3330 int Client::get_caps_used(Inode
*in
)
3332 unsigned used
= in
->caps_used();
3333 if (!(used
& CEPH_CAP_FILE_CACHE
) &&
3334 !objectcacher
->set_is_empty(&in
->oset
))
3335 used
|= CEPH_CAP_FILE_CACHE
;
3339 void Client::cap_delay_requeue(Inode
*in
)
3341 ldout(cct
, 10) << __func__
<< " on " << *in
<< dendl
;
3342 in
->hold_caps_until
= ceph_clock_now();
3343 in
->hold_caps_until
+= cct
->_conf
->client_caps_release_delay
;
3344 delayed_list
.push_back(&in
->delay_cap_item
);
3347 void Client::send_cap(Inode
*in
, MetaSession
*session
, Cap
*cap
,
3348 int flags
, int used
, int want
, int retain
,
3349 int flush
, ceph_tid_t flush_tid
)
3351 int held
= cap
->issued
| cap
->implemented
;
3352 int revoking
= cap
->implemented
& ~cap
->issued
;
3353 retain
&= ~revoking
;
3354 int dropping
= cap
->issued
& ~retain
;
3355 int op
= CEPH_CAP_OP_UPDATE
;
3357 ldout(cct
, 10) << __func__
<< " " << *in
3358 << " mds." << session
->mds_num
<< " seq " << cap
->seq
3359 << " used " << ccap_string(used
)
3360 << " want " << ccap_string(want
)
3361 << " flush " << ccap_string(flush
)
3362 << " retain " << ccap_string(retain
)
3363 << " held "<< ccap_string(held
)
3364 << " revoking " << ccap_string(revoking
)
3365 << " dropping " << ccap_string(dropping
)
3368 if (cct
->_conf
->client_inject_release_failure
&& revoking
) {
3369 const int would_have_issued
= cap
->issued
& retain
;
3370 const int would_have_implemented
= cap
->implemented
& (cap
->issued
| used
);
3372 // - tell the server we think issued is whatever they issued plus whatever we implemented
3373 // - leave what we have implemented in place
3374 ldout(cct
, 20) << __func__
<< " injecting failure to release caps" << dendl
;
3375 cap
->issued
= cap
->issued
| cap
->implemented
;
3377 // Make an exception for revoking xattr caps: we are injecting
3378 // failure to release other caps, but allow xattr because client
3379 // will block on xattr ops if it can't release these to MDS (#9800)
3380 const int xattr_mask
= CEPH_CAP_XATTR_SHARED
| CEPH_CAP_XATTR_EXCL
;
3381 cap
->issued
^= xattr_mask
& revoking
;
3382 cap
->implemented
^= xattr_mask
& revoking
;
3384 ldout(cct
, 20) << __func__
<< " issued " << ccap_string(cap
->issued
) << " vs " << ccap_string(would_have_issued
) << dendl
;
3385 ldout(cct
, 20) << __func__
<< " implemented " << ccap_string(cap
->implemented
) << " vs " << ccap_string(would_have_implemented
) << dendl
;
3388 cap
->issued
&= retain
;
3389 cap
->implemented
&= cap
->issued
| used
;
3392 snapid_t follows
= 0;
3395 follows
= in
->snaprealm
->get_snap_context().seq
;
3397 auto m
= make_message
<MClientCaps
>(op
,
3400 cap
->cap_id
, cap
->seq
,
3406 m
->caller_uid
= in
->cap_dirtier_uid
;
3407 m
->caller_gid
= in
->cap_dirtier_gid
;
3409 m
->head
.issue_seq
= cap
->issue_seq
;
3410 m
->set_tid(flush_tid
);
3412 m
->head
.uid
= in
->uid
;
3413 m
->head
.gid
= in
->gid
;
3414 m
->head
.mode
= in
->mode
;
3416 m
->head
.nlink
= in
->nlink
;
3418 if (flush
& CEPH_CAP_XATTR_EXCL
) {
3419 encode(in
->xattrs
, m
->xattrbl
);
3420 m
->head
.xattr_version
= in
->xattr_version
;
3424 m
->max_size
= in
->max_size
;
3425 m
->truncate_seq
= in
->truncate_seq
;
3426 m
->truncate_size
= in
->truncate_size
;
3427 m
->mtime
= in
->mtime
;
3428 m
->atime
= in
->atime
;
3429 m
->ctime
= in
->ctime
;
3430 m
->btime
= in
->btime
;
3431 m
->time_warp_seq
= in
->time_warp_seq
;
3432 m
->change_attr
= in
->change_attr
;
3434 if (!(flags
& MClientCaps::FLAG_PENDING_CAPSNAP
) &&
3435 !in
->cap_snaps
.empty() &&
3436 in
->cap_snaps
.rbegin()->second
.flush_tid
== 0)
3437 flags
|= MClientCaps::FLAG_PENDING_CAPSNAP
;
3440 if (flush
& CEPH_CAP_FILE_WR
) {
3441 m
->inline_version
= in
->inline_version
;
3442 m
->inline_data
= in
->inline_data
;
3445 in
->reported_size
= in
->size
;
3446 m
->set_snap_follows(follows
);
3448 if (cap
== in
->auth_cap
) {
3449 if (want
& CEPH_CAP_ANY_FILE_WR
) {
3450 m
->set_max_size(in
->wanted_max_size
);
3451 in
->requested_max_size
= in
->wanted_max_size
;
3452 ldout(cct
, 15) << "auth cap, requesting max_size " << in
->requested_max_size
<< dendl
;
3454 in
->requested_max_size
= 0;
3455 ldout(cct
, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl
;
3459 if (!session
->flushing_caps_tids
.empty())
3460 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
3462 session
->con
->send_message2(std::move(m
));
3465 static bool is_max_size_approaching(Inode
*in
)
3467 /* mds will adjust max size according to the reported size */
3468 if (in
->flushing_caps
& CEPH_CAP_FILE_WR
)
3470 if (in
->size
>= in
->max_size
)
3472 /* half of previous max_size increment has been used */
3473 if (in
->max_size
> in
->reported_size
&&
3474 (in
->size
<< 1) >= in
->max_size
+ in
->reported_size
)
3479 static int adjust_caps_used_for_lazyio(int used
, int issued
, int implemented
)
3481 if (!(used
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
)))
3483 if (!(implemented
& CEPH_CAP_FILE_LAZYIO
))
3486 if (issued
& CEPH_CAP_FILE_LAZYIO
) {
3487 if (!(issued
& CEPH_CAP_FILE_CACHE
)) {
3488 used
&= ~CEPH_CAP_FILE_CACHE
;
3489 used
|= CEPH_CAP_FILE_LAZYIO
;
3491 if (!(issued
& CEPH_CAP_FILE_BUFFER
)) {
3492 used
&= ~CEPH_CAP_FILE_BUFFER
;
3493 used
|= CEPH_CAP_FILE_LAZYIO
;
3496 if (!(implemented
& CEPH_CAP_FILE_CACHE
)) {
3497 used
&= ~CEPH_CAP_FILE_CACHE
;
3498 used
|= CEPH_CAP_FILE_LAZYIO
;
3500 if (!(implemented
& CEPH_CAP_FILE_BUFFER
)) {
3501 used
&= ~CEPH_CAP_FILE_BUFFER
;
3502 used
|= CEPH_CAP_FILE_LAZYIO
;
3511 * Examine currently used and wanted versus held caps. Release, flush or ack
3512 * revoked caps to the MDS as appropriate.
3514 * @param in the inode to check
3515 * @param flags flags to apply to cap check
3517 void Client::check_caps(Inode
*in
, unsigned flags
)
3519 unsigned wanted
= in
->caps_wanted();
3520 unsigned used
= get_caps_used(in
);
3524 int issued
= in
->caps_issued(&implemented
);
3525 int revoking
= implemented
& ~issued
;
3527 int orig_used
= used
;
3528 used
= adjust_caps_used_for_lazyio(used
, issued
, implemented
);
3530 int retain
= wanted
| used
| CEPH_CAP_PIN
;
3531 if (!unmounting
&& in
->nlink
> 0) {
3533 retain
|= CEPH_CAP_ANY
;
3534 } else if (in
->is_dir() &&
3535 (issued
& CEPH_CAP_FILE_SHARED
) &&
3536 (in
->flags
& I_COMPLETE
)) {
3537 // we do this here because we don't want to drop to Fs (and then
3538 // drop the Fs if we do a create!) if that alone makes us send lookups
3539 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3540 wanted
= CEPH_CAP_ANY_SHARED
| CEPH_CAP_FILE_EXCL
;
3543 retain
|= CEPH_CAP_ANY_SHARED
;
3544 // keep RD only if we didn't have the file open RW,
3545 // because then the mds would revoke it anyway to
3546 // journal max_size=0.
3547 if (in
->max_size
== 0)
3548 retain
|= CEPH_CAP_ANY_RD
;
3552 ldout(cct
, 10) << __func__
<< " on " << *in
3553 << " wanted " << ccap_string(wanted
)
3554 << " used " << ccap_string(used
)
3555 << " issued " << ccap_string(issued
)
3556 << " revoking " << ccap_string(revoking
)
3557 << " flags=" << flags
3560 if (in
->snapid
!= CEPH_NOSNAP
)
3561 return; //snap caps last forever, can't write
3563 if (in
->caps
.empty())
3564 return; // guard if at end of func
3566 if (!(orig_used
& CEPH_CAP_FILE_BUFFER
) &&
3567 (revoking
& used
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
))) {
3569 used
&= ~(CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
);
3573 for (auto &p
: in
->caps
) {
3574 mds_rank_t mds
= p
.first
;
3575 Cap
&cap
= p
.second
;
3577 MetaSession
*session
= &mds_sessions
.at(mds
);
3580 if (in
->auth_cap
&& &cap
!= in
->auth_cap
)
3581 cap_used
&= ~in
->auth_cap
->issued
;
3583 revoking
= cap
.implemented
& ~cap
.issued
;
3585 ldout(cct
, 10) << " cap mds." << mds
3586 << " issued " << ccap_string(cap
.issued
)
3587 << " implemented " << ccap_string(cap
.implemented
)
3588 << " revoking " << ccap_string(revoking
) << dendl
;
3590 if (in
->wanted_max_size
> in
->max_size
&&
3591 in
->wanted_max_size
> in
->requested_max_size
&&
3592 &cap
== in
->auth_cap
)
3595 /* approaching file_max? */
3596 if ((cap
.issued
& CEPH_CAP_FILE_WR
) &&
3597 &cap
== in
->auth_cap
&&
3598 is_max_size_approaching(in
)) {
3599 ldout(cct
, 10) << "size " << in
->size
<< " approaching max_size " << in
->max_size
3600 << ", reported " << in
->reported_size
<< dendl
;
3604 /* completed revocation? */
3605 if (revoking
&& (revoking
& cap_used
) == 0) {
3606 ldout(cct
, 10) << "completed revocation of " << ccap_string(cap
.implemented
& ~cap
.issued
) << dendl
;
3610 /* want more caps from mds? */
3611 if (wanted
& ~(cap
.wanted
| cap
.issued
))
3614 if (!revoking
&& unmounting
&& (cap_used
== 0))
3617 if ((cap
.issued
& ~retain
) == 0 && // and we don't have anything we wouldn't like
3618 !in
->dirty_caps
) // and we have no dirty caps
3621 if (!(flags
& CHECK_CAPS_NODELAY
)) {
3622 ldout(cct
, 10) << "delaying cap release" << dendl
;
3623 cap_delay_requeue(in
);
3628 if (&cap
== in
->auth_cap
) {
3629 if (in
->flags
& I_KICK_FLUSH
) {
3630 ldout(cct
, 20) << " reflushing caps (check_caps) on " << *in
3631 << " to mds." << mds
<< dendl
;
3632 kick_flushing_caps(in
, session
);
3634 if (!in
->cap_snaps
.empty() &&
3635 in
->cap_snaps
.rbegin()->second
.flush_tid
== 0)
3641 ceph_tid_t flush_tid
;
3642 if (in
->auth_cap
== &cap
&& in
->dirty_caps
) {
3643 flushing
= mark_caps_flushing(in
, &flush_tid
);
3644 if (flags
& CHECK_CAPS_SYNCHRONOUS
)
3645 msg_flags
|= MClientCaps::FLAG_SYNC
;
3651 send_cap(in
, session
, &cap
, msg_flags
, cap_used
, wanted
, retain
,
3652 flushing
, flush_tid
);
3657 void Client::queue_cap_snap(Inode
*in
, SnapContext
& old_snapc
)
3659 int used
= get_caps_used(in
);
3660 int dirty
= in
->caps_dirty();
3661 ldout(cct
, 10) << __func__
<< " " << *in
<< " snapc " << old_snapc
<< " used " << ccap_string(used
) << dendl
;
3663 if (in
->cap_snaps
.size() &&
3664 in
->cap_snaps
.rbegin()->second
.writing
) {
3665 ldout(cct
, 10) << __func__
<< " already have pending cap_snap on " << *in
<< dendl
;
3667 } else if (in
->caps_dirty() ||
3668 (used
& CEPH_CAP_FILE_WR
) ||
3669 (dirty
& CEPH_CAP_ANY_WR
)) {
3670 const auto &capsnapem
= in
->cap_snaps
.emplace(std::piecewise_construct
, std::make_tuple(old_snapc
.seq
), std::make_tuple(in
));
3671 ceph_assert(capsnapem
.second
); /* element inserted */
3672 CapSnap
&capsnap
= capsnapem
.first
->second
;
3673 capsnap
.context
= old_snapc
;
3674 capsnap
.issued
= in
->caps_issued();
3675 capsnap
.dirty
= in
->caps_dirty();
3677 capsnap
.dirty_data
= (used
& CEPH_CAP_FILE_BUFFER
);
3679 capsnap
.uid
= in
->uid
;
3680 capsnap
.gid
= in
->gid
;
3681 capsnap
.mode
= in
->mode
;
3682 capsnap
.btime
= in
->btime
;
3683 capsnap
.xattrs
= in
->xattrs
;
3684 capsnap
.xattr_version
= in
->xattr_version
;
3685 capsnap
.cap_dirtier_uid
= in
->cap_dirtier_uid
;
3686 capsnap
.cap_dirtier_gid
= in
->cap_dirtier_gid
;
3688 if (used
& CEPH_CAP_FILE_WR
) {
3689 ldout(cct
, 10) << __func__
<< " WR used on " << *in
<< dendl
;
3690 capsnap
.writing
= 1;
3692 finish_cap_snap(in
, capsnap
, used
);
3695 ldout(cct
, 10) << __func__
<< " not dirty|writing on " << *in
<< dendl
;
3699 void Client::finish_cap_snap(Inode
*in
, CapSnap
&capsnap
, int used
)
3701 ldout(cct
, 10) << __func__
<< " " << *in
<< " capsnap " << (void *)&capsnap
<< " used " << ccap_string(used
) << dendl
;
3702 capsnap
.size
= in
->size
;
3703 capsnap
.mtime
= in
->mtime
;
3704 capsnap
.atime
= in
->atime
;
3705 capsnap
.ctime
= in
->ctime
;
3706 capsnap
.time_warp_seq
= in
->time_warp_seq
;
3707 capsnap
.change_attr
= in
->change_attr
;
3708 capsnap
.dirty
|= in
->caps_dirty();
3710 /* Only reset it if it wasn't set before */
3711 if (capsnap
.cap_dirtier_uid
== -1) {
3712 capsnap
.cap_dirtier_uid
= in
->cap_dirtier_uid
;
3713 capsnap
.cap_dirtier_gid
= in
->cap_dirtier_gid
;
3716 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
3717 capsnap
.inline_data
= in
->inline_data
;
3718 capsnap
.inline_version
= in
->inline_version
;
3721 if (used
& CEPH_CAP_FILE_BUFFER
) {
3722 ldout(cct
, 10) << __func__
<< " " << *in
<< " cap_snap " << &capsnap
<< " used " << used
3723 << " WRBUFFER, delaying" << dendl
;
3725 capsnap
.dirty_data
= 0;
3730 void Client::_flushed_cap_snap(Inode
*in
, snapid_t seq
)
3732 ldout(cct
, 10) << __func__
<< " seq " << seq
<< " on " << *in
<< dendl
;
3733 in
->cap_snaps
.at(seq
).dirty_data
= 0;
3737 void Client::send_flush_snap(Inode
*in
, MetaSession
*session
,
3738 snapid_t follows
, CapSnap
& capsnap
)
3740 auto m
= make_message
<MClientCaps
>(CEPH_CAP_OP_FLUSHSNAP
,
3741 in
->ino
, in
->snaprealm
->ino
, 0,
3742 in
->auth_cap
->mseq
, cap_epoch_barrier
);
3743 m
->caller_uid
= capsnap
.cap_dirtier_uid
;
3744 m
->caller_gid
= capsnap
.cap_dirtier_gid
;
3746 m
->set_client_tid(capsnap
.flush_tid
);
3747 m
->head
.snap_follows
= follows
;
3749 m
->head
.caps
= capsnap
.issued
;
3750 m
->head
.dirty
= capsnap
.dirty
;
3752 m
->head
.uid
= capsnap
.uid
;
3753 m
->head
.gid
= capsnap
.gid
;
3754 m
->head
.mode
= capsnap
.mode
;
3755 m
->btime
= capsnap
.btime
;
3757 m
->size
= capsnap
.size
;
3759 m
->head
.xattr_version
= capsnap
.xattr_version
;
3760 encode(capsnap
.xattrs
, m
->xattrbl
);
3762 m
->ctime
= capsnap
.ctime
;
3763 m
->btime
= capsnap
.btime
;
3764 m
->mtime
= capsnap
.mtime
;
3765 m
->atime
= capsnap
.atime
;
3766 m
->time_warp_seq
= capsnap
.time_warp_seq
;
3767 m
->change_attr
= capsnap
.change_attr
;
3769 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
3770 m
->inline_version
= in
->inline_version
;
3771 m
->inline_data
= in
->inline_data
;
3774 ceph_assert(!session
->flushing_caps_tids
.empty());
3775 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
3777 session
->con
->send_message2(std::move(m
));
3780 void Client::flush_snaps(Inode
*in
)
3782 ldout(cct
, 10) << "flush_snaps on " << *in
<< dendl
;
3783 ceph_assert(in
->cap_snaps
.size());
3786 ceph_assert(in
->auth_cap
);
3787 MetaSession
*session
= in
->auth_cap
->session
;
3789 for (auto &p
: in
->cap_snaps
) {
3790 CapSnap
&capsnap
= p
.second
;
3791 // only do new flush
3792 if (capsnap
.flush_tid
> 0)
3795 ldout(cct
, 10) << "flush_snaps mds." << session
->mds_num
3796 << " follows " << p
.first
3797 << " size " << capsnap
.size
3798 << " mtime " << capsnap
.mtime
3799 << " dirty_data=" << capsnap
.dirty_data
3800 << " writing=" << capsnap
.writing
3801 << " on " << *in
<< dendl
;
3802 if (capsnap
.dirty_data
|| capsnap
.writing
)
3805 capsnap
.flush_tid
= ++last_flush_tid
;
3806 session
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
3807 in
->flushing_cap_tids
[capsnap
.flush_tid
] = 0;
3808 if (!in
->flushing_cap_item
.is_on_list())
3809 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
3811 send_flush_snap(in
, session
, p
.first
, capsnap
);
3815 void Client::wait_on_list(list
<ceph::condition_variable
*>& ls
)
3817 ceph::condition_variable cond
;
3818 ls
.push_back(&cond
);
3819 std::unique_lock l
{client_lock
, std::adopt_lock
};
3825 void Client::signal_cond_list(list
<ceph::condition_variable
*>& ls
)
3827 for (auto cond
: ls
) {
3832 void Client::wait_on_context_list(list
<Context
*>& ls
)
3834 ceph::condition_variable cond
;
3837 ls
.push_back(new C_Cond(cond
, &done
, &r
));
3838 std::unique_lock l
{client_lock
, std::adopt_lock
};
3839 cond
.wait(l
, [&done
] { return done
;});
3843 void Client::signal_context_list(list
<Context
*>& ls
)
3845 while (!ls
.empty()) {
3846 ls
.front()->complete(0);
3851 void Client::wake_up_session_caps(MetaSession
*s
, bool reconnect
)
3853 for (const auto &cap
: s
->caps
) {
3854 auto &in
= cap
->inode
;
3856 in
.requested_max_size
= 0;
3857 in
.wanted_max_size
= 0;
3859 if (cap
->gen
< s
->cap_gen
) {
3860 // mds did not re-issue stale cap.
3861 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
3862 // make sure mds knows what we want.
3863 if (in
.caps_file_wanted() & ~cap
->wanted
)
3864 in
.flags
|= I_CAP_DROPPED
;
3867 signal_cond_list(in
.waitfor_caps
);
3872 // flush dirty data (from objectcache)
3874 class C_Client_CacheInvalidate
: public Context
{
3878 int64_t offset
, length
;
3880 C_Client_CacheInvalidate(Client
*c
, Inode
*in
, int64_t off
, int64_t len
) :
3881 client(c
), offset(off
), length(len
) {
3882 if (client
->use_faked_inos())
3883 ino
= vinodeno_t(in
->faked_ino
, CEPH_NOSNAP
);
3887 void finish(int r
) override
{
3888 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
3889 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
3890 client
->_async_invalidate(ino
, offset
, length
);
3894 void Client::_async_invalidate(vinodeno_t ino
, int64_t off
, int64_t len
)
3898 ldout(cct
, 10) << __func__
<< " " << ino
<< " " << off
<< "~" << len
<< dendl
;
3899 ino_invalidate_cb(callback_handle
, ino
, off
, len
);
3902 void Client::_schedule_invalidate_callback(Inode
*in
, int64_t off
, int64_t len
) {
3904 if (ino_invalidate_cb
)
3905 // we queue the invalidate, which calls the callback and decrements the ref
3906 async_ino_invalidator
.queue(new C_Client_CacheInvalidate(this, in
, off
, len
));
3909 void Client::_invalidate_inode_cache(Inode
*in
)
3911 ldout(cct
, 10) << __func__
<< " " << *in
<< dendl
;
3913 // invalidate our userspace inode cache
3914 if (cct
->_conf
->client_oc
) {
3915 objectcacher
->release_set(&in
->oset
);
3916 if (!objectcacher
->set_is_empty(&in
->oset
))
3917 lderr(cct
) << "failed to invalidate cache for " << *in
<< dendl
;
3920 _schedule_invalidate_callback(in
, 0, 0);
3923 void Client::_invalidate_inode_cache(Inode
*in
, int64_t off
, int64_t len
)
3925 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
3927 // invalidate our userspace inode cache
3928 if (cct
->_conf
->client_oc
) {
3929 vector
<ObjectExtent
> ls
;
3930 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, len
, in
->truncate_size
, ls
);
3931 objectcacher
->discard_writeback(&in
->oset
, ls
, nullptr);
3934 _schedule_invalidate_callback(in
, off
, len
);
3937 bool Client::_release(Inode
*in
)
3939 ldout(cct
, 20) << "_release " << *in
<< dendl
;
3940 if (in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
3941 _invalidate_inode_cache(in
);
3947 bool Client::_flush(Inode
*in
, Context
*onfinish
)
3949 ldout(cct
, 10) << "_flush " << *in
<< dendl
;
3951 if (!in
->oset
.dirty_or_tx
) {
3952 ldout(cct
, 10) << " nothing to flush" << dendl
;
3953 onfinish
->complete(0);
3957 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
3958 ldout(cct
, 8) << __func__
<< ": FULL, purging for ENOSPC" << dendl
;
3959 objectcacher
->purge_set(&in
->oset
);
3961 onfinish
->complete(-ENOSPC
);
3966 return objectcacher
->flush_set(&in
->oset
, onfinish
);
3969 void Client::_flush_range(Inode
*in
, int64_t offset
, uint64_t size
)
3971 ceph_assert(ceph_mutex_is_locked(client_lock
));
3972 if (!in
->oset
.dirty_or_tx
) {
3973 ldout(cct
, 10) << " nothing to flush" << dendl
;
3977 C_SaferCond
onflush("Client::_flush_range flock");
3978 bool ret
= objectcacher
->file_flush(&in
->oset
, &in
->layout
, in
->snaprealm
->get_snap_context(),
3979 offset
, size
, &onflush
);
3982 client_lock
.unlock();
3988 void Client::flush_set_callback(ObjectCacher::ObjectSet
*oset
)
3990 // std::lock_guard l(client_lock);
3991 ceph_assert(ceph_mutex_is_locked(client_lock
)); // will be called via dispatch() -> objecter -> ...
3992 Inode
*in
= static_cast<Inode
*>(oset
->parent
);
3997 void Client::_flushed(Inode
*in
)
3999 ldout(cct
, 10) << "_flushed " << *in
<< dendl
;
4001 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
4006 // checks common to add_update_cap, handle_cap_grant
4007 void Client::check_cap_issue(Inode
*in
, unsigned issued
)
4009 unsigned had
= in
->caps_issued();
4011 if ((issued
& CEPH_CAP_FILE_CACHE
) &&
4012 !(had
& CEPH_CAP_FILE_CACHE
))
4015 if ((issued
& CEPH_CAP_FILE_SHARED
) &&
4016 !(had
& CEPH_CAP_FILE_SHARED
)) {
4020 clear_dir_complete_and_ordered(in
, true);
4024 void Client::add_update_cap(Inode
*in
, MetaSession
*mds_session
, uint64_t cap_id
,
4025 unsigned issued
, unsigned wanted
, unsigned seq
, unsigned mseq
,
4026 inodeno_t realm
, int flags
, const UserPerm
& cap_perms
)
4028 if (!in
->is_any_caps()) {
4029 ceph_assert(in
->snaprealm
== 0);
4030 in
->snaprealm
= get_snap_realm(realm
);
4031 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4032 ldout(cct
, 15) << __func__
<< " first one, opened snaprealm " << in
->snaprealm
<< dendl
;
4034 ceph_assert(in
->snaprealm
);
4035 if ((flags
& CEPH_CAP_FLAG_AUTH
) &&
4036 realm
!= inodeno_t(-1) && in
->snaprealm
->ino
!= realm
) {
4037 in
->snaprealm_item
.remove_myself();
4038 auto oldrealm
= in
->snaprealm
;
4039 in
->snaprealm
= get_snap_realm(realm
);
4040 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4041 put_snap_realm(oldrealm
);
4045 mds_rank_t mds
= mds_session
->mds_num
;
4046 const auto &capem
= in
->caps
.emplace(std::piecewise_construct
, std::forward_as_tuple(mds
), std::forward_as_tuple(*in
, mds_session
));
4047 Cap
&cap
= capem
.first
->second
;
4048 if (!capem
.second
) {
4049 if (cap
.gen
< mds_session
->cap_gen
)
4050 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
4053 * auth mds of the inode changed. we received the cap export
4054 * message, but still haven't received the cap import message.
4055 * handle_cap_export() updated the new auth MDS' cap.
4057 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4058 * a message that was send before the cap import message. So
4059 * don't remove caps.
4061 if (ceph_seq_cmp(seq
, cap
.seq
) <= 0) {
4062 if (&cap
!= in
->auth_cap
)
4063 ldout(cct
, 0) << "WARNING: " << "inode " << *in
<< " caps on mds." << mds
<< " != auth_cap." << dendl
;
4065 ceph_assert(cap
.cap_id
== cap_id
);
4068 issued
|= cap
.issued
;
4069 flags
|= CEPH_CAP_FLAG_AUTH
;
4073 check_cap_issue(in
, issued
);
4075 if (flags
& CEPH_CAP_FLAG_AUTH
) {
4076 if (in
->auth_cap
!= &cap
&&
4077 (!in
->auth_cap
|| ceph_seq_cmp(in
->auth_cap
->mseq
, mseq
) < 0)) {
4078 if (in
->auth_cap
&& in
->flushing_cap_item
.is_on_list()) {
4079 ldout(cct
, 10) << __func__
<< " changing auth cap: "
4080 << "add myself to new auth MDS' flushing caps list" << dendl
;
4081 adjust_session_flushing_caps(in
, in
->auth_cap
->session
, mds_session
);
4083 in
->auth_cap
= &cap
;
4087 unsigned old_caps
= cap
.issued
;
4088 cap
.cap_id
= cap_id
;
4089 cap
.issued
= issued
;
4090 cap
.implemented
|= issued
;
4091 if (ceph_seq_cmp(mseq
, cap
.mseq
) > 0)
4092 cap
.wanted
= wanted
;
4094 cap
.wanted
|= wanted
;
4096 cap
.issue_seq
= seq
;
4098 cap
.gen
= mds_session
->cap_gen
;
4099 cap
.latest_perms
= cap_perms
;
4100 ldout(cct
, 10) << __func__
<< " issued " << ccap_string(old_caps
) << " -> " << ccap_string(cap
.issued
)
4101 << " from mds." << mds
4105 if ((issued
& ~old_caps
) && in
->auth_cap
== &cap
) {
4106 // non-auth MDS is revoking the newly grant caps ?
4107 for (auto &p
: in
->caps
) {
4108 if (&p
.second
== &cap
)
4110 if (p
.second
.implemented
& ~p
.second
.issued
& issued
) {
4111 check_caps(in
, CHECK_CAPS_NODELAY
);
4117 if (issued
& ~old_caps
)
4118 signal_cond_list(in
->waitfor_caps
);
4121 void Client::remove_cap(Cap
*cap
, bool queue_release
)
4123 auto &in
= cap
->inode
;
4124 MetaSession
*session
= cap
->session
;
4125 mds_rank_t mds
= cap
->session
->mds_num
;
4127 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " on " << in
<< dendl
;
4129 if (queue_release
) {
4130 session
->enqueue_cap_release(
4138 if (in
.auth_cap
== cap
) {
4139 if (in
.flushing_cap_item
.is_on_list()) {
4140 ldout(cct
, 10) << " removing myself from flushing_cap list" << dendl
;
4141 in
.flushing_cap_item
.remove_myself();
4145 size_t n
= in
.caps
.erase(mds
);
4146 ceph_assert(n
== 1);
4149 if (!in
.is_any_caps()) {
4150 ldout(cct
, 15) << __func__
<< " last one, closing snaprealm " << in
.snaprealm
<< dendl
;
4151 in
.snaprealm_item
.remove_myself();
4152 put_snap_realm(in
.snaprealm
);
4157 void Client::remove_all_caps(Inode
*in
)
4159 while (!in
->caps
.empty())
4160 remove_cap(&in
->caps
.begin()->second
, true);
4163 void Client::remove_session_caps(MetaSession
*s
)
4165 ldout(cct
, 10) << __func__
<< " mds." << s
->mds_num
<< dendl
;
4167 while (s
->caps
.size()) {
4168 Cap
*cap
= *s
->caps
.begin();
4169 InodeRef
in(&cap
->inode
);
4170 bool dirty_caps
= false;
4171 if (in
->auth_cap
== cap
) {
4172 dirty_caps
= in
->dirty_caps
| in
->flushing_caps
;
4173 in
->wanted_max_size
= 0;
4174 in
->requested_max_size
= 0;
4176 if (cap
->wanted
| cap
->issued
)
4177 in
->flags
|= I_CAP_DROPPED
;
4178 remove_cap(cap
, false);
4179 in
->cap_snaps
.clear();
4181 lderr(cct
) << __func__
<< " still has dirty|flushing caps on " << *in
<< dendl
;
4182 if (in
->flushing_caps
) {
4183 num_flushing_caps
--;
4184 in
->flushing_cap_tids
.clear();
4186 in
->flushing_caps
= 0;
4187 in
->mark_caps_clean();
4188 put_inode(in
.get());
4190 signal_cond_list(in
->waitfor_caps
);
4192 s
->flushing_caps_tids
.clear();
4193 sync_cond
.notify_all();
4196 int Client::_do_remount(bool retry_on_error
)
4198 uint64_t max_retries
= g_conf().get_val
<uint64_t>("mds_max_retries_on_remount_failure");
4201 int r
= remount_cb(callback_handle
);
4203 retries_on_invalidate
= 0;
4206 client_t whoami
= get_nodeid();
4209 "failed to remount (to trim kernel dentries): "
4210 "errno = " << e
<< " (" << strerror(e
) << ")" << dendl
;
4213 "failed to remount (to trim kernel dentries): "
4214 "return code = " << r
<< dendl
;
4217 (cct
->_conf
.get_val
<bool>("client_die_on_failed_remount") ||
4218 cct
->_conf
.get_val
<bool>("client_die_on_failed_dentry_invalidate")) &&
4219 !(retry_on_error
&& (++retries_on_invalidate
< max_retries
));
4220 if (should_abort
&& !unmounting
) {
4221 lderr(cct
) << "failed to remount for kernel dentry trimming; quitting!" << dendl
;
4228 class C_Client_Remount
: public Context
{
4232 explicit C_Client_Remount(Client
*c
) : client(c
) {}
4233 void finish(int r
) override
{
4234 ceph_assert(r
== 0);
4235 client
->_do_remount(true);
4239 void Client::_invalidate_kernel_dcache()
4243 if (can_invalidate_dentries
) {
4244 if (dentry_invalidate_cb
&& root
->dir
) {
4245 for (ceph::unordered_map
<string
, Dentry
*>::iterator p
= root
->dir
->dentries
.begin();
4246 p
!= root
->dir
->dentries
.end();
4248 if (p
->second
->inode
)
4249 _schedule_invalidate_dentry_callback(p
->second
, false);
4252 } else if (remount_cb
) {
4254 // when remounting a file system, linux kernel trims all unused dentries in the fs
4255 remount_finisher
.queue(new C_Client_Remount(this));
4259 void Client::_trim_negative_child_dentries(InodeRef
& in
)
4265 if (dir
&& dir
->dentries
.size() == dir
->num_null_dentries
) {
4266 for (auto p
= dir
->dentries
.begin(); p
!= dir
->dentries
.end(); ) {
4267 Dentry
*dn
= p
->second
;
4269 ceph_assert(!dn
->inode
);
4270 if (dn
->lru_is_expireable())
4271 unlink(dn
, true, false); // keep dir, drop dentry
4273 if (dir
->dentries
.empty()) {
4278 if (in
->flags
& I_SNAPDIR_OPEN
) {
4279 InodeRef snapdir
= open_snapdir(in
.get());
4280 _trim_negative_child_dentries(snapdir
);
4284 class C_Client_CacheRelease
: public Context
{
4289 C_Client_CacheRelease(Client
*c
, Inode
*in
) :
4291 if (client
->use_faked_inos())
4292 ino
= vinodeno_t(in
->faked_ino
, CEPH_NOSNAP
);
4296 void finish(int r
) override
{
4297 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
4298 client
->_async_inode_release(ino
);
4302 void Client::_async_inode_release(vinodeno_t ino
)
4306 ldout(cct
, 10) << __func__
<< " " << ino
<< dendl
;
4307 ino_release_cb(callback_handle
, ino
);
4310 void Client::_schedule_ino_release_callback(Inode
*in
) {
4313 // we queue the invalidate, which calls the callback and decrements the ref
4314 async_ino_releasor
.queue(new C_Client_CacheRelease(this, in
));
4317 void Client::trim_caps(MetaSession
*s
, uint64_t max
)
4319 mds_rank_t mds
= s
->mds_num
;
4320 size_t caps_size
= s
->caps
.size();
4321 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " max " << max
4322 << " caps " << caps_size
<< dendl
;
4324 uint64_t trimmed
= 0;
4325 auto p
= s
->caps
.begin();
4326 std::set
<Dentry
*> to_trim
; /* this avoids caps other than the one we're
4327 * looking at from getting deleted during traversal. */
4328 while ((caps_size
- trimmed
) > max
&& !p
.end()) {
4330 InodeRef
in(&cap
->inode
);
4332 // Increment p early because it will be invalidated if cap
4333 // is deleted inside remove_cap
4336 if (in
->caps
.size() > 1 && cap
!= in
->auth_cap
) {
4337 int mine
= cap
->issued
| cap
->implemented
;
4338 int oissued
= in
->auth_cap
? in
->auth_cap
->issued
: 0;
4339 // disposable non-auth cap
4340 if (!(get_caps_used(in
.get()) & ~oissued
& mine
)) {
4341 ldout(cct
, 20) << " removing unused, unneeded non-auth cap on " << *in
<< dendl
;
4342 cap
= (remove_cap(cap
, true), nullptr);
4346 ldout(cct
, 20) << " trying to trim dentries for " << *in
<< dendl
;
4347 _trim_negative_child_dentries(in
);
4349 auto q
= in
->dentries
.begin();
4350 while (q
!= in
->dentries
.end()) {
4353 if (dn
->lru_is_expireable()) {
4354 if (can_invalidate_dentries
&&
4355 dn
->dir
->parent_inode
->ino
== MDS_INO_ROOT
) {
4356 // Only issue one of these per DN for inodes in root: handle
4357 // others more efficiently by calling for root-child DNs at
4358 // the end of this function.
4359 _schedule_invalidate_dentry_callback(dn
, true);
4361 ldout(cct
, 20) << " queueing dentry for trimming: " << dn
->name
<< dendl
;
4364 ldout(cct
, 20) << " not expirable: " << dn
->name
<< dendl
;
4368 if (all
&& in
->ino
!= MDS_INO_ROOT
) {
4369 ldout(cct
, 20) << __func__
<< " counting as trimmed: " << *in
<< dendl
;
4371 _schedule_ino_release_callback(in
.get());
4375 ldout(cct
, 20) << " trimming queued dentries: " << dendl
;
4376 for (const auto &dn
: to_trim
) {
4381 caps_size
= s
->caps
.size();
4382 if (caps_size
> (size_t)max
)
4383 _invalidate_kernel_dcache();
4386 void Client::force_session_readonly(MetaSession
*s
)
4389 for (xlist
<Cap
*>::iterator p
= s
->caps
.begin(); !p
.end(); ++p
) {
4390 auto &in
= (*p
)->inode
;
4391 if (in
.caps_wanted() & CEPH_CAP_FILE_WR
)
4392 signal_cond_list(in
.waitfor_caps
);
4396 int Client::mark_caps_flushing(Inode
*in
, ceph_tid_t
* ptid
)
4398 MetaSession
*session
= in
->auth_cap
->session
;
4400 int flushing
= in
->dirty_caps
;
4401 ceph_assert(flushing
);
4403 ceph_tid_t flush_tid
= ++last_flush_tid
;
4404 in
->flushing_cap_tids
[flush_tid
] = flushing
;
4406 if (!in
->flushing_caps
) {
4407 ldout(cct
, 10) << __func__
<< " " << ccap_string(flushing
) << " " << *in
<< dendl
;
4408 num_flushing_caps
++;
4410 ldout(cct
, 10) << __func__
<< " (more) " << ccap_string(flushing
) << " " << *in
<< dendl
;
4413 in
->flushing_caps
|= flushing
;
4414 in
->mark_caps_clean();
4416 if (!in
->flushing_cap_item
.is_on_list())
4417 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4418 session
->flushing_caps_tids
.insert(flush_tid
);
4424 void Client::adjust_session_flushing_caps(Inode
*in
, MetaSession
*old_s
, MetaSession
*new_s
)
4426 for (auto &p
: in
->cap_snaps
) {
4427 CapSnap
&capsnap
= p
.second
;
4428 if (capsnap
.flush_tid
> 0) {
4429 old_s
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
4430 new_s
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
4433 for (map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4434 it
!= in
->flushing_cap_tids
.end();
4436 old_s
->flushing_caps_tids
.erase(it
->first
);
4437 new_s
->flushing_caps_tids
.insert(it
->first
);
4439 new_s
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4443 * Flush all caps back to the MDS. Because the callers generally wait on the
4444 * result of this function (syncfs and umount cases), we set
4445 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4447 void Client::flush_caps_sync()
4449 ldout(cct
, 10) << __func__
<< dendl
;
4450 xlist
<Inode
*>::iterator p
= delayed_list
.begin();
4452 unsigned flags
= CHECK_CAPS_NODELAY
;
4456 delayed_list
.pop_front();
4457 if (p
.end() && dirty_list
.empty())
4458 flags
|= CHECK_CAPS_SYNCHRONOUS
;
4459 check_caps(in
, flags
);
4463 p
= dirty_list
.begin();
4465 unsigned flags
= CHECK_CAPS_NODELAY
;
4470 flags
|= CHECK_CAPS_SYNCHRONOUS
;
4471 check_caps(in
, flags
);
4475 void Client::wait_sync_caps(Inode
*in
, ceph_tid_t want
)
4477 while (in
->flushing_caps
) {
4478 map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4479 ceph_assert(it
!= in
->flushing_cap_tids
.end());
4480 if (it
->first
> want
)
4482 ldout(cct
, 10) << __func__
<< " on " << *in
<< " flushing "
4483 << ccap_string(it
->second
) << " want " << want
4484 << " last " << it
->first
<< dendl
;
4485 wait_on_list(in
->waitfor_caps
);
4489 void Client::wait_sync_caps(ceph_tid_t want
)
4492 ldout(cct
, 10) << __func__
<< " want " << want
<< " (last is " << last_flush_tid
<< ", "
4493 << num_flushing_caps
<< " total flushing)" << dendl
;
4494 for (auto &p
: mds_sessions
) {
4495 MetaSession
*s
= &p
.second
;
4496 if (s
->flushing_caps_tids
.empty())
4498 ceph_tid_t oldest_tid
= *s
->flushing_caps_tids
.begin();
4499 if (oldest_tid
<= want
) {
4500 ldout(cct
, 10) << " waiting on mds." << p
.first
<< " tid " << oldest_tid
4501 << " (want " << want
<< ")" << dendl
;
4502 std::unique_lock l
{client_lock
, std::adopt_lock
};
4510 void Client::kick_flushing_caps(Inode
*in
, MetaSession
*session
)
4512 in
->flags
&= ~I_KICK_FLUSH
;
4514 Cap
*cap
= in
->auth_cap
;
4515 ceph_assert(cap
->session
== session
);
4517 ceph_tid_t last_snap_flush
= 0;
4518 for (auto p
= in
->flushing_cap_tids
.rbegin();
4519 p
!= in
->flushing_cap_tids
.rend();
4522 last_snap_flush
= p
->first
;
4527 int wanted
= in
->caps_wanted();
4528 int used
= get_caps_used(in
) | in
->caps_dirty();
4529 auto it
= in
->cap_snaps
.begin();
4530 for (auto& p
: in
->flushing_cap_tids
) {
4532 int msg_flags
= p
.first
< last_snap_flush
? MClientCaps::FLAG_PENDING_CAPSNAP
: 0;
4533 send_cap(in
, session
, cap
, msg_flags
, used
, wanted
, (cap
->issued
| cap
->implemented
),
4536 ceph_assert(it
!= in
->cap_snaps
.end());
4537 ceph_assert(it
->second
.flush_tid
== p
.first
);
4538 send_flush_snap(in
, session
, it
->first
, it
->second
);
4544 void Client::kick_flushing_caps(MetaSession
*session
)
4546 mds_rank_t mds
= session
->mds_num
;
4547 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
4549 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4551 if (in
->flags
& I_KICK_FLUSH
) {
4552 ldout(cct
, 20) << " reflushing caps on " << *in
<< " to mds." << mds
<< dendl
;
4553 kick_flushing_caps(in
, session
);
4558 void Client::early_kick_flushing_caps(MetaSession
*session
)
4560 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4562 Cap
*cap
= in
->auth_cap
;
4565 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4566 // stage. This guarantees that MDS processes the cap flush message before issuing
4567 // the flushing caps to other client.
4568 if ((in
->flushing_caps
& in
->auth_cap
->issued
) == in
->flushing_caps
) {
4569 in
->flags
|= I_KICK_FLUSH
;
4573 ldout(cct
, 20) << " reflushing caps (early_kick) on " << *in
4574 << " to mds." << session
->mds_num
<< dendl
;
4575 // send_reconnect() also will reset these sequence numbers. make sure
4576 // sequence numbers in cap flush message match later reconnect message.
4580 cap
->issued
= cap
->implemented
;
4582 kick_flushing_caps(in
, session
);
4586 void SnapRealm::build_snap_context()
4588 set
<snapid_t
> snaps
;
4589 snapid_t max_seq
= seq
;
4591 // start with prior_parents?
4592 for (unsigned i
=0; i
<prior_parent_snaps
.size(); i
++)
4593 snaps
.insert(prior_parent_snaps
[i
]);
4595 // current parent's snaps
4597 const SnapContext
& psnapc
= pparent
->get_snap_context();
4598 for (unsigned i
=0; i
<psnapc
.snaps
.size(); i
++)
4599 if (psnapc
.snaps
[i
] >= parent_since
)
4600 snaps
.insert(psnapc
.snaps
[i
]);
4601 if (psnapc
.seq
> max_seq
)
4602 max_seq
= psnapc
.seq
;
4606 for (unsigned i
=0; i
<my_snaps
.size(); i
++)
4607 snaps
.insert(my_snaps
[i
]);
4610 cached_snap_context
.seq
= max_seq
;
4611 cached_snap_context
.snaps
.resize(0);
4612 cached_snap_context
.snaps
.reserve(snaps
.size());
4613 for (set
<snapid_t
>::reverse_iterator p
= snaps
.rbegin(); p
!= snaps
.rend(); ++p
)
4614 cached_snap_context
.snaps
.push_back(*p
);
4617 void Client::invalidate_snaprealm_and_children(SnapRealm
*realm
)
4622 while (!q
.empty()) {
4626 ldout(cct
, 10) << __func__
<< " " << *realm
<< dendl
;
4627 realm
->invalidate_cache();
4629 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
4630 p
!= realm
->pchildren
.end();
4636 SnapRealm
*Client::get_snap_realm(inodeno_t r
)
4638 SnapRealm
*realm
= snap_realms
[r
];
4640 snap_realms
[r
] = realm
= new SnapRealm(r
);
4641 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
4646 SnapRealm
*Client::get_snap_realm_maybe(inodeno_t r
)
4648 if (snap_realms
.count(r
) == 0) {
4649 ldout(cct
, 20) << __func__
<< " " << r
<< " fail" << dendl
;
4652 SnapRealm
*realm
= snap_realms
[r
];
4653 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
4658 void Client::put_snap_realm(SnapRealm
*realm
)
4660 ldout(cct
, 20) << __func__
<< " " << realm
->ino
<< " " << realm
4661 << " " << realm
->nref
<< " -> " << (realm
->nref
- 1) << dendl
;
4662 if (--realm
->nref
== 0) {
4663 snap_realms
.erase(realm
->ino
);
4664 if (realm
->pparent
) {
4665 realm
->pparent
->pchildren
.erase(realm
);
4666 put_snap_realm(realm
->pparent
);
4672 bool Client::adjust_realm_parent(SnapRealm
*realm
, inodeno_t parent
)
4674 if (realm
->parent
!= parent
) {
4675 ldout(cct
, 10) << __func__
<< " " << *realm
4676 << " " << realm
->parent
<< " -> " << parent
<< dendl
;
4677 realm
->parent
= parent
;
4678 if (realm
->pparent
) {
4679 realm
->pparent
->pchildren
.erase(realm
);
4680 put_snap_realm(realm
->pparent
);
4682 realm
->pparent
= get_snap_realm(parent
);
4683 realm
->pparent
->pchildren
.insert(realm
);
4689 static bool has_new_snaps(const SnapContext
& old_snapc
,
4690 const SnapContext
& new_snapc
)
4692 return !new_snapc
.snaps
.empty() && new_snapc
.snaps
[0] > old_snapc
.seq
;
4696 void Client::update_snap_trace(const bufferlist
& bl
, SnapRealm
**realm_ret
, bool flush
)
4698 SnapRealm
*first_realm
= NULL
;
4699 ldout(cct
, 10) << __func__
<< " len " << bl
.length() << dendl
;
4701 map
<SnapRealm
*, SnapContext
> dirty_realms
;
4703 auto p
= bl
.cbegin();
4707 SnapRealm
*realm
= get_snap_realm(info
.ino());
4709 bool invalidate
= false;
4711 if (info
.seq() > realm
->seq
) {
4712 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq() << " > " << realm
->seq
4716 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4717 // flush me + children
4720 while (!q
.empty()) {
4721 SnapRealm
*realm
= q
.front();
4724 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
4725 p
!= realm
->pchildren
.end();
4729 if (dirty_realms
.count(realm
) == 0) {
4731 dirty_realms
[realm
] = realm
->get_snap_context();
4737 realm
->seq
= info
.seq();
4738 realm
->created
= info
.created();
4739 realm
->parent_since
= info
.parent_since();
4740 realm
->prior_parent_snaps
= info
.prior_parent_snaps
;
4741 realm
->my_snaps
= info
.my_snaps
;
4745 // _always_ verify parent
4746 if (adjust_realm_parent(realm
, info
.parent()))
4750 invalidate_snaprealm_and_children(realm
);
4751 ldout(cct
, 15) << __func__
<< " " << *realm
<< " self|parent updated" << dendl
;
4752 ldout(cct
, 15) << " snapc " << realm
->get_snap_context() << dendl
;
4754 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq()
4755 << " <= " << realm
->seq
<< " and same parent, SKIPPING" << dendl
;
4759 first_realm
= realm
;
4761 put_snap_realm(realm
);
4764 for (map
<SnapRealm
*, SnapContext
>::iterator q
= dirty_realms
.begin();
4765 q
!= dirty_realms
.end();
4767 SnapRealm
*realm
= q
->first
;
4768 // if there are new snaps ?
4769 if (has_new_snaps(q
->second
, realm
->get_snap_context())) {
4770 ldout(cct
, 10) << " flushing caps on " << *realm
<< dendl
;
4771 xlist
<Inode
*>::iterator r
= realm
->inodes_with_caps
.begin();
4775 queue_cap_snap(in
, q
->second
);
4778 ldout(cct
, 10) << " no new snap on " << *realm
<< dendl
;
4780 put_snap_realm(realm
);
4784 *realm_ret
= first_realm
;
4786 put_snap_realm(first_realm
);
4789 void Client::handle_snap(const MConstRef
<MClientSnap
>& m
)
4791 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
4792 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4793 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4798 got_mds_push(session
);
4800 map
<Inode
*, SnapContext
> to_move
;
4801 SnapRealm
*realm
= 0;
4803 if (m
->head
.op
== CEPH_SNAP_OP_SPLIT
) {
4804 ceph_assert(m
->head
.split
);
4806 auto p
= m
->bl
.cbegin();
4808 ceph_assert(info
.ino() == m
->head
.split
);
4810 // flush, then move, ino's.
4811 realm
= get_snap_realm(info
.ino());
4812 ldout(cct
, 10) << " splitting off " << *realm
<< dendl
;
4813 for (auto& ino
: m
->split_inos
) {
4814 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
4815 if (inode_map
.count(vino
)) {
4816 Inode
*in
= inode_map
[vino
];
4817 if (!in
->snaprealm
|| in
->snaprealm
== realm
)
4819 if (in
->snaprealm
->created
> info
.created()) {
4820 ldout(cct
, 10) << " NOT moving " << *in
<< " from _newer_ realm "
4821 << *in
->snaprealm
<< dendl
;
4824 ldout(cct
, 10) << " moving " << *in
<< " from " << *in
->snaprealm
<< dendl
;
4827 in
->snaprealm_item
.remove_myself();
4828 to_move
[in
] = in
->snaprealm
->get_snap_context();
4829 put_snap_realm(in
->snaprealm
);
4833 // move child snaprealms, too
4834 for (auto& child_realm
: m
->split_realms
) {
4835 ldout(cct
, 10) << "adjusting snaprealm " << child_realm
<< " parent" << dendl
;
4836 SnapRealm
*child
= get_snap_realm_maybe(child_realm
);
4839 adjust_realm_parent(child
, realm
->ino
);
4840 put_snap_realm(child
);
4844 update_snap_trace(m
->bl
, NULL
, m
->head
.op
!= CEPH_SNAP_OP_DESTROY
);
4847 for (auto p
= to_move
.begin(); p
!= to_move
.end(); ++p
) {
4848 Inode
*in
= p
->first
;
4849 in
->snaprealm
= realm
;
4850 realm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4852 // queue for snap writeback
4853 if (has_new_snaps(p
->second
, realm
->get_snap_context()))
4854 queue_cap_snap(in
, p
->second
);
4856 put_snap_realm(realm
);
4860 void Client::handle_quota(const MConstRef
<MClientQuota
>& m
)
4862 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4863 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4868 got_mds_push(session
);
4870 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << mds
<< dendl
;
4872 vinodeno_t
vino(m
->ino
, CEPH_NOSNAP
);
4873 if (inode_map
.count(vino
)) {
4875 in
= inode_map
[vino
];
4878 in
->quota
= m
->quota
;
4879 in
->rstat
= m
->rstat
;
4884 void Client::handle_caps(const MConstRef
<MClientCaps
>& m
)
4886 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4887 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4892 if (m
->osd_epoch_barrier
&& !objecter
->have_map(m
->osd_epoch_barrier
)) {
4893 // Pause RADOS operations until we see the required epoch
4894 objecter
->set_epoch_barrier(m
->osd_epoch_barrier
);
4897 if (m
->osd_epoch_barrier
> cap_epoch_barrier
) {
4898 // Record the barrier so that we will transmit it to MDS when releasing
4899 set_cap_epoch_barrier(m
->osd_epoch_barrier
);
4902 got_mds_push(session
);
4905 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
4906 if (auto it
= inode_map
.find(vino
); it
!= inode_map
.end()) {
4909 if (m
->get_op() == CEPH_CAP_OP_IMPORT
) {
4910 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< " on IMPORT, immediately releasing" << dendl
;
4911 session
->enqueue_cap_release(
4918 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< ", dropping" << dendl
;
4921 // in case the mds is waiting on e.g. a revocation
4922 flush_cap_releases();
4926 switch (m
->get_op()) {
4927 case CEPH_CAP_OP_EXPORT
: return handle_cap_export(session
, in
, m
);
4928 case CEPH_CAP_OP_FLUSHSNAP_ACK
: return handle_cap_flushsnap_ack(session
, in
, m
);
4929 case CEPH_CAP_OP_IMPORT
: /* no return */ handle_cap_import(session
, in
, m
);
4932 if (auto it
= in
->caps
.find(mds
); it
!= in
->caps
.end()) {
4933 Cap
&cap
= in
->caps
.at(mds
);
4935 switch (m
->get_op()) {
4936 case CEPH_CAP_OP_TRUNC
: return handle_cap_trunc(session
, in
, m
);
4937 case CEPH_CAP_OP_IMPORT
:
4938 case CEPH_CAP_OP_REVOKE
:
4939 case CEPH_CAP_OP_GRANT
: return handle_cap_grant(session
, in
, &cap
, m
);
4940 case CEPH_CAP_OP_FLUSH_ACK
: return handle_cap_flush_ack(session
, in
, &cap
, m
);
4943 ldout(cct
, 5) << __func__
<< " don't have " << *in
<< " cap on mds." << mds
<< dendl
;
4948 void Client::handle_cap_import(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
4950 mds_rank_t mds
= session
->mds_num
;
4952 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
4953 << " IMPORT from mds." << mds
<< dendl
;
4955 const mds_rank_t peer_mds
= mds_rank_t(m
->peer
.mds
);
4958 if (auto it
= in
->caps
.find(peer_mds
); m
->peer
.cap_id
&& it
!= in
->caps
.end()) {
4960 cap_perms
= cap
->latest_perms
;
4964 SnapRealm
*realm
= NULL
;
4965 update_snap_trace(m
->snapbl
, &realm
);
4967 int issued
= m
->get_caps();
4968 int wanted
= m
->get_wanted();
4969 add_update_cap(in
, session
, m
->get_cap_id(),
4970 issued
, wanted
, m
->get_seq(), m
->get_mseq(),
4971 m
->get_realm(), CEPH_CAP_FLAG_AUTH
, cap_perms
);
4973 if (cap
&& cap
->cap_id
== m
->peer
.cap_id
) {
4974 remove_cap(cap
, (m
->peer
.flags
& CEPH_CAP_FLAG_RELEASE
));
4978 put_snap_realm(realm
);
4980 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
4981 if (!(wanted
& CEPH_CAP_ANY_FILE_WR
) ||
4982 in
->requested_max_size
> m
->get_max_size()) {
4983 in
->requested_max_size
= 0;
4984 ldout(cct
, 15) << "reset requested_max_size after cap import" << dendl
;
4986 // reflush any/all caps (if we are now the auth_cap)
4987 kick_flushing_caps(in
, session
);
4991 void Client::handle_cap_export(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
4993 mds_rank_t mds
= session
->mds_num
;
4995 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
4996 << " EXPORT from mds." << mds
<< dendl
;
4998 auto it
= in
->caps
.find(mds
);
4999 if (it
!= in
->caps
.end()) {
5000 Cap
&cap
= it
->second
;
5001 if (cap
.cap_id
== m
->get_cap_id()) {
5002 if (m
->peer
.cap_id
) {
5003 const auto peer_mds
= mds_rank_t(m
->peer
.mds
);
5004 MetaSession
*tsession
= _get_or_open_mds_session(peer_mds
);
5005 auto it
= in
->caps
.find(peer_mds
);
5006 if (it
!= in
->caps
.end()) {
5007 Cap
&tcap
= it
->second
;
5008 if (tcap
.cap_id
== m
->peer
.cap_id
&&
5009 ceph_seq_cmp(tcap
.seq
, m
->peer
.seq
) < 0) {
5010 tcap
.cap_id
= m
->peer
.cap_id
;
5011 tcap
.seq
= m
->peer
.seq
- 1;
5012 tcap
.issue_seq
= tcap
.seq
;
5013 tcap
.issued
|= cap
.issued
;
5014 tcap
.implemented
|= cap
.issued
;
5015 if (&cap
== in
->auth_cap
)
5016 in
->auth_cap
= &tcap
;
5017 if (in
->auth_cap
== &tcap
&& in
->flushing_cap_item
.is_on_list())
5018 adjust_session_flushing_caps(in
, session
, tsession
);
5021 add_update_cap(in
, tsession
, m
->peer
.cap_id
, cap
.issued
, 0,
5022 m
->peer
.seq
- 1, m
->peer
.mseq
, (uint64_t)-1,
5023 &cap
== in
->auth_cap
? CEPH_CAP_FLAG_AUTH
: 0,
5027 if (cap
.wanted
| cap
.issued
)
5028 in
->flags
|= I_CAP_DROPPED
;
5031 remove_cap(&cap
, false);
5036 void Client::handle_cap_trunc(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5038 mds_rank_t mds
= session
->mds_num
;
5039 ceph_assert(in
->caps
.count(mds
));
5041 ldout(cct
, 10) << __func__
<< " on ino " << *in
5042 << " size " << in
->size
<< " -> " << m
->get_size()
5046 in
->caps_issued(&issued
);
5047 issued
|= in
->caps_dirty();
5048 update_inode_file_size(in
, issued
, m
->get_size(),
5049 m
->get_truncate_seq(), m
->get_truncate_size());
5052 void Client::handle_cap_flush_ack(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
5054 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
5055 int dirty
= m
->get_dirty();
5059 auto it
= in
->flushing_cap_tids
.begin();
5060 if (it
->first
< flush_ack_tid
) {
5061 ldout(cct
, 0) << __func__
<< " mds." << session
->mds_num
5062 << " got unexpected flush ack tid " << flush_ack_tid
5063 << " expected is " << it
->first
<< dendl
;
5065 for (; it
!= in
->flushing_cap_tids
.end(); ) {
5071 if (it
->first
== flush_ack_tid
)
5072 cleaned
= it
->second
;
5073 if (it
->first
<= flush_ack_tid
) {
5074 session
->flushing_caps_tids
.erase(it
->first
);
5075 in
->flushing_cap_tids
.erase(it
++);
5079 cleaned
&= ~it
->second
;
5085 ldout(cct
, 5) << __func__
<< " mds." << session
->mds_num
5086 << " cleaned " << ccap_string(cleaned
) << " on " << *in
5087 << " with " << ccap_string(dirty
) << dendl
;
5090 signal_cond_list(in
->waitfor_caps
);
5091 if (session
->flushing_caps_tids
.empty() ||
5092 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5093 sync_cond
.notify_all();
5097 in
->cap_dirtier_uid
= -1;
5098 in
->cap_dirtier_gid
= -1;
5102 ldout(cct
, 10) << " tid " << m
->get_client_tid() << " != any cap bit tids" << dendl
;
5104 if (in
->flushing_caps
) {
5105 ldout(cct
, 5) << " flushing_caps " << ccap_string(in
->flushing_caps
)
5106 << " -> " << ccap_string(in
->flushing_caps
& ~cleaned
) << dendl
;
5107 in
->flushing_caps
&= ~cleaned
;
5108 if (in
->flushing_caps
== 0) {
5109 ldout(cct
, 10) << " " << *in
<< " !flushing" << dendl
;
5110 num_flushing_caps
--;
5111 if (in
->flushing_cap_tids
.empty())
5112 in
->flushing_cap_item
.remove_myself();
5114 if (!in
->caps_dirty())
5121 void Client::handle_cap_flushsnap_ack(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5123 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
5124 mds_rank_t mds
= session
->mds_num
;
5125 ceph_assert(in
->caps
.count(mds
));
5126 snapid_t follows
= m
->get_snap_follows();
5128 if (auto it
= in
->cap_snaps
.find(follows
); it
!= in
->cap_snaps
.end()) {
5129 auto& capsnap
= it
->second
;
5130 if (flush_ack_tid
!= capsnap
.flush_tid
) {
5131 ldout(cct
, 10) << " tid " << flush_ack_tid
<< " != " << capsnap
.flush_tid
<< dendl
;
5133 InodeRef
tmp_ref(in
);
5134 ldout(cct
, 5) << __func__
<< " mds." << mds
<< " flushed snap follows " << follows
5135 << " on " << *in
<< dendl
;
5136 session
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
5137 in
->flushing_cap_tids
.erase(capsnap
.flush_tid
);
5138 if (in
->flushing_caps
== 0 && in
->flushing_cap_tids
.empty())
5139 in
->flushing_cap_item
.remove_myself();
5140 in
->cap_snaps
.erase(it
);
5142 signal_cond_list(in
->waitfor_caps
);
5143 if (session
->flushing_caps_tids
.empty() ||
5144 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5145 sync_cond
.notify_all();
5148 ldout(cct
, 5) << __func__
<< " DUP(?) mds." << mds
<< " flushed snap follows " << follows
5149 << " on " << *in
<< dendl
;
5150 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
5154 class C_Client_DentryInvalidate
: public Context
{
5161 C_Client_DentryInvalidate(Client
*c
, Dentry
*dn
, bool del
) :
5162 client(c
), name(dn
->name
) {
5163 if (client
->use_faked_inos()) {
5164 dirino
.ino
= dn
->dir
->parent_inode
->faked_ino
;
5166 ino
.ino
= dn
->inode
->faked_ino
;
5168 dirino
= dn
->dir
->parent_inode
->vino();
5170 ino
= dn
->inode
->vino();
5173 ino
.ino
= inodeno_t();
5175 void finish(int r
) override
{
5176 // _async_dentry_invalidate is responsible for its own locking
5177 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
5178 client
->_async_dentry_invalidate(dirino
, ino
, name
);
5182 void Client::_async_dentry_invalidate(vinodeno_t dirino
, vinodeno_t ino
, string
& name
)
5186 ldout(cct
, 10) << __func__
<< " '" << name
<< "' ino " << ino
5187 << " in dir " << dirino
<< dendl
;
5188 dentry_invalidate_cb(callback_handle
, dirino
, ino
, name
.c_str(), name
.length());
5191 void Client::_schedule_invalidate_dentry_callback(Dentry
*dn
, bool del
)
5193 if (dentry_invalidate_cb
&& dn
->inode
->ll_ref
> 0)
5194 async_dentry_invalidator
.queue(new C_Client_DentryInvalidate(this, dn
, del
));
5197 void Client::_try_to_trim_inode(Inode
*in
, bool sched_inval
)
5199 int ref
= in
->get_num_ref();
5200 ldout(cct
, 5) << __func__
<< " in " << *in
<<dendl
;
5202 if (in
->dir
&& !in
->dir
->dentries
.empty()) {
5203 for (auto p
= in
->dir
->dentries
.begin();
5204 p
!= in
->dir
->dentries
.end(); ) {
5205 Dentry
*dn
= p
->second
;
5207 /* rmsnap removes whole subtree, need trim inodes recursively.
5208 * we don't need to invalidate dentries recursively. because
5209 * invalidating a directory dentry effectively invalidate
5211 if (in
->snapid
!= CEPH_NOSNAP
&& dn
->inode
&& dn
->inode
->is_dir())
5212 _try_to_trim_inode(dn
->inode
.get(), false);
5214 if (dn
->lru_is_expireable())
5215 unlink(dn
, true, false); // keep dir, drop dentry
5217 if (in
->dir
->dentries
.empty()) {
5223 if (ref
> 0 && (in
->flags
& I_SNAPDIR_OPEN
)) {
5224 InodeRef snapdir
= open_snapdir(in
);
5225 _try_to_trim_inode(snapdir
.get(), false);
5230 auto q
= in
->dentries
.begin();
5231 while (q
!= in
->dentries
.end()) {
5234 if( in
->ll_ref
> 0 && sched_inval
) {
5235 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5236 // so in->dentries doesn't always reflect the state of kernel's dcache.
5237 _schedule_invalidate_dentry_callback(dn
, true);
5239 unlink(dn
, true, true);
5244 void Client::handle_cap_grant(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
5246 mds_rank_t mds
= session
->mds_num
;
5247 int used
= get_caps_used(in
);
5248 int wanted
= in
->caps_wanted();
5250 const unsigned new_caps
= m
->get_caps();
5251 const bool was_stale
= session
->cap_gen
> cap
->gen
;
5252 ldout(cct
, 5) << __func__
<< " on in " << m
->get_ino()
5253 << " mds." << mds
<< " seq " << m
->get_seq()
5254 << " caps now " << ccap_string(new_caps
)
5255 << " was " << ccap_string(cap
->issued
)
5256 << (was_stale
? " (stale)" : "") << dendl
;
5259 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
5260 cap
->seq
= m
->get_seq();
5261 cap
->gen
= session
->cap_gen
;
5263 check_cap_issue(in
, new_caps
);
5267 in
->caps_issued(&issued
);
5268 issued
|= in
->caps_dirty();
5270 if ((new_caps
& CEPH_CAP_AUTH_SHARED
) &&
5271 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
5272 in
->mode
= m
->head
.mode
;
5273 in
->uid
= m
->head
.uid
;
5274 in
->gid
= m
->head
.gid
;
5275 in
->btime
= m
->btime
;
5277 bool deleted_inode
= false;
5278 if ((new_caps
& CEPH_CAP_LINK_SHARED
) &&
5279 !(issued
& CEPH_CAP_LINK_EXCL
)) {
5280 in
->nlink
= m
->head
.nlink
;
5281 if (in
->nlink
== 0 &&
5282 (new_caps
& (CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
)))
5283 deleted_inode
= true;
5285 if (!(issued
& CEPH_CAP_XATTR_EXCL
) &&
5286 m
->xattrbl
.length() &&
5287 m
->head
.xattr_version
> in
->xattr_version
) {
5288 auto p
= m
->xattrbl
.cbegin();
5289 decode(in
->xattrs
, p
);
5290 in
->xattr_version
= m
->head
.xattr_version
;
5293 if ((new_caps
& CEPH_CAP_FILE_SHARED
) && m
->dirstat_is_valid()) {
5294 in
->dirstat
.nfiles
= m
->get_nfiles();
5295 in
->dirstat
.nsubdirs
= m
->get_nsubdirs();
5298 if (new_caps
& CEPH_CAP_ANY_RD
) {
5299 update_inode_file_time(in
, issued
, m
->get_time_warp_seq(),
5300 m
->get_ctime(), m
->get_mtime(), m
->get_atime());
5303 if (new_caps
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
)) {
5304 in
->layout
= m
->get_layout();
5305 update_inode_file_size(in
, issued
, m
->get_size(),
5306 m
->get_truncate_seq(), m
->get_truncate_size());
5309 if (m
->inline_version
> in
->inline_version
) {
5310 in
->inline_data
= m
->inline_data
;
5311 in
->inline_version
= m
->inline_version
;
5314 /* always take a newer change attr */
5315 if (m
->get_change_attr() > in
->change_attr
)
5316 in
->change_attr
= m
->get_change_attr();
5319 if (cap
== in
->auth_cap
&&
5320 (new_caps
& CEPH_CAP_ANY_FILE_WR
) &&
5321 (m
->get_max_size() != in
->max_size
)) {
5322 ldout(cct
, 10) << "max_size " << in
->max_size
<< " -> " << m
->get_max_size() << dendl
;
5323 in
->max_size
= m
->get_max_size();
5324 if (in
->max_size
> in
->wanted_max_size
) {
5325 in
->wanted_max_size
= 0;
5326 in
->requested_max_size
= 0;
5331 if ((was_stale
|| m
->get_op() == CEPH_CAP_OP_IMPORT
) &&
5332 (wanted
& ~(cap
->wanted
| new_caps
))) {
5333 // If mds is importing cap, prior cap messages that update 'wanted'
5334 // may get dropped by mds (migrate seq mismatch).
5336 // We don't send cap message to update 'wanted' if what we want are
5337 // already issued. If mds revokes caps, cap message that releases caps
5338 // also tells mds what we want. But if caps got revoked by mds forcedly
5339 // (session stale). We may haven't told mds what we want.
5345 auto revoked
= cap
->issued
& ~new_caps
;
5347 ldout(cct
, 10) << " revocation of " << ccap_string(revoked
) << dendl
;
5348 cap
->issued
= new_caps
;
5349 cap
->implemented
|= new_caps
;
5351 // recall delegations if we're losing caps necessary for them
5352 if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_RD
))
5353 in
->recall_deleg(false);
5354 else if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_WR
))
5355 in
->recall_deleg(true);
5357 used
= adjust_caps_used_for_lazyio(used
, cap
->issued
, cap
->implemented
);
5358 if ((used
& revoked
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
)) &&
5359 !_flush(in
, new C_Client_FlushComplete(this, in
))) {
5360 // waitin' for flush
5361 } else if (used
& revoked
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
)) {
5365 cap
->wanted
= 0; // don't let check_caps skip sending a response to MDS
5368 } else if (cap
->issued
== new_caps
) {
5369 ldout(cct
, 10) << " caps unchanged at " << ccap_string(cap
->issued
) << dendl
;
5371 ldout(cct
, 10) << " grant, new caps are " << ccap_string(new_caps
& ~cap
->issued
) << dendl
;
5372 cap
->issued
= new_caps
;
5373 cap
->implemented
|= new_caps
;
5375 if (cap
== in
->auth_cap
) {
5376 // non-auth MDS is revoking the newly grant caps ?
5377 for (const auto &p
: in
->caps
) {
5378 if (&p
.second
== cap
)
5380 if (p
.second
.implemented
& ~p
.second
.issued
& new_caps
) {
5393 signal_cond_list(in
->waitfor_caps
);
5395 // may drop inode's last ref
5397 _try_to_trim_inode(in
, true);
5400 int Client::inode_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
5402 if (perms
.uid() == 0)
5405 if (perms
.uid() != in
->uid
&& (in
->mode
& S_IRWXG
)) {
5406 int ret
= _posix_acl_permission(in
, perms
, want
);
5411 // check permissions before doing anything else
5412 if (!in
->check_mode(perms
, want
))
5417 int Client::xattr_permission(Inode
*in
, const char *name
, unsigned want
,
5418 const UserPerm
& perms
)
5420 int r
= _getattr_for_perm(in
, perms
);
5425 if (strncmp(name
, "system.", 7) == 0) {
5426 if ((want
& MAY_WRITE
) && (perms
.uid() != 0 && perms
.uid() != in
->uid
))
5429 r
= inode_permission(in
, perms
, want
);
5432 ldout(cct
, 5) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5436 ostream
& operator<<(ostream
&out
, const UserPerm
& perm
) {
5437 out
<< "UserPerm(uid: " << perm
.uid() << ", gid: " << perm
.gid() << ")";
5441 int Client::may_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
5442 const UserPerm
& perms
)
5444 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5445 int r
= _getattr_for_perm(in
, perms
);
5449 if (mask
& CEPH_SETATTR_SIZE
) {
5450 r
= inode_permission(in
, perms
, MAY_WRITE
);
5456 if (mask
& CEPH_SETATTR_UID
) {
5457 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
|| stx
->stx_uid
!= in
->uid
))
5460 if (mask
& CEPH_SETATTR_GID
) {
5461 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
||
5462 (!perms
.gid_in_groups(stx
->stx_gid
) && stx
->stx_gid
!= in
->gid
)))
5466 if (mask
& CEPH_SETATTR_MODE
) {
5467 if (perms
.uid() != 0 && perms
.uid() != in
->uid
)
5470 gid_t i_gid
= (mask
& CEPH_SETATTR_GID
) ? stx
->stx_gid
: in
->gid
;
5471 if (perms
.uid() != 0 && !perms
.gid_in_groups(i_gid
))
5472 stx
->stx_mode
&= ~S_ISGID
;
5475 if (mask
& (CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
|
5476 CEPH_SETATTR_MTIME
| CEPH_SETATTR_ATIME
)) {
5477 if (perms
.uid() != 0 && perms
.uid() != in
->uid
) {
5478 int check_mask
= CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
;
5479 if (!(mask
& CEPH_SETATTR_MTIME_NOW
))
5480 check_mask
|= CEPH_SETATTR_MTIME
;
5481 if (!(mask
& CEPH_SETATTR_ATIME_NOW
))
5482 check_mask
|= CEPH_SETATTR_ATIME
;
5483 if (check_mask
& mask
) {
5486 r
= inode_permission(in
, perms
, MAY_WRITE
);
5494 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5498 int Client::may_open(Inode
*in
, int flags
, const UserPerm
& perms
)
5500 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5503 if ((flags
& O_ACCMODE
) == O_WRONLY
)
5505 else if ((flags
& O_ACCMODE
) == O_RDWR
)
5506 want
= MAY_READ
| MAY_WRITE
;
5507 else if ((flags
& O_ACCMODE
) == O_RDONLY
)
5509 if (flags
& O_TRUNC
)
5513 switch (in
->mode
& S_IFMT
) {
5518 if (want
& MAY_WRITE
) {
5525 r
= _getattr_for_perm(in
, perms
);
5529 r
= inode_permission(in
, perms
, want
);
5531 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5535 int Client::may_lookup(Inode
*dir
, const UserPerm
& perms
)
5537 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5538 int r
= _getattr_for_perm(dir
, perms
);
5542 r
= inode_permission(dir
, perms
, MAY_EXEC
);
5544 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5548 int Client::may_create(Inode
*dir
, const UserPerm
& perms
)
5550 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5551 int r
= _getattr_for_perm(dir
, perms
);
5555 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5557 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5561 int Client::may_delete(Inode
*dir
, const char *name
, const UserPerm
& perms
)
5563 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << "; name " << name
<< "; " << perms
<< dendl
;
5564 int r
= _getattr_for_perm(dir
, perms
);
5568 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5572 /* 'name == NULL' means rmsnap */
5573 if (perms
.uid() != 0 && name
&& (dir
->mode
& S_ISVTX
)) {
5575 r
= _lookup(dir
, name
, CEPH_CAP_AUTH_SHARED
, &otherin
, perms
);
5578 if (dir
->uid
!= perms
.uid() && otherin
->uid
!= perms
.uid())
5582 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5586 int Client::may_hardlink(Inode
*in
, const UserPerm
& perms
)
5588 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5589 int r
= _getattr_for_perm(in
, perms
);
5593 if (perms
.uid() == 0 || perms
.uid() == in
->uid
) {
5599 if (!S_ISREG(in
->mode
))
5602 if (in
->mode
& S_ISUID
)
5605 if ((in
->mode
& (S_ISGID
| S_IXGRP
)) == (S_ISGID
| S_IXGRP
))
5608 r
= inode_permission(in
, perms
, MAY_READ
| MAY_WRITE
);
5610 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5614 int Client::_getattr_for_perm(Inode
*in
, const UserPerm
& perms
)
5616 int mask
= CEPH_STAT_CAP_MODE
;
5618 if (acl_type
!= NO_ACL
) {
5619 mask
|= CEPH_STAT_CAP_XATTR
;
5620 force
= in
->xattr_version
== 0;
5622 return _getattr(in
, mask
, perms
, force
);
5625 vinodeno_t
Client::_get_vino(Inode
*in
)
5627 /* The caller must hold the client lock */
5628 return vinodeno_t(in
->ino
, in
->snapid
);
5632 * Resolve an MDS spec to a list of MDS daemon GIDs.
5634 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5635 * It may be '*' in which case it matches all GIDs.
5637 * If no error is returned, the `targets` vector will be populated with at least
5640 int Client::resolve_mds(
5641 const std::string
&mds_spec
,
5642 std::vector
<mds_gid_t
> *targets
)
5645 ceph_assert(targets
!= nullptr);
5648 std::stringstream ss
;
5649 int role_r
= fsmap
->parse_role(mds_spec
, &role
, ss
);
5651 // We got a role, resolve it to a GID
5652 ldout(cct
, 10) << __func__
<< ": resolved '" << mds_spec
<< "' to role '"
5653 << role
<< "'" << dendl
;
5655 fsmap
->get_filesystem(role
.fscid
)->mds_map
.get_info(role
.rank
).global_id
);
5659 std::string strtol_err
;
5660 long long rank_or_gid
= strict_strtoll(mds_spec
.c_str(), 10, &strtol_err
);
5661 if (strtol_err
.empty()) {
5662 // It is a possible GID
5663 const mds_gid_t mds_gid
= mds_gid_t(rank_or_gid
);
5664 if (fsmap
->gid_exists(mds_gid
)) {
5665 ldout(cct
, 10) << __func__
<< ": validated GID " << mds_gid
<< dendl
;
5666 targets
->push_back(mds_gid
);
5668 lderr(cct
) << __func__
<< ": GID " << mds_gid
<< " not in MDS map"
5672 } else if (mds_spec
== "*") {
5673 // It is a wildcard: use all MDSs
5674 const auto mds_info
= fsmap
->get_mds_info();
5676 if (mds_info
.empty()) {
5677 lderr(cct
) << __func__
<< ": * passed but no MDS daemons found" << dendl
;
5681 for (const auto i
: mds_info
) {
5682 targets
->push_back(i
.first
);
5685 // It did not parse as an integer, it is not a wildcard, it must be a name
5686 const mds_gid_t mds_gid
= fsmap
->find_mds_gid_by_name(mds_spec
);
5688 lderr(cct
) << "MDS ID '" << mds_spec
<< "' not found" << dendl
;
5690 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
5694 ldout(cct
, 10) << __func__
<< ": resolved ID '" << mds_spec
5695 << "' to GID " << mds_gid
<< dendl
;
5696 targets
->push_back(mds_gid
);
5705 * Authenticate with mon and establish global ID
5707 int Client::authenticate()
5709 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
5711 if (monclient
->is_authenticated()) {
5715 client_lock
.unlock();
5716 int r
= monclient
->authenticate(cct
->_conf
->client_mount_timeout
);
5722 whoami
= monclient
->get_global_id();
5723 messenger
->set_myname(entity_name_t::CLIENT(whoami
.v
));
5728 int Client::fetch_fsmap(bool user
)
5731 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5732 // rather than MDSMap because no one MDSMap contains all the daemons, and
5733 // a `tell` can address any daemon.
5734 version_t fsmap_latest
;
5737 monclient
->get_version("fsmap", &fsmap_latest
, NULL
, &cond
);
5738 client_lock
.unlock();
5741 } while (r
== -EAGAIN
);
5744 lderr(cct
) << "Failed to learn FSMap version: " << cpp_strerror(r
) << dendl
;
5748 ldout(cct
, 10) << __func__
<< " learned FSMap version " << fsmap_latest
<< dendl
;
5751 if (!fsmap_user
|| fsmap_user
->get_epoch() < fsmap_latest
) {
5752 monclient
->sub_want("fsmap.user", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
5753 monclient
->renew_subs();
5754 wait_on_list(waiting_for_fsmap
);
5756 ceph_assert(fsmap_user
);
5757 ceph_assert(fsmap_user
->get_epoch() >= fsmap_latest
);
5759 if (!fsmap
|| fsmap
->get_epoch() < fsmap_latest
) {
5760 monclient
->sub_want("fsmap", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
5761 monclient
->renew_subs();
5762 wait_on_list(waiting_for_fsmap
);
5765 ceph_assert(fsmap
->get_epoch() >= fsmap_latest
);
5767 ldout(cct
, 10) << __func__
<< " finished waiting for FSMap version "
5768 << fsmap_latest
<< dendl
;
5774 * @mds_spec one of ID, rank, GID, "*"
5777 int Client::mds_command(
5778 const std::string
&mds_spec
,
5779 const vector
<string
>& cmd
,
5780 const bufferlist
& inbl
,
5785 std::lock_guard
lock(client_lock
);
5796 r
= fetch_fsmap(false);
5801 // Look up MDS target(s) of the command
5802 std::vector
<mds_gid_t
> targets
;
5803 r
= resolve_mds(mds_spec
, &targets
);
5808 // If daemons are laggy, we won't send them commands. If all
5809 // are laggy then we fail.
5810 std::vector
<mds_gid_t
> non_laggy
;
5811 for (const auto gid
: targets
) {
5812 const auto info
= fsmap
->get_info_gid(gid
);
5813 if (!info
.laggy()) {
5814 non_laggy
.push_back(gid
);
5817 if (non_laggy
.size() == 0) {
5818 *outs
= "All targeted MDS daemons are laggy";
5822 if (metadata
.empty()) {
5823 // We are called on an unmounted client, so metadata
5824 // won't be initialized yet.
5825 populate_metadata("");
5828 // Send commands to targets
5829 C_GatherBuilder
gather(cct
, onfinish
);
5830 for (const auto target_gid
: non_laggy
) {
5831 const auto info
= fsmap
->get_info_gid(target_gid
);
5833 // Open a connection to the target MDS
5834 ConnectionRef conn
= messenger
->connect_to_mds(info
.get_addrs());
5836 // Generate MDSCommandOp state
5837 auto &op
= command_table
.start_command();
5839 op
.on_finish
= gather
.new_sub();
5844 op
.mds_gid
= target_gid
;
5847 ldout(cct
, 4) << __func__
<< ": new command op to " << target_gid
5848 << " tid=" << op
.tid
<< cmd
<< dendl
;
5850 // Construct and send MCommand
5851 auto m
= op
.get_message(monclient
->get_fsid());
5852 conn
->send_message2(std::move(m
));
5859 void Client::handle_command_reply(const MConstRef
<MCommandReply
>& m
)
5861 ceph_tid_t
const tid
= m
->get_tid();
5863 ldout(cct
, 10) << __func__
<< ": tid=" << m
->get_tid() << dendl
;
5865 if (!command_table
.exists(tid
)) {
5866 ldout(cct
, 1) << __func__
<< ": unknown tid " << tid
<< ", dropping" << dendl
;
5870 auto &op
= command_table
.get_command(tid
);
5872 *op
.outbl
= m
->get_data();
5879 op
.on_finish
->complete(m
->r
);
5882 command_table
.erase(tid
);
5885 // -------------------
5888 int Client::subscribe_mdsmap(const std::string
&fs_name
)
5890 int r
= authenticate();
5892 lderr(cct
) << "authentication failed: " << cpp_strerror(r
) << dendl
;
5896 std::string resolved_fs_name
;
5897 if (fs_name
.empty()) {
5898 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_fs");
5899 if (resolved_fs_name
.empty())
5900 // Try the backwards compatibility fs name option
5901 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_mds_namespace");
5903 resolved_fs_name
= fs_name
;
5906 std::string want
= "mdsmap";
5907 if (!resolved_fs_name
.empty()) {
5908 r
= fetch_fsmap(true);
5911 fscid
= fsmap_user
->get_fs_cid(resolved_fs_name
);
5912 if (fscid
== FS_CLUSTER_ID_NONE
) {
5916 std::ostringstream oss
;
5917 oss
<< want
<< "." << fscid
;
5920 ldout(cct
, 10) << "Subscribing to map '" << want
<< "'" << dendl
;
5922 monclient
->sub_want(want
, 0, 0);
5923 monclient
->renew_subs();
5928 int Client::mount(const std::string
&mount_root
, const UserPerm
& perms
,
5929 bool require_mds
, const std::string
&fs_name
)
5931 std::lock_guard
lock(client_lock
);
5934 ldout(cct
, 5) << "already mounted" << dendl
;
5940 int r
= subscribe_mdsmap(fs_name
);
5942 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
5946 tick(); // start tick
5950 auto availability
= mdsmap
->is_cluster_available();
5951 if (availability
== MDSMap::STUCK_UNAVAILABLE
) {
5953 ldout(cct
, 10) << "mds cluster unavailable: epoch=" << mdsmap
->get_epoch() << dendl
;
5954 return CEPH_FUSE_NO_MDS_UP
;
5955 } else if (availability
== MDSMap::AVAILABLE
) {
5956 // Continue to mount
5958 } else if (availability
== MDSMap::TRANSIENT_UNAVAILABLE
) {
5959 // Else, wait. MDSMonitor will update the map to bring
5960 // us to a conclusion eventually.
5961 wait_on_list(waiting_for_mdsmap
);
5963 // Unexpected value!
5969 populate_metadata(mount_root
.empty() ? "/" : mount_root
);
5971 filepath
fp(CEPH_INO_ROOT
);
5972 if (!mount_root
.empty()) {
5973 fp
= filepath(mount_root
.c_str());
5976 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
5977 req
->set_filepath(fp
);
5978 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
5979 int res
= make_request(req
, perms
);
5981 if (res
== -EACCES
&& root
) {
5982 ldout(cct
, 1) << __func__
<< " EACCES on parent of mount point; quotas may not work" << dendl
;
6000 if (!cct
->_conf
->client_trace
.empty()) {
6001 traceout
.open(cct
->_conf
->client_trace
.c_str());
6002 if (traceout
.is_open()) {
6003 ldout(cct
, 1) << "opened trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6005 ldout(cct
, 1) << "FAILED to open trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6010 ldout(cct, 3) << "op: // client trace data structs" << dendl;
6011 ldout(cct, 3) << "op: struct stat st;" << dendl;
6012 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
6013 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
6014 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
6015 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
6016 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
6017 ldout(cct, 3) << "op: int fd;" << dendl;
6024 void Client::_close_sessions()
6026 while (!mds_sessions
.empty()) {
6027 // send session closes!
6028 for (auto &p
: mds_sessions
) {
6029 if (p
.second
.state
!= MetaSession::STATE_CLOSING
) {
6030 _close_mds_session(&p
.second
);
6034 // wait for sessions to close
6035 ldout(cct
, 2) << "waiting for " << mds_sessions
.size() << " mds sessions to close" << dendl
;
6036 std::unique_lock l
{client_lock
, std::adopt_lock
};
6042 void Client::flush_mdlog_sync()
6044 if (mds_requests
.empty())
6046 for (auto &p
: mds_sessions
) {
6047 flush_mdlog(&p
.second
);
6051 void Client::flush_mdlog(MetaSession
*session
)
6053 // Only send this to Luminous or newer MDS daemons, older daemons
6054 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6055 const uint64_t features
= session
->con
->get_features();
6056 if (HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
6057 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REQUEST_FLUSH_MDLOG
);
6058 session
->con
->send_message2(std::move(m
));
6063 void Client::_abort_mds_sessions(int err
)
6065 for (auto p
= mds_requests
.begin(); p
!= mds_requests
.end(); ) {
6066 auto req
= p
->second
;
6068 // unsafe requests will be removed during close session below.
6069 if (req
->got_unsafe
)
6073 if (req
->caller_cond
) {
6075 req
->caller_cond
->notify_all();
6079 // Process aborts on any requests that were on this waitlist.
6080 // Any requests that were on a waiting_for_open session waitlist
6081 // will get kicked during close session below.
6082 signal_cond_list(waiting_for_mdsmap
);
6084 // Force-close all sessions
6085 while(!mds_sessions
.empty()) {
6086 auto& session
= mds_sessions
.begin()->second
;
6087 _closed_mds_session(&session
);
6091 void Client::_unmount(bool abort
)
6093 std::unique_lock lock
{client_lock
, std::adopt_lock
};
6097 if (abort
|| blacklisted
) {
6098 ldout(cct
, 2) << "unmounting (" << (abort
? "abort)" : "blacklisted)") << dendl
;
6100 ldout(cct
, 2) << "unmounting" << dendl
;
6107 // Abort all mds sessions
6108 _abort_mds_sessions(-ENOTCONN
);
6110 objecter
->op_cancel_writes(-ENOTCONN
);
6112 // flush the mdlog for pending requests, if any
6116 mount_cond
.wait(lock
, [this] {
6117 if (!mds_requests
.empty()) {
6118 ldout(cct
, 10) << "waiting on " << mds_requests
.size() << " requests"
6121 return mds_requests
.empty();
6124 timer
.cancel_event(tick_event
);
6129 // clean up any unclosed files
6130 while (!fd_map
.empty()) {
6131 Fh
*fh
= fd_map
.begin()->second
;
6132 fd_map
.erase(fd_map
.begin());
6133 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *fh
->inode
<< dendl
;
6137 while (!ll_unclosed_fh_set
.empty()) {
6138 set
<Fh
*>::iterator it
= ll_unclosed_fh_set
.begin();
6140 ll_unclosed_fh_set
.erase(fh
);
6141 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *(fh
->inode
) << dendl
;
6145 while (!opened_dirs
.empty()) {
6146 dir_result_t
*dirp
= *opened_dirs
.begin();
6147 ldout(cct
, 0) << " destroyed lost open dir " << dirp
<< " on " << *dirp
->inode
<< dendl
;
6153 mount_cond
.wait(lock
, [this] {
6154 if (unsafe_sync_write
> 0) {
6155 ldout(cct
, 0) << unsafe_sync_write
<< " unsafe_sync_writes, waiting"
6158 return unsafe_sync_write
<= 0;
6161 if (cct
->_conf
->client_oc
) {
6162 // flush/release all buffered data
6163 std::list
<InodeRef
> anchor
;
6164 for (auto& p
: inode_map
) {
6165 Inode
*in
= p
.second
;
6167 ldout(cct
, 0) << "null inode_map entry ino " << p
.first
<< dendl
;
6171 // prevent inode from getting freed
6172 anchor
.emplace_back(in
);
6174 if (abort
|| blacklisted
) {
6175 objectcacher
->purge_set(&in
->oset
);
6176 } else if (!in
->caps
.empty()) {
6178 _flush(in
, new C_Client_FlushComplete(this, in
));
6183 if (abort
|| blacklisted
) {
6184 for (auto p
= dirty_list
.begin(); !p
.end(); ) {
6187 if (in
->dirty_caps
) {
6188 ldout(cct
, 0) << " drop dirty caps on " << *in
<< dendl
;
6189 in
->mark_caps_clean();
6195 wait_sync_caps(last_flush_tid
);
6201 while (lru
.lru_get_size() > 0 ||
6202 !inode_map
.empty()) {
6203 ldout(cct
, 2) << "cache still has " << lru
.lru_get_size()
6204 << "+" << inode_map
.size() << " items"
6205 << ", waiting (for caps to release?)"
6207 if (auto r
= mount_cond
.wait_for(lock
, ceph::make_timespan(5));
6208 r
== std::cv_status::timeout
) {
6212 ceph_assert(lru
.lru_get_size() == 0);
6213 ceph_assert(inode_map
.empty());
6216 if (!cct
->_conf
->client_trace
.empty()) {
6217 ldout(cct
, 1) << "closing trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6226 ldout(cct
, 2) << "unmounted." << dendl
;
6229 void Client::unmount()
6231 std::lock_guard
lock(client_lock
);
6235 void Client::abort_conn()
6237 std::lock_guard
lock(client_lock
);
6241 void Client::flush_cap_releases()
6243 // send any cap releases
6244 for (auto &p
: mds_sessions
) {
6245 auto &session
= p
.second
;
6246 if (session
.release
&& mdsmap
->is_clientreplay_or_active_or_stopping(
6248 if (cct
->_conf
->client_inject_release_failure
) {
6249 ldout(cct
, 20) << __func__
<< " injecting failure to send cap release message" << dendl
;
6251 session
.con
->send_message2(std::move(session
.release
));
6253 session
.release
.reset();
6260 if (cct
->_conf
->client_debug_inject_tick_delay
> 0) {
6261 sleep(cct
->_conf
->client_debug_inject_tick_delay
);
6262 ceph_assert(0 == cct
->_conf
.set_val("client_debug_inject_tick_delay", "0"));
6263 cct
->_conf
.apply_changes(nullptr);
6266 ldout(cct
, 21) << "tick" << dendl
;
6267 tick_event
= timer
.add_event_after(
6268 cct
->_conf
->client_tick_interval
,
6269 new LambdaContext([this](int) {
6270 // Called back via Timer, which takes client_lock for us
6271 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6274 utime_t now
= ceph_clock_now();
6276 if (!mounted
&& !mds_requests
.empty()) {
6277 MetaRequest
*req
= mds_requests
.begin()->second
;
6278 if (req
->op_stamp
+ cct
->_conf
->client_mount_timeout
< now
) {
6279 req
->abort(-ETIMEDOUT
);
6280 if (req
->caller_cond
) {
6282 req
->caller_cond
->notify_all();
6284 signal_cond_list(waiting_for_mdsmap
);
6285 for (auto &p
: mds_sessions
) {
6286 signal_context_list(p
.second
.waiting_for_open
);
6291 if (mdsmap
->get_epoch()) {
6293 utime_t el
= now
- last_cap_renew
;
6294 if (el
> mdsmap
->get_session_timeout() / 3.0)
6297 flush_cap_releases();
6301 xlist
<Inode
*>::iterator p
= delayed_list
.begin();
6305 if (in
->hold_caps_until
> now
)
6307 delayed_list
.pop_front();
6308 check_caps(in
, CHECK_CAPS_NODELAY
);
6314 void Client::renew_caps()
6316 ldout(cct
, 10) << "renew_caps()" << dendl
;
6317 last_cap_renew
= ceph_clock_now();
6319 for (auto &p
: mds_sessions
) {
6320 ldout(cct
, 15) << "renew_caps requesting from mds." << p
.first
<< dendl
;
6321 if (mdsmap
->get_state(p
.first
) >= MDSMap::STATE_REJOIN
)
6322 renew_caps(&p
.second
);
6326 void Client::renew_caps(MetaSession
*session
)
6328 ldout(cct
, 10) << "renew_caps mds." << session
->mds_num
<< dendl
;
6329 session
->last_cap_renew_request
= ceph_clock_now();
6330 uint64_t seq
= ++session
->cap_renew_seq
;
6331 session
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_RENEWCAPS
, seq
));
6335 // ===============================================================
6336 // high level (POSIXy) interface
6338 int Client::_do_lookup(Inode
*dir
, const string
& name
, int mask
,
6339 InodeRef
*target
, const UserPerm
& perms
)
6341 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_LOOKUPSNAP
: CEPH_MDS_OP_LOOKUP
;
6342 MetaRequest
*req
= new MetaRequest(op
);
6344 dir
->make_nosnap_relative_path(path
);
6345 path
.push_dentry(name
);
6346 req
->set_filepath(path
);
6347 req
->set_inode(dir
);
6348 if (cct
->_conf
->client_debug_getattr_caps
&& op
== CEPH_MDS_OP_LOOKUP
)
6349 mask
|= DEBUG_GETATTR_CAPS
;
6350 req
->head
.args
.getattr
.mask
= mask
;
6352 ldout(cct
, 10) << __func__
<< " on " << path
<< dendl
;
6354 int r
= make_request(req
, perms
, target
);
6355 ldout(cct
, 10) << __func__
<< " res is " << r
<< dendl
;
6359 int Client::_lookup(Inode
*dir
, const string
& dname
, int mask
, InodeRef
*target
,
6360 const UserPerm
& perms
)
6365 if (dname
== "..") {
6366 if (dir
->dentries
.empty()) {
6367 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
6368 filepath
path(dir
->ino
);
6369 req
->set_filepath(path
);
6372 int r
= make_request(req
, perms
, &tmptarget
, NULL
, rand() % mdsmap
->get_num_in_mds());
6375 Inode
*tempino
= tmptarget
.get();
6378 ldout(cct
, 8) << __func__
<< " found target " << (*target
)->ino
<< dendl
;
6384 *target
= dir
->get_first_parent()->dir
->parent_inode
; //dirs can't be hard-linked
6393 if (!dir
->is_dir()) {
6398 if (dname
.length() > NAME_MAX
) {
6403 if (dname
== cct
->_conf
->client_snapdir
&&
6404 dir
->snapid
== CEPH_NOSNAP
) {
6405 *target
= open_snapdir(dir
);
6410 dir
->dir
->dentries
.count(dname
)) {
6411 dn
= dir
->dir
->dentries
[dname
];
6413 ldout(cct
, 20) << __func__
<< " have dn " << dname
<< " mds." << dn
->lease_mds
<< " ttl " << dn
->lease_ttl
6414 << " seq " << dn
->lease_seq
6417 if (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)) {
6418 // is dn lease valid?
6419 utime_t now
= ceph_clock_now();
6420 if (dn
->lease_mds
>= 0 &&
6421 dn
->lease_ttl
> now
&&
6422 mds_sessions
.count(dn
->lease_mds
)) {
6423 MetaSession
&s
= mds_sessions
.at(dn
->lease_mds
);
6424 if (s
.cap_ttl
> now
&&
6425 s
.cap_gen
== dn
->lease_gen
) {
6426 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6427 // make trim_caps() behave.
6428 dir
->try_touch_cap(dn
->lease_mds
);
6431 ldout(cct
, 20) << " bad lease, cap_ttl " << s
.cap_ttl
<< ", cap_gen " << s
.cap_gen
6432 << " vs lease_gen " << dn
->lease_gen
<< dendl
;
6435 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
6436 if (dn
->cap_shared_gen
== dir
->shared_gen
&&
6437 (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)))
6439 if (!dn
->inode
&& (dir
->flags
& I_COMPLETE
)) {
6440 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for "
6441 << *dir
<< " dn '" << dname
<< "'" << dendl
;
6446 ldout(cct
, 20) << " no cap on " << dn
->inode
->vino() << dendl
;
6449 // can we conclude ENOENT locally?
6450 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true) &&
6451 (dir
->flags
& I_COMPLETE
)) {
6452 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for " << *dir
<< " dn '" << dname
<< "'" << dendl
;
6457 r
= _do_lookup(dir
, dname
, mask
, target
, perms
);
6462 *target
= dn
->inode
;
6470 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << r
<< dendl
;
6472 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << **target
<< dendl
;
6476 int Client::get_or_create(Inode
*dir
, const char* name
,
6477 Dentry
**pdn
, bool expect_null
)
6480 ldout(cct
, 20) << __func__
<< " " << *dir
<< " name " << name
<< dendl
;
6482 if (dir
->dir
->dentries
.count(name
)) {
6483 Dentry
*dn
= dir
->dir
->dentries
[name
];
6485 // is dn lease valid?
6486 utime_t now
= ceph_clock_now();
6488 dn
->lease_mds
>= 0 &&
6489 dn
->lease_ttl
> now
&&
6490 mds_sessions
.count(dn
->lease_mds
)) {
6491 MetaSession
&s
= mds_sessions
.at(dn
->lease_mds
);
6492 if (s
.cap_ttl
> now
&&
6493 s
.cap_gen
== dn
->lease_gen
) {
6500 // otherwise link up a new one
6501 *pdn
= link(dir
->dir
, name
, NULL
, NULL
);
6508 int Client::path_walk(const filepath
& origpath
, InodeRef
*end
,
6509 const UserPerm
& perms
, bool followsym
, int mask
)
6511 filepath path
= origpath
;
6513 if (origpath
.absolute())
6519 ldout(cct
, 10) << __func__
<< " " << path
<< dendl
;
6524 while (i
< path
.depth() && cur
) {
6526 const string
&dname
= path
[i
];
6527 ldout(cct
, 10) << " " << i
<< " " << *cur
<< " " << dname
<< dendl
;
6528 ldout(cct
, 20) << " (path is " << path
<< ")" << dendl
;
6530 if (cct
->_conf
->client_permissions
) {
6531 int r
= may_lookup(cur
.get(), perms
);
6534 caps
= CEPH_CAP_AUTH_SHARED
;
6537 /* Get extra requested caps on the last component */
6538 if (i
== (path
.depth() - 1))
6540 int r
= _lookup(cur
.get(), dname
, caps
, &next
, perms
);
6543 // only follow trailing symlink if followsym. always follow
6544 // 'directory' symlinks.
6545 if (next
&& next
->is_symlink()) {
6547 ldout(cct
, 20) << " symlink count " << symlinks
<< ", value is '" << next
->symlink
<< "'" << dendl
;
6548 if (symlinks
> MAXSYMLINKS
) {
6552 if (i
< path
.depth() - 1) {
6554 // replace consumed components of path with symlink dir target
6555 filepath
resolved(next
->symlink
.c_str());
6556 resolved
.append(path
.postfixpath(i
+ 1));
6559 if (next
->symlink
[0] == '/') {
6563 } else if (followsym
) {
6564 if (next
->symlink
[0] == '/') {
6565 path
= next
->symlink
.c_str();
6570 filepath
more(next
->symlink
.c_str());
6571 // we need to remove the symlink component from off of the path
6572 // before adding the target that the symlink points to. remain
6573 // at the same position in the path.
6593 int Client::link(const char *relexisting
, const char *relpath
, const UserPerm
& perm
)
6595 std::lock_guard
lock(client_lock
);
6596 tout(cct
) << "link" << std::endl
;
6597 tout(cct
) << relexisting
<< std::endl
;
6598 tout(cct
) << relpath
<< std::endl
;
6603 filepath
existing(relexisting
);
6606 int r
= path_walk(existing
, &in
, perm
, true);
6609 if (std::string(relpath
) == "/") {
6613 filepath
path(relpath
);
6614 string name
= path
.last_dentry();
6617 r
= path_walk(path
, &dir
, perm
, true);
6620 if (cct
->_conf
->client_permissions
) {
6621 if (S_ISDIR(in
->mode
)) {
6625 r
= may_hardlink(in
.get(), perm
);
6628 r
= may_create(dir
.get(), perm
);
6632 r
= _link(in
.get(), dir
.get(), name
.c_str(), perm
);
6636 int Client::unlink(const char *relpath
, const UserPerm
& perm
)
6638 std::lock_guard
lock(client_lock
);
6639 tout(cct
) << __func__
<< std::endl
;
6640 tout(cct
) << relpath
<< std::endl
;
6645 if (std::string(relpath
) == "/")
6648 filepath
path(relpath
);
6649 string name
= path
.last_dentry();
6652 int r
= path_walk(path
, &dir
, perm
);
6655 if (cct
->_conf
->client_permissions
) {
6656 r
= may_delete(dir
.get(), name
.c_str(), perm
);
6660 return _unlink(dir
.get(), name
.c_str(), perm
);
6663 int Client::rename(const char *relfrom
, const char *relto
, const UserPerm
& perm
)
6665 std::lock_guard
lock(client_lock
);
6666 tout(cct
) << __func__
<< std::endl
;
6667 tout(cct
) << relfrom
<< std::endl
;
6668 tout(cct
) << relto
<< std::endl
;
6673 if (std::string(relfrom
) == "/" || std::string(relto
) == "/")
6676 filepath
from(relfrom
);
6678 string fromname
= from
.last_dentry();
6680 string toname
= to
.last_dentry();
6683 InodeRef fromdir
, todir
;
6684 int r
= path_walk(from
, &fromdir
, perm
);
6687 r
= path_walk(to
, &todir
, perm
);
6691 if (cct
->_conf
->client_permissions
) {
6692 int r
= may_delete(fromdir
.get(), fromname
.c_str(), perm
);
6695 r
= may_delete(todir
.get(), toname
.c_str(), perm
);
6696 if (r
< 0 && r
!= -ENOENT
)
6699 r
= _rename(fromdir
.get(), fromname
.c_str(), todir
.get(), toname
.c_str(), perm
);
6706 int Client::mkdir(const char *relpath
, mode_t mode
, const UserPerm
& perm
)
6708 std::lock_guard
lock(client_lock
);
6709 tout(cct
) << __func__
<< std::endl
;
6710 tout(cct
) << relpath
<< std::endl
;
6711 tout(cct
) << mode
<< std::endl
;
6712 ldout(cct
, 10) << __func__
<< ": " << relpath
<< dendl
;
6717 if (std::string(relpath
) == "/")
6720 filepath
path(relpath
);
6721 string name
= path
.last_dentry();
6724 int r
= path_walk(path
, &dir
, perm
);
6727 if (cct
->_conf
->client_permissions
) {
6728 r
= may_create(dir
.get(), perm
);
6732 return _mkdir(dir
.get(), name
.c_str(), mode
, perm
);
6735 int Client::mkdirs(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
6737 std::lock_guard
lock(client_lock
);
6738 ldout(cct
, 10) << "Client::mkdirs " << relpath
<< dendl
;
6739 tout(cct
) << __func__
<< std::endl
;
6740 tout(cct
) << relpath
<< std::endl
;
6741 tout(cct
) << mode
<< std::endl
;
6746 //get through existing parts of path
6747 filepath
path(relpath
);
6749 int r
= 0, caps
= 0;
6752 for (i
=0; i
<path
.depth(); ++i
) {
6753 if (cct
->_conf
->client_permissions
) {
6754 r
= may_lookup(cur
.get(), perms
);
6757 caps
= CEPH_CAP_AUTH_SHARED
;
6759 r
= _lookup(cur
.get(), path
[i
].c_str(), caps
, &next
, perms
);
6764 if (r
!=-ENOENT
) return r
;
6765 ldout(cct
, 20) << __func__
<< " got through " << i
<< " directories on path " << relpath
<< dendl
;
6766 //make new directory at each level
6767 for (; i
<path
.depth(); ++i
) {
6768 if (cct
->_conf
->client_permissions
) {
6769 r
= may_create(cur
.get(), perms
);
6774 r
= _mkdir(cur
.get(), path
[i
].c_str(), mode
, perms
, &next
);
6776 //check proper creation/existence
6777 if(-EEXIST
== r
&& i
< path
.depth() - 1) {
6778 r
= _lookup(cur
.get(), path
[i
].c_str(), CEPH_CAP_AUTH_SHARED
, &next
, perms
);
6782 //move to new dir and continue
6784 ldout(cct
, 20) << __func__
<< ": successfully created directory "
6785 << filepath(cur
->ino
).get_path() << dendl
;
6790 int Client::rmdir(const char *relpath
, const UserPerm
& perms
)
6792 std::lock_guard
lock(client_lock
);
6793 tout(cct
) << __func__
<< std::endl
;
6794 tout(cct
) << relpath
<< std::endl
;
6799 if (std::string(relpath
) == "/")
6802 filepath
path(relpath
);
6803 string name
= path
.last_dentry();
6806 int r
= path_walk(path
, &dir
, perms
);
6809 if (cct
->_conf
->client_permissions
) {
6810 int r
= may_delete(dir
.get(), name
.c_str(), perms
);
6814 return _rmdir(dir
.get(), name
.c_str(), perms
);
6817 int Client::mknod(const char *relpath
, mode_t mode
, const UserPerm
& perms
, dev_t rdev
)
6819 std::lock_guard
lock(client_lock
);
6820 tout(cct
) << __func__
<< std::endl
;
6821 tout(cct
) << relpath
<< std::endl
;
6822 tout(cct
) << mode
<< std::endl
;
6823 tout(cct
) << rdev
<< std::endl
;
6828 if (std::string(relpath
) == "/")
6831 filepath
path(relpath
);
6832 string name
= path
.last_dentry();
6835 int r
= path_walk(path
, &dir
, perms
);
6838 if (cct
->_conf
->client_permissions
) {
6839 int r
= may_create(dir
.get(), perms
);
6843 return _mknod(dir
.get(), name
.c_str(), mode
, rdev
, perms
);
6848 int Client::symlink(const char *target
, const char *relpath
, const UserPerm
& perms
)
6850 std::lock_guard
lock(client_lock
);
6851 tout(cct
) << __func__
<< std::endl
;
6852 tout(cct
) << target
<< std::endl
;
6853 tout(cct
) << relpath
<< std::endl
;
6858 if (std::string(relpath
) == "/")
6861 filepath
path(relpath
);
6862 string name
= path
.last_dentry();
6865 int r
= path_walk(path
, &dir
, perms
);
6868 if (cct
->_conf
->client_permissions
) {
6869 int r
= may_create(dir
.get(), perms
);
6873 return _symlink(dir
.get(), name
.c_str(), target
, perms
);
6876 int Client::readlink(const char *relpath
, char *buf
, loff_t size
, const UserPerm
& perms
)
6878 std::lock_guard
lock(client_lock
);
6879 tout(cct
) << __func__
<< std::endl
;
6880 tout(cct
) << relpath
<< std::endl
;
6885 filepath
path(relpath
);
6887 int r
= path_walk(path
, &in
, perms
, false);
6891 return _readlink(in
.get(), buf
, size
);
6894 int Client::_readlink(Inode
*in
, char *buf
, size_t size
)
6896 if (!in
->is_symlink())
6899 // copy into buf (at most size bytes)
6900 int r
= in
->symlink
.length();
6903 memcpy(buf
, in
->symlink
.c_str(), r
);
6910 int Client::_getattr(Inode
*in
, int mask
, const UserPerm
& perms
, bool force
)
6912 bool yes
= in
->caps_issued_mask(mask
, true);
6914 ldout(cct
, 10) << __func__
<< " mask " << ccap_string(mask
) << " issued=" << yes
<< dendl
;
6918 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
6920 in
->make_nosnap_relative_path(path
);
6921 req
->set_filepath(path
);
6923 req
->head
.args
.getattr
.mask
= mask
;
6925 int res
= make_request(req
, perms
);
6926 ldout(cct
, 10) << __func__
<< " result=" << res
<< dendl
;
6930 int Client::_do_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
6931 const UserPerm
& perms
, InodeRef
*inp
)
6933 int issued
= in
->caps_issued();
6935 ldout(cct
, 10) << __func__
<< " mask " << mask
<< " issued " <<
6936 ccap_string(issued
) << dendl
;
6938 if (in
->snapid
!= CEPH_NOSNAP
) {
6941 if ((mask
& CEPH_SETATTR_SIZE
) &&
6942 (unsigned long)stx
->stx_size
> in
->size
&&
6943 is_quota_bytes_exceeded(in
, (unsigned long)stx
->stx_size
- in
->size
,
6948 // make the change locally?
6949 if ((in
->cap_dirtier_uid
>= 0 && perms
.uid() != in
->cap_dirtier_uid
) ||
6950 (in
->cap_dirtier_gid
>= 0 && perms
.gid() != in
->cap_dirtier_gid
)) {
6951 ldout(cct
, 10) << __func__
<< " caller " << perms
.uid() << ":" << perms
.gid()
6952 << " != cap dirtier " << in
->cap_dirtier_uid
<< ":"
6953 << in
->cap_dirtier_gid
<< ", forcing sync setattr"
6956 * This works because we implicitly flush the caps as part of the
6957 * request, so the cap update check will happen with the writeback
6958 * cap context, and then the setattr check will happen with the
6961 * In reality this pattern is likely pretty rare (different users
6962 * setattr'ing the same file). If that turns out not to be the
6963 * case later, we can build a more complex pipelined cap writeback
6967 mask
|= CEPH_SETATTR_CTIME
;
6972 // caller just needs us to bump the ctime
6973 in
->ctime
= ceph_clock_now();
6974 in
->cap_dirtier_uid
= perms
.uid();
6975 in
->cap_dirtier_gid
= perms
.gid();
6976 if (issued
& CEPH_CAP_AUTH_EXCL
)
6977 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6978 else if (issued
& CEPH_CAP_FILE_EXCL
)
6979 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
6980 else if (issued
& CEPH_CAP_XATTR_EXCL
)
6981 in
->mark_caps_dirty(CEPH_CAP_XATTR_EXCL
);
6983 mask
|= CEPH_SETATTR_CTIME
;
6986 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
6987 bool kill_sguid
= mask
& (CEPH_SETATTR_SIZE
|CEPH_SETATTR_KILL_SGUID
);
6989 mask
&= ~CEPH_SETATTR_KILL_SGUID
;
6991 if (mask
& CEPH_SETATTR_UID
) {
6992 in
->ctime
= ceph_clock_now();
6993 in
->cap_dirtier_uid
= perms
.uid();
6994 in
->cap_dirtier_gid
= perms
.gid();
6995 in
->uid
= stx
->stx_uid
;
6996 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6997 mask
&= ~CEPH_SETATTR_UID
;
6999 ldout(cct
,10) << "changing uid to " << stx
->stx_uid
<< dendl
;
7001 if (mask
& CEPH_SETATTR_GID
) {
7002 in
->ctime
= ceph_clock_now();
7003 in
->cap_dirtier_uid
= perms
.uid();
7004 in
->cap_dirtier_gid
= perms
.gid();
7005 in
->gid
= stx
->stx_gid
;
7006 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7007 mask
&= ~CEPH_SETATTR_GID
;
7009 ldout(cct
,10) << "changing gid to " << stx
->stx_gid
<< dendl
;
7012 if (mask
& CEPH_SETATTR_MODE
) {
7013 in
->ctime
= ceph_clock_now();
7014 in
->cap_dirtier_uid
= perms
.uid();
7015 in
->cap_dirtier_gid
= perms
.gid();
7016 in
->mode
= (in
->mode
& ~07777) | (stx
->stx_mode
& 07777);
7017 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7018 mask
&= ~CEPH_SETATTR_MODE
;
7019 ldout(cct
,10) << "changing mode to " << stx
->stx_mode
<< dendl
;
7020 } else if (kill_sguid
&& S_ISREG(in
->mode
) && (in
->mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
7021 /* Must squash the any setuid/setgid bits with an ownership change */
7022 in
->mode
&= ~(S_ISUID
|S_ISGID
);
7023 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7026 if (mask
& CEPH_SETATTR_BTIME
) {
7027 in
->ctime
= ceph_clock_now();
7028 in
->cap_dirtier_uid
= perms
.uid();
7029 in
->cap_dirtier_gid
= perms
.gid();
7030 in
->btime
= utime_t(stx
->stx_btime
);
7031 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7032 mask
&= ~CEPH_SETATTR_BTIME
;
7033 ldout(cct
,10) << "changing btime to " << in
->btime
<< dendl
;
7035 } else if (mask
& CEPH_SETATTR_SIZE
) {
7036 /* If we don't have Ax, then we must ask the server to clear them on truncate */
7037 mask
|= CEPH_SETATTR_KILL_SGUID
;
7040 if (in
->caps_issued_mask(CEPH_CAP_FILE_EXCL
)) {
7041 if (mask
& (CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
)) {
7042 if (mask
& CEPH_SETATTR_MTIME
)
7043 in
->mtime
= utime_t(stx
->stx_mtime
);
7044 if (mask
& CEPH_SETATTR_ATIME
)
7045 in
->atime
= utime_t(stx
->stx_atime
);
7046 in
->ctime
= ceph_clock_now();
7047 in
->cap_dirtier_uid
= perms
.uid();
7048 in
->cap_dirtier_gid
= perms
.gid();
7049 in
->time_warp_seq
++;
7050 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
7051 mask
&= ~(CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
);
7060 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETATTR
);
7064 in
->make_nosnap_relative_path(path
);
7065 req
->set_filepath(path
);
7068 if (mask
& CEPH_SETATTR_KILL_SGUID
) {
7069 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7071 if (mask
& CEPH_SETATTR_MODE
) {
7072 req
->head
.args
.setattr
.mode
= stx
->stx_mode
;
7073 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7074 ldout(cct
,10) << "changing mode to " << stx
->stx_mode
<< dendl
;
7076 if (mask
& CEPH_SETATTR_UID
) {
7077 req
->head
.args
.setattr
.uid
= stx
->stx_uid
;
7078 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7079 ldout(cct
,10) << "changing uid to " << stx
->stx_uid
<< dendl
;
7081 if (mask
& CEPH_SETATTR_GID
) {
7082 req
->head
.args
.setattr
.gid
= stx
->stx_gid
;
7083 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7084 ldout(cct
,10) << "changing gid to " << stx
->stx_gid
<< dendl
;
7086 if (mask
& CEPH_SETATTR_BTIME
) {
7087 req
->head
.args
.setattr
.btime
= utime_t(stx
->stx_btime
);
7088 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7090 if (mask
& CEPH_SETATTR_MTIME
) {
7091 req
->head
.args
.setattr
.mtime
= utime_t(stx
->stx_mtime
);
7092 req
->inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
7095 if (mask
& CEPH_SETATTR_ATIME
) {
7096 req
->head
.args
.setattr
.atime
= utime_t(stx
->stx_atime
);
7097 req
->inode_drop
|= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_RD
|
7100 if (mask
& CEPH_SETATTR_SIZE
) {
7101 if ((unsigned long)stx
->stx_size
< mdsmap
->get_max_filesize()) {
7102 req
->head
.args
.setattr
.size
= stx
->stx_size
;
7103 ldout(cct
,10) << "changing size to " << stx
->stx_size
<< dendl
;
7106 ldout(cct
,10) << "unable to set size to " << stx
->stx_size
<< ". Too large!" << dendl
;
7109 req
->inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
7112 req
->head
.args
.setattr
.mask
= mask
;
7114 req
->regetattr_mask
= mask
;
7116 int res
= make_request(req
, perms
, inp
);
7117 ldout(cct
, 10) << "_setattr result=" << res
<< dendl
;
7121 /* Note that we only care about attrs that setattr cares about */
7122 void Client::stat_to_statx(struct stat
*st
, struct ceph_statx
*stx
)
7124 stx
->stx_size
= st
->st_size
;
7125 stx
->stx_mode
= st
->st_mode
;
7126 stx
->stx_uid
= st
->st_uid
;
7127 stx
->stx_gid
= st
->st_gid
;
7129 stx
->stx_mtime
= st
->st_mtimespec
;
7130 stx
->stx_atime
= st
->st_atimespec
;
7132 stx
->stx_mtime
= st
->st_mtim
;
7133 stx
->stx_atime
= st
->st_atim
;
7137 int Client::__setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
7138 const UserPerm
& perms
, InodeRef
*inp
)
7140 int ret
= _do_setattr(in
, stx
, mask
, perms
, inp
);
7143 if (mask
& CEPH_SETATTR_MODE
)
7144 ret
= _posix_acl_chmod(in
, stx
->stx_mode
, perms
);
7148 int Client::_setattrx(InodeRef
&in
, struct ceph_statx
*stx
, int mask
,
7149 const UserPerm
& perms
)
7151 mask
&= (CEPH_SETATTR_MODE
| CEPH_SETATTR_UID
|
7152 CEPH_SETATTR_GID
| CEPH_SETATTR_MTIME
|
7153 CEPH_SETATTR_ATIME
| CEPH_SETATTR_SIZE
|
7154 CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
);
7155 if (cct
->_conf
->client_permissions
) {
7156 int r
= may_setattr(in
.get(), stx
, mask
, perms
);
7160 return __setattrx(in
.get(), stx
, mask
, perms
);
7163 int Client::_setattr(InodeRef
&in
, struct stat
*attr
, int mask
,
7164 const UserPerm
& perms
)
7166 struct ceph_statx stx
;
7168 stat_to_statx(attr
, &stx
);
7169 mask
&= ~CEPH_SETATTR_BTIME
;
7171 if ((mask
& CEPH_SETATTR_UID
) && attr
->st_uid
== static_cast<uid_t
>(-1)) {
7172 mask
&= ~CEPH_SETATTR_UID
;
7174 if ((mask
& CEPH_SETATTR_GID
) && attr
->st_gid
== static_cast<uid_t
>(-1)) {
7175 mask
&= ~CEPH_SETATTR_GID
;
7178 return _setattrx(in
, &stx
, mask
, perms
);
7181 int Client::setattr(const char *relpath
, struct stat
*attr
, int mask
,
7182 const UserPerm
& perms
)
7184 std::lock_guard
lock(client_lock
);
7185 tout(cct
) << __func__
<< std::endl
;
7186 tout(cct
) << relpath
<< std::endl
;
7187 tout(cct
) << mask
<< std::endl
;
7192 filepath
path(relpath
);
7194 int r
= path_walk(path
, &in
, perms
);
7197 return _setattr(in
, attr
, mask
, perms
);
7200 int Client::setattrx(const char *relpath
, struct ceph_statx
*stx
, int mask
,
7201 const UserPerm
& perms
, int flags
)
7203 std::lock_guard
lock(client_lock
);
7204 tout(cct
) << __func__
<< std::endl
;
7205 tout(cct
) << relpath
<< std::endl
;
7206 tout(cct
) << mask
<< std::endl
;
7211 filepath
path(relpath
);
7213 int r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
));
7216 return _setattrx(in
, stx
, mask
, perms
);
7219 int Client::fsetattr(int fd
, struct stat
*attr
, int mask
, const UserPerm
& perms
)
7221 std::lock_guard
lock(client_lock
);
7222 tout(cct
) << __func__
<< std::endl
;
7223 tout(cct
) << fd
<< std::endl
;
7224 tout(cct
) << mask
<< std::endl
;
7229 Fh
*f
= get_filehandle(fd
);
7232 #if defined(__linux__) && defined(O_PATH)
7233 if (f
->flags
& O_PATH
)
7236 return _setattr(f
->inode
, attr
, mask
, perms
);
7239 int Client::fsetattrx(int fd
, struct ceph_statx
*stx
, int mask
, const UserPerm
& perms
)
7241 std::lock_guard
lock(client_lock
);
7242 tout(cct
) << __func__
<< std::endl
;
7243 tout(cct
) << fd
<< std::endl
;
7244 tout(cct
) << mask
<< std::endl
;
7249 Fh
*f
= get_filehandle(fd
);
7252 #if defined(__linux__) && defined(O_PATH)
7253 if (f
->flags
& O_PATH
)
7256 return _setattrx(f
->inode
, stx
, mask
, perms
);
7259 int Client::stat(const char *relpath
, struct stat
*stbuf
, const UserPerm
& perms
,
7260 frag_info_t
*dirstat
, int mask
)
7262 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7263 std::lock_guard
lock(client_lock
);
7264 tout(cct
) << "stat" << std::endl
;
7265 tout(cct
) << relpath
<< std::endl
;
7270 filepath
path(relpath
);
7272 int r
= path_walk(path
, &in
, perms
, true, mask
);
7275 r
= _getattr(in
, mask
, perms
);
7277 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7280 fill_stat(in
, stbuf
, dirstat
);
7281 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7285 unsigned Client::statx_to_mask(unsigned int flags
, unsigned int want
)
7289 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7290 if (flags
& AT_NO_ATTR_SYNC
)
7293 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7294 mask
|= CEPH_CAP_PIN
;
7295 if (want
& (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
7296 mask
|= CEPH_CAP_AUTH_SHARED
;
7297 if (want
& (CEPH_STATX_NLINK
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
7298 mask
|= CEPH_CAP_LINK_SHARED
;
7299 if (want
& (CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|CEPH_STATX_CTIME
|CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
|CEPH_STATX_VERSION
))
7300 mask
|= CEPH_CAP_FILE_SHARED
;
7301 if (want
& (CEPH_STATX_VERSION
|CEPH_STATX_CTIME
))
7302 mask
|= CEPH_CAP_XATTR_SHARED
;
7307 int Client::statx(const char *relpath
, struct ceph_statx
*stx
,
7308 const UserPerm
& perms
,
7309 unsigned int want
, unsigned int flags
)
7311 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " want " << want
<< ")" << dendl
;
7312 std::lock_guard
lock(client_lock
);
7313 tout(cct
) << "statx" << std::endl
;
7314 tout(cct
) << relpath
<< std::endl
;
7319 filepath
path(relpath
);
7322 unsigned mask
= statx_to_mask(flags
, want
);
7324 int r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
);
7328 r
= _getattr(in
, mask
, perms
);
7330 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7334 fill_statx(in
, mask
, stx
);
7335 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << stx
->stx_mask
<< ")" << dendl
;
7339 int Client::lstat(const char *relpath
, struct stat
*stbuf
,
7340 const UserPerm
& perms
, frag_info_t
*dirstat
, int mask
)
7342 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7343 std::lock_guard
lock(client_lock
);
7344 tout(cct
) << __func__
<< std::endl
;
7345 tout(cct
) << relpath
<< std::endl
;
7350 filepath
path(relpath
);
7352 // don't follow symlinks
7353 int r
= path_walk(path
, &in
, perms
, false, mask
);
7356 r
= _getattr(in
, mask
, perms
);
7358 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7361 fill_stat(in
, stbuf
, dirstat
);
7362 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7366 int Client::fill_stat(Inode
*in
, struct stat
*st
, frag_info_t
*dirstat
, nest_info_t
*rstat
)
7368 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
7369 << " mode 0" << oct
<< in
->mode
<< dec
7370 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
7371 memset(st
, 0, sizeof(struct stat
));
7372 if (use_faked_inos())
7373 st
->st_ino
= in
->faked_ino
;
7375 st
->st_ino
= in
->ino
;
7376 st
->st_dev
= in
->snapid
;
7377 st
->st_mode
= in
->mode
;
7378 st
->st_rdev
= in
->rdev
;
7380 switch (in
->nlink
) {
7382 st
->st_nlink
= 0; /* dir is unlinked */
7385 st
->st_nlink
= 1 /* parent dentry */
7387 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
7393 st
->st_nlink
= in
->nlink
;
7395 st
->st_uid
= in
->uid
;
7396 st
->st_gid
= in
->gid
;
7397 if (in
->ctime
> in
->mtime
) {
7398 stat_set_ctime_sec(st
, in
->ctime
.sec());
7399 stat_set_ctime_nsec(st
, in
->ctime
.nsec());
7401 stat_set_ctime_sec(st
, in
->mtime
.sec());
7402 stat_set_ctime_nsec(st
, in
->mtime
.nsec());
7404 stat_set_atime_sec(st
, in
->atime
.sec());
7405 stat_set_atime_nsec(st
, in
->atime
.nsec());
7406 stat_set_mtime_sec(st
, in
->mtime
.sec());
7407 stat_set_mtime_nsec(st
, in
->mtime
.nsec());
7409 if (cct
->_conf
->client_dirsize_rbytes
)
7410 st
->st_size
= in
->rstat
.rbytes
;
7412 st
->st_size
= in
->dirstat
.size();
7415 st
->st_size
= in
->size
;
7416 st
->st_blocks
= (in
->size
+ 511) >> 9;
7418 st
->st_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
7421 *dirstat
= in
->dirstat
;
7425 return in
->caps_issued();
7428 void Client::fill_statx(Inode
*in
, unsigned int mask
, struct ceph_statx
*stx
)
7430 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
7431 << " mode 0" << oct
<< in
->mode
<< dec
7432 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
7433 memset(stx
, 0, sizeof(struct ceph_statx
));
7436 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7437 * so that all bits are set.
7442 /* These are always considered to be available */
7443 stx
->stx_dev
= in
->snapid
;
7444 stx
->stx_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
7446 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7447 stx
->stx_mode
= S_IFMT
& in
->mode
;
7448 stx
->stx_ino
= use_faked_inos() ? in
->faked_ino
: (ino_t
)in
->ino
;
7449 stx
->stx_rdev
= in
->rdev
;
7450 stx
->stx_mask
|= (CEPH_STATX_INO
|CEPH_STATX_RDEV
);
7452 if (mask
& CEPH_CAP_AUTH_SHARED
) {
7453 stx
->stx_uid
= in
->uid
;
7454 stx
->stx_gid
= in
->gid
;
7455 stx
->stx_mode
= in
->mode
;
7456 in
->btime
.to_timespec(&stx
->stx_btime
);
7457 stx
->stx_mask
|= (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
);
7460 if (mask
& CEPH_CAP_LINK_SHARED
) {
7462 switch (in
->nlink
) {
7464 stx
->stx_nlink
= 0; /* dir is unlinked */
7467 stx
->stx_nlink
= 1 /* parent dentry */
7469 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
7475 stx
->stx_nlink
= in
->nlink
;
7477 stx
->stx_mask
|= CEPH_STATX_NLINK
;
7480 if (mask
& CEPH_CAP_FILE_SHARED
) {
7482 in
->atime
.to_timespec(&stx
->stx_atime
);
7483 in
->mtime
.to_timespec(&stx
->stx_mtime
);
7486 if (cct
->_conf
->client_dirsize_rbytes
)
7487 stx
->stx_size
= in
->rstat
.rbytes
;
7489 stx
->stx_size
= in
->dirstat
.size();
7490 stx
->stx_blocks
= 1;
7492 stx
->stx_size
= in
->size
;
7493 stx
->stx_blocks
= (in
->size
+ 511) >> 9;
7495 stx
->stx_mask
|= (CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|
7496 CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
);
7499 /* Change time and change_attr both require all shared caps to view */
7500 if ((mask
& CEPH_STAT_CAP_INODE_ALL
) == CEPH_STAT_CAP_INODE_ALL
) {
7501 stx
->stx_version
= in
->change_attr
;
7502 if (in
->ctime
> in
->mtime
)
7503 in
->ctime
.to_timespec(&stx
->stx_ctime
);
7505 in
->mtime
.to_timespec(&stx
->stx_ctime
);
7506 stx
->stx_mask
|= (CEPH_STATX_CTIME
|CEPH_STATX_VERSION
);
7511 void Client::touch_dn(Dentry
*dn
)
7516 int Client::chmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
7518 std::lock_guard
lock(client_lock
);
7519 tout(cct
) << __func__
<< std::endl
;
7520 tout(cct
) << relpath
<< std::endl
;
7521 tout(cct
) << mode
<< std::endl
;
7526 filepath
path(relpath
);
7528 int r
= path_walk(path
, &in
, perms
);
7532 attr
.st_mode
= mode
;
7533 return _setattr(in
, &attr
, CEPH_SETATTR_MODE
, perms
);
7536 int Client::fchmod(int fd
, mode_t mode
, const UserPerm
& perms
)
7538 std::lock_guard
lock(client_lock
);
7539 tout(cct
) << __func__
<< std::endl
;
7540 tout(cct
) << fd
<< std::endl
;
7541 tout(cct
) << mode
<< std::endl
;
7546 Fh
*f
= get_filehandle(fd
);
7549 #if defined(__linux__) && defined(O_PATH)
7550 if (f
->flags
& O_PATH
)
7554 attr
.st_mode
= mode
;
7555 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MODE
, perms
);
7558 int Client::lchmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
7560 std::lock_guard
lock(client_lock
);
7561 tout(cct
) << __func__
<< std::endl
;
7562 tout(cct
) << relpath
<< std::endl
;
7563 tout(cct
) << mode
<< std::endl
;
7568 filepath
path(relpath
);
7570 // don't follow symlinks
7571 int r
= path_walk(path
, &in
, perms
, false);
7575 attr
.st_mode
= mode
;
7576 return _setattr(in
, &attr
, CEPH_SETATTR_MODE
, perms
);
7579 int Client::chown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
7580 const UserPerm
& perms
)
7582 std::lock_guard
lock(client_lock
);
7583 tout(cct
) << __func__
<< std::endl
;
7584 tout(cct
) << relpath
<< std::endl
;
7585 tout(cct
) << new_uid
<< std::endl
;
7586 tout(cct
) << new_gid
<< std::endl
;
7591 filepath
path(relpath
);
7593 int r
= path_walk(path
, &in
, perms
);
7597 attr
.st_uid
= new_uid
;
7598 attr
.st_gid
= new_gid
;
7599 return _setattr(in
, &attr
, CEPH_SETATTR_UID
|CEPH_SETATTR_GID
, perms
);
7602 int Client::fchown(int fd
, uid_t new_uid
, gid_t new_gid
, const UserPerm
& perms
)
7604 std::lock_guard
lock(client_lock
);
7605 tout(cct
) << __func__
<< std::endl
;
7606 tout(cct
) << fd
<< std::endl
;
7607 tout(cct
) << new_uid
<< std::endl
;
7608 tout(cct
) << new_gid
<< std::endl
;
7613 Fh
*f
= get_filehandle(fd
);
7616 #if defined(__linux__) && defined(O_PATH)
7617 if (f
->flags
& O_PATH
)
7621 attr
.st_uid
= new_uid
;
7622 attr
.st_gid
= new_gid
;
7624 if (new_uid
!= static_cast<uid_t
>(-1)) mask
|= CEPH_SETATTR_UID
;
7625 if (new_gid
!= static_cast<gid_t
>(-1)) mask
|= CEPH_SETATTR_GID
;
7626 return _setattr(f
->inode
, &attr
, mask
, perms
);
7629 int Client::lchown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
7630 const UserPerm
& perms
)
7632 std::lock_guard
lock(client_lock
);
7633 tout(cct
) << __func__
<< std::endl
;
7634 tout(cct
) << relpath
<< std::endl
;
7635 tout(cct
) << new_uid
<< std::endl
;
7636 tout(cct
) << new_gid
<< std::endl
;
7641 filepath
path(relpath
);
7643 // don't follow symlinks
7644 int r
= path_walk(path
, &in
, perms
, false);
7648 attr
.st_uid
= new_uid
;
7649 attr
.st_gid
= new_gid
;
7651 if (new_uid
!= static_cast<uid_t
>(-1)) mask
|= CEPH_SETATTR_UID
;
7652 if (new_gid
!= static_cast<gid_t
>(-1)) mask
|= CEPH_SETATTR_GID
;
7653 return _setattr(in
, &attr
, mask
, perms
);
7656 static void attr_set_atime_and_mtime(struct stat
*attr
,
7657 const utime_t
&atime
,
7658 const utime_t
&mtime
)
7660 stat_set_atime_sec(attr
, atime
.tv
.tv_sec
);
7661 stat_set_atime_nsec(attr
, atime
.tv
.tv_nsec
);
7662 stat_set_mtime_sec(attr
, mtime
.tv
.tv_sec
);
7663 stat_set_mtime_nsec(attr
, mtime
.tv
.tv_nsec
);
7666 // for [l]utime() invoke the timeval variant as the timespec
7667 // variant are not yet implemented. for futime[s](), invoke
7668 // the timespec variant.
7669 int Client::utime(const char *relpath
, struct utimbuf
*buf
,
7670 const UserPerm
& perms
)
7672 struct timeval tv
[2];
7673 tv
[0].tv_sec
= buf
->actime
;
7675 tv
[1].tv_sec
= buf
->modtime
;
7678 return utimes(relpath
, tv
, perms
);
7681 int Client::lutime(const char *relpath
, struct utimbuf
*buf
,
7682 const UserPerm
& perms
)
7684 struct timeval tv
[2];
7685 tv
[0].tv_sec
= buf
->actime
;
7687 tv
[1].tv_sec
= buf
->modtime
;
7690 return lutimes(relpath
, tv
, perms
);
7693 int Client::futime(int fd
, struct utimbuf
*buf
, const UserPerm
& perms
)
7695 struct timespec ts
[2];
7696 ts
[0].tv_sec
= buf
->actime
;
7698 ts
[1].tv_sec
= buf
->modtime
;
7701 return futimens(fd
, ts
, perms
);
7704 int Client::utimes(const char *relpath
, struct timeval times
[2],
7705 const UserPerm
& perms
)
7707 std::lock_guard
lock(client_lock
);
7708 tout(cct
) << __func__
<< std::endl
;
7709 tout(cct
) << relpath
<< std::endl
;
7710 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
7712 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
7718 filepath
path(relpath
);
7720 int r
= path_walk(path
, &in
, perms
);
7724 utime_t
atime(times
[0]);
7725 utime_t
mtime(times
[1]);
7727 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
7728 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
7731 int Client::lutimes(const char *relpath
, struct timeval times
[2],
7732 const UserPerm
& perms
)
7734 std::lock_guard
lock(client_lock
);
7735 tout(cct
) << __func__
<< std::endl
;
7736 tout(cct
) << relpath
<< std::endl
;
7737 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
7739 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
7745 filepath
path(relpath
);
7747 int r
= path_walk(path
, &in
, perms
, false);
7751 utime_t
atime(times
[0]);
7752 utime_t
mtime(times
[1]);
7754 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
7755 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
7758 int Client::futimes(int fd
, struct timeval times
[2], const UserPerm
& perms
)
7760 struct timespec ts
[2];
7761 ts
[0].tv_sec
= times
[0].tv_sec
;
7762 ts
[0].tv_nsec
= times
[0].tv_usec
* 1000;
7763 ts
[1].tv_sec
= times
[1].tv_sec
;
7764 ts
[1].tv_nsec
= times
[1].tv_usec
* 1000;
7766 return futimens(fd
, ts
, perms
);
7769 int Client::futimens(int fd
, struct timespec times
[2], const UserPerm
& perms
)
7771 std::lock_guard
lock(client_lock
);
7772 tout(cct
) << __func__
<< std::endl
;
7773 tout(cct
) << fd
<< std::endl
;
7774 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_nsec
7776 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_nsec
7782 Fh
*f
= get_filehandle(fd
);
7785 #if defined(__linux__) && defined(O_PATH)
7786 if (f
->flags
& O_PATH
)
7790 utime_t
atime(times
[0]);
7791 utime_t
mtime(times
[1]);
7793 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
7794 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
7797 int Client::flock(int fd
, int operation
, uint64_t owner
)
7799 std::lock_guard
lock(client_lock
);
7800 tout(cct
) << __func__
<< std::endl
;
7801 tout(cct
) << fd
<< std::endl
;
7802 tout(cct
) << operation
<< std::endl
;
7803 tout(cct
) << owner
<< std::endl
;
7808 Fh
*f
= get_filehandle(fd
);
7812 return _flock(f
, operation
, owner
);
7815 int Client::opendir(const char *relpath
, dir_result_t
**dirpp
, const UserPerm
& perms
)
7817 std::lock_guard
lock(client_lock
);
7818 tout(cct
) << __func__
<< std::endl
;
7819 tout(cct
) << relpath
<< std::endl
;
7824 filepath
path(relpath
);
7826 int r
= path_walk(path
, &in
, perms
, true);
7829 if (cct
->_conf
->client_permissions
) {
7830 int r
= may_open(in
.get(), O_RDONLY
, perms
);
7834 r
= _opendir(in
.get(), dirpp
, perms
);
7835 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7837 tout(cct
) << (unsigned long)*dirpp
<< std::endl
;
7841 int Client::_opendir(Inode
*in
, dir_result_t
**dirpp
, const UserPerm
& perms
)
7845 *dirpp
= new dir_result_t(in
, perms
);
7846 opened_dirs
.insert(*dirpp
);
7847 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ") = " << 0 << " (" << *dirpp
<< ")" << dendl
;
7852 int Client::closedir(dir_result_t
*dir
)
7854 std::lock_guard
lock(client_lock
);
7855 tout(cct
) << __func__
<< std::endl
;
7856 tout(cct
) << (unsigned long)dir
<< std::endl
;
7858 ldout(cct
, 3) << __func__
<< "(" << dir
<< ") = 0" << dendl
;
7863 void Client::_closedir(dir_result_t
*dirp
)
7865 ldout(cct
, 10) << __func__
<< "(" << dirp
<< ")" << dendl
;
7867 ldout(cct
, 10) << __func__
<< " detaching inode " << dirp
->inode
<< dendl
;
7868 dirp
->inode
.reset();
7870 _readdir_drop_dirp_buffer(dirp
);
7871 opened_dirs
.erase(dirp
);
7875 void Client::rewinddir(dir_result_t
*dirp
)
7877 std::lock_guard
lock(client_lock
);
7878 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ")" << dendl
;
7883 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
7884 _readdir_drop_dirp_buffer(d
);
7888 loff_t
Client::telldir(dir_result_t
*dirp
)
7890 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
7891 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ") = " << d
->offset
<< dendl
;
7895 void Client::seekdir(dir_result_t
*dirp
, loff_t offset
)
7897 std::lock_guard
lock(client_lock
);
7899 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ", " << offset
<< ")" << dendl
;
7904 if (offset
== dirp
->offset
)
7907 if (offset
> dirp
->offset
)
7908 dirp
->release_count
= 0; // bump if we do a forward seek
7910 dirp
->ordered_count
= 0; // disable filling readdir cache
7912 if (dirp
->hash_order()) {
7913 if (dirp
->offset
> offset
) {
7914 _readdir_drop_dirp_buffer(dirp
);
7919 dirp
->buffer_frag
!= frag_t(dir_result_t::fpos_high(offset
)) ||
7920 dirp
->offset_low() > dir_result_t::fpos_low(offset
)) {
7921 _readdir_drop_dirp_buffer(dirp
);
7926 dirp
->offset
= offset
;
7931 // ino_t d_ino; /* inode number */
7932 // off_t d_off; /* offset to the next dirent */
7933 // unsigned short d_reclen; /* length of this record */
7934 // unsigned char d_type; /* type of file */
7935 // char d_name[256]; /* filename */
7937 void Client::fill_dirent(struct dirent
*de
, const char *name
, int type
, uint64_t ino
, loff_t next_off
)
7939 strncpy(de
->d_name
, name
, 255);
7940 de
->d_name
[255] = '\0';
7943 #if !defined(__APPLE__) && !defined(__FreeBSD__)
7944 de
->d_off
= next_off
;
7947 de
->d_type
= IFTODT(type
);
7948 ldout(cct
, 10) << __func__
<< " '" << de
->d_name
<< "' -> " << inodeno_t(de
->d_ino
)
7949 << " type " << (int)de
->d_type
<< " w/ next_off " << hex
<< next_off
<< dec
<< dendl
;
7953 void Client::_readdir_next_frag(dir_result_t
*dirp
)
7955 frag_t fg
= dirp
->buffer_frag
;
7957 if (fg
.is_rightmost()) {
7958 ldout(cct
, 10) << __func__
<< " advance from " << fg
<< " to END" << dendl
;
7965 ldout(cct
, 10) << __func__
<< " advance from " << dirp
->buffer_frag
<< " to " << fg
<< dendl
;
7967 if (dirp
->hash_order()) {
7969 int64_t new_offset
= dir_result_t::make_fpos(fg
.value(), 2, true);
7970 if (dirp
->offset
< new_offset
) // don't decrease offset
7971 dirp
->offset
= new_offset
;
7973 dirp
->last_name
.clear();
7974 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
7975 _readdir_rechoose_frag(dirp
);
7979 void Client::_readdir_rechoose_frag(dir_result_t
*dirp
)
7981 ceph_assert(dirp
->inode
);
7983 if (dirp
->hash_order())
7986 frag_t cur
= frag_t(dirp
->offset_high());
7987 frag_t fg
= dirp
->inode
->dirfragtree
[cur
.value()];
7989 ldout(cct
, 10) << __func__
<< " frag " << cur
<< " maps to " << fg
<< dendl
;
7990 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
7991 dirp
->last_name
.clear();
7992 dirp
->next_offset
= 2;
7996 void Client::_readdir_drop_dirp_buffer(dir_result_t
*dirp
)
7998 ldout(cct
, 10) << __func__
<< " " << dirp
<< dendl
;
7999 dirp
->buffer
.clear();
8002 int Client::_readdir_get_frag(dir_result_t
*dirp
)
8005 ceph_assert(dirp
->inode
);
8007 // get the current frag.
8009 if (dirp
->hash_order())
8010 fg
= dirp
->inode
->dirfragtree
[dirp
->offset_high()];
8012 fg
= frag_t(dirp
->offset_high());
8014 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " fg " << fg
8015 << " offset " << hex
<< dirp
->offset
<< dec
<< dendl
;
8017 int op
= CEPH_MDS_OP_READDIR
;
8018 if (dirp
->inode
&& dirp
->inode
->snapid
== CEPH_SNAPDIR
)
8019 op
= CEPH_MDS_OP_LSSNAP
;
8021 InodeRef
& diri
= dirp
->inode
;
8023 MetaRequest
*req
= new MetaRequest(op
);
8025 diri
->make_nosnap_relative_path(path
);
8026 req
->set_filepath(path
);
8027 req
->set_inode(diri
.get());
8028 req
->head
.args
.readdir
.frag
= fg
;
8029 req
->head
.args
.readdir
.flags
= CEPH_READDIR_REPLY_BITFLAGS
;
8030 if (dirp
->last_name
.length()) {
8031 req
->path2
.set_path(dirp
->last_name
);
8032 } else if (dirp
->hash_order()) {
8033 req
->head
.args
.readdir
.offset_hash
= dirp
->offset_high();
8038 int res
= make_request(req
, dirp
->perms
, NULL
, NULL
, -1, &dirbl
);
8040 if (res
== -EAGAIN
) {
8041 ldout(cct
, 10) << __func__
<< " got EAGAIN, retrying" << dendl
;
8042 _readdir_rechoose_frag(dirp
);
8043 return _readdir_get_frag(dirp
);
8047 ldout(cct
, 10) << __func__
<< " " << dirp
<< " got frag " << dirp
->buffer_frag
8048 << " size " << dirp
->buffer
.size() << dendl
;
8050 ldout(cct
, 10) << __func__
<< " got error " << res
<< ", setting end flag" << dendl
;
8057 struct dentry_off_lt
{
8058 bool operator()(const Dentry
* dn
, int64_t off
) const {
8059 return dir_result_t::fpos_cmp(dn
->offset
, off
) < 0;
8063 int Client::_readdir_cache_cb(dir_result_t
*dirp
, add_dirent_cb_t cb
, void *p
,
8064 int caps
, bool getref
)
8066 ceph_assert(ceph_mutex_is_locked(client_lock
));
8067 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
8068 << " last_name " << dirp
->last_name
<< " offset " << hex
<< dirp
->offset
<< dec
8070 Dir
*dir
= dirp
->inode
->dir
;
8073 ldout(cct
, 10) << " dir is empty" << dendl
;
8078 vector
<Dentry
*>::iterator pd
= std::lower_bound(dir
->readdir_cache
.begin(),
8079 dir
->readdir_cache
.end(),
8080 dirp
->offset
, dentry_off_lt());
8084 if (!dirp
->inode
->is_complete_and_ordered())
8086 if (pd
== dir
->readdir_cache
.end())
8089 if (dn
->inode
== NULL
) {
8090 ldout(cct
, 15) << " skipping null '" << dn
->name
<< "'" << dendl
;
8094 if (dn
->cap_shared_gen
!= dir
->parent_inode
->shared_gen
) {
8095 ldout(cct
, 15) << " skipping mismatch shared gen '" << dn
->name
<< "'" << dendl
;
8100 int idx
= pd
- dir
->readdir_cache
.begin();
8101 int r
= _getattr(dn
->inode
, caps
, dirp
->perms
);
8105 // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
8106 pd
= dir
->readdir_cache
.begin() + idx
;
8107 if (pd
>= dir
->readdir_cache
.end() || *pd
!= dn
)
8110 struct ceph_statx stx
;
8112 fill_statx(dn
->inode
, caps
, &stx
);
8114 uint64_t next_off
= dn
->offset
+ 1;
8115 fill_dirent(&de
, dn
->name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
8117 if (pd
== dir
->readdir_cache
.end())
8118 next_off
= dir_result_t::END
;
8122 in
= dn
->inode
.get();
8126 dn_name
= dn
->name
; // fill in name while we have lock
8128 client_lock
.unlock();
8129 r
= cb(p
, &de
, &stx
, next_off
, in
); // _next_ offset
8131 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< dn
->offset
<< dec
8132 << " = " << r
<< dendl
;
8137 dirp
->offset
= next_off
;
8139 dirp
->next_offset
= 2;
8141 dirp
->next_offset
= dirp
->offset_low();
8142 dirp
->last_name
= dn_name
; // we successfully returned this one; update!
8143 dirp
->release_count
= 0; // last_name no longer match cache index
8148 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " at end" << dendl
;
8153 int Client::readdir_r_cb(dir_result_t
*d
, add_dirent_cb_t cb
, void *p
,
8154 unsigned want
, unsigned flags
, bool getref
)
8156 int caps
= statx_to_mask(flags
, want
);
8158 std::lock_guard
lock(client_lock
);
8163 dir_result_t
*dirp
= static_cast<dir_result_t
*>(d
);
8165 ldout(cct
, 10) << __func__
<< " " << *dirp
->inode
<< " offset " << hex
<< dirp
->offset
8166 << dec
<< " at_end=" << dirp
->at_end()
8167 << " hash_order=" << dirp
->hash_order() << dendl
;
8170 struct ceph_statx stx
;
8171 memset(&de
, 0, sizeof(de
));
8172 memset(&stx
, 0, sizeof(stx
));
8174 InodeRef
& diri
= dirp
->inode
;
8179 if (dirp
->offset
== 0) {
8180 ldout(cct
, 15) << " including ." << dendl
;
8181 ceph_assert(diri
->dentries
.size() < 2); // can't have multiple hard-links to a dir
8182 uint64_t next_off
= 1;
8185 r
= _getattr(diri
, caps
, dirp
->perms
);
8189 fill_statx(diri
, caps
, &stx
);
8190 fill_dirent(&de
, ".", S_IFDIR
, stx
.stx_ino
, next_off
);
8192 Inode
*inode
= NULL
;
8198 client_lock
.unlock();
8199 r
= cb(p
, &de
, &stx
, next_off
, inode
);
8204 dirp
->offset
= next_off
;
8208 if (dirp
->offset
== 1) {
8209 ldout(cct
, 15) << " including .." << dendl
;
8210 uint64_t next_off
= 2;
8212 if (diri
->dentries
.empty())
8215 in
= diri
->get_first_parent()->dir
->parent_inode
;
8218 r
= _getattr(in
, caps
, dirp
->perms
);
8222 fill_statx(in
, caps
, &stx
);
8223 fill_dirent(&de
, "..", S_IFDIR
, stx
.stx_ino
, next_off
);
8225 Inode
*inode
= NULL
;
8231 client_lock
.unlock();
8232 r
= cb(p
, &de
, &stx
, next_off
, inode
);
8237 dirp
->offset
= next_off
;
8242 // can we read from our cache?
8243 ldout(cct
, 10) << "offset " << hex
<< dirp
->offset
<< dec
8244 << " snapid " << dirp
->inode
->snapid
<< " (complete && ordered) "
8245 << dirp
->inode
->is_complete_and_ordered()
8246 << " issued " << ccap_string(dirp
->inode
->caps_issued())
8248 if (dirp
->inode
->snapid
!= CEPH_SNAPDIR
&&
8249 dirp
->inode
->is_complete_and_ordered() &&
8250 dirp
->inode
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
8251 int err
= _readdir_cache_cb(dirp
, cb
, p
, caps
, getref
);
8260 bool check_caps
= true;
8261 if (!dirp
->is_cached()) {
8262 int r
= _readdir_get_frag(dirp
);
8265 // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
8266 // different than the requested one. (our dirfragtree was outdated)
8269 frag_t fg
= dirp
->buffer_frag
;
8271 ldout(cct
, 10) << "frag " << fg
<< " buffer size " << dirp
->buffer
.size()
8272 << " offset " << hex
<< dirp
->offset
<< dendl
;
8274 for (auto it
= std::lower_bound(dirp
->buffer
.begin(), dirp
->buffer
.end(),
8275 dirp
->offset
, dir_result_t::dentry_off_lt());
8276 it
!= dirp
->buffer
.end();
8278 dir_result_t::dentry
&entry
= *it
;
8280 uint64_t next_off
= entry
.offset
+ 1;
8284 r
= _getattr(entry
.inode
, caps
, dirp
->perms
);
8289 fill_statx(entry
.inode
, caps
, &stx
);
8290 fill_dirent(&de
, entry
.name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
8292 Inode
*inode
= NULL
;
8294 inode
= entry
.inode
.get();
8298 client_lock
.unlock();
8299 r
= cb(p
, &de
, &stx
, next_off
, inode
); // _next_ offset
8302 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< next_off
- 1 << dec
8303 << " = " << r
<< dendl
;
8307 dirp
->offset
= next_off
;
8312 if (dirp
->next_offset
> 2) {
8313 ldout(cct
, 10) << " fetching next chunk of this frag" << dendl
;
8314 _readdir_drop_dirp_buffer(dirp
);
8318 if (!fg
.is_rightmost()) {
8320 _readdir_next_frag(dirp
);
8324 if (diri
->shared_gen
== dirp
->start_shared_gen
&&
8325 diri
->dir_release_count
== dirp
->release_count
) {
8326 if (diri
->dir_ordered_count
== dirp
->ordered_count
) {
8327 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
8329 ceph_assert(diri
->dir
->readdir_cache
.size() >= dirp
->cache_index
);
8330 diri
->dir
->readdir_cache
.resize(dirp
->cache_index
);
8332 diri
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
8334 ldout(cct
, 10) << " marking I_COMPLETE on " << *diri
<< dendl
;
8335 diri
->flags
|= I_COMPLETE
;
8347 int Client::readdir_r(dir_result_t
*d
, struct dirent
*de
)
8349 return readdirplus_r(d
, de
, 0, 0, 0, NULL
);
8356 * 1 if we got a dirent
8357 * 0 for end of directory
8361 struct single_readdir
{
8363 struct ceph_statx
*stx
;
8368 static int _readdir_single_dirent_cb(void *p
, struct dirent
*de
,
8369 struct ceph_statx
*stx
, off_t off
,
8372 single_readdir
*c
= static_cast<single_readdir
*>(p
);
8375 return -1; // already filled this dirent
8385 struct dirent
*Client::readdir(dir_result_t
*d
)
8388 static struct dirent de
;
8395 // our callback fills the dirent and sets sr.full=true on first
8396 // call, and returns -1 the second time around.
8397 ret
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
);
8399 errno
= -ret
; // this sucks.
8400 return (dirent
*) NULL
;
8405 return (dirent
*) NULL
;
8408 int Client::readdirplus_r(dir_result_t
*d
, struct dirent
*de
,
8409 struct ceph_statx
*stx
, unsigned want
,
8410 unsigned flags
, Inode
**out
)
8418 // our callback fills the dirent and sets sr.full=true on first
8419 // call, and returns -1 the second time around.
8420 int r
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
, want
, flags
, out
);
8432 struct getdents_result
{
8439 static int _readdir_getdent_cb(void *p
, struct dirent
*de
,
8440 struct ceph_statx
*stx
, off_t off
, Inode
*in
)
8442 struct getdents_result
*c
= static_cast<getdents_result
*>(p
);
8448 dlen
= strlen(de
->d_name
) + 1;
8450 if (c
->pos
+ dlen
> c
->buflen
)
8451 return -1; // doesn't fit
8454 memcpy(c
->buf
+ c
->pos
, de
, sizeof(*de
));
8456 memcpy(c
->buf
+ c
->pos
, de
->d_name
, dlen
);
8462 int Client::_getdents(dir_result_t
*dir
, char *buf
, int buflen
, bool fullent
)
8467 gr
.fullent
= fullent
;
8470 int r
= readdir_r_cb(dir
, _readdir_getdent_cb
, (void *)&gr
);
8472 if (r
< 0) { // some error
8473 if (r
== -1) { // buffer ran out of space
8474 if (gr
.pos
) { // but we got some entries already!
8476 } // or we need a larger buffer
8478 } else { // actual error, return it
8487 struct getdir_result
{
8488 list
<string
> *contents
;
8492 static int _getdir_cb(void *p
, struct dirent
*de
, struct ceph_statx
*stx
, off_t off
, Inode
*in
)
8494 getdir_result
*r
= static_cast<getdir_result
*>(p
);
8496 r
->contents
->push_back(de
->d_name
);
8501 int Client::getdir(const char *relpath
, list
<string
>& contents
,
8502 const UserPerm
& perms
)
8504 ldout(cct
, 3) << "getdir(" << relpath
<< ")" << dendl
;
8506 std::lock_guard
lock(client_lock
);
8507 tout(cct
) << "getdir" << std::endl
;
8508 tout(cct
) << relpath
<< std::endl
;
8512 int r
= opendir(relpath
, &d
, perms
);
8517 gr
.contents
= &contents
;
8519 r
= readdir_r_cb(d
, _getdir_cb
, (void *)&gr
);
8529 /****** file i/o **********/
8530 int Client::open(const char *relpath
, int flags
, const UserPerm
& perms
,
8531 mode_t mode
, int stripe_unit
, int stripe_count
,
8532 int object_size
, const char *data_pool
)
8534 ldout(cct
, 3) << "open enter(" << relpath
<< ", " << ceph_flags_sys2wire(flags
) << "," << mode
<< ")" << dendl
;
8535 std::lock_guard
lock(client_lock
);
8536 tout(cct
) << "open" << std::endl
;
8537 tout(cct
) << relpath
<< std::endl
;
8538 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
8545 #if defined(__linux__) && defined(O_PATH)
8546 /* When the O_PATH is being specified, others flags than O_DIRECTORY
8547 * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
8548 * in kernel (fs/open.c). */
8550 flags
&= O_DIRECTORY
| O_NOFOLLOW
| O_PATH
;
8553 filepath
path(relpath
);
8555 bool created
= false;
8556 /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
8557 bool followsym
= !((flags
& O_NOFOLLOW
) || ((flags
& O_CREAT
) && (flags
& O_EXCL
)));
8558 int r
= path_walk(path
, &in
, perms
, followsym
, ceph_caps_for_mode(mode
));
8560 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
8563 #if defined(__linux__) && defined(O_PATH)
8564 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
) && !(flags
& O_PATH
))
8566 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
))
8570 if (r
== -ENOENT
&& (flags
& O_CREAT
)) {
8571 filepath dirpath
= path
;
8572 string dname
= dirpath
.last_dentry();
8573 dirpath
.pop_dentry();
8575 r
= path_walk(dirpath
, &dir
, perms
, true,
8576 cct
->_conf
->client_permissions
? CEPH_CAP_AUTH_SHARED
: 0);
8579 if (cct
->_conf
->client_permissions
) {
8580 r
= may_create(dir
.get(), perms
);
8584 r
= _create(dir
.get(), dname
.c_str(), flags
, mode
, &in
, &fh
, stripe_unit
,
8585 stripe_count
, object_size
, data_pool
, &created
, perms
);
8591 // posix says we can only check permissions of existing files
8592 if (cct
->_conf
->client_permissions
) {
8593 r
= may_open(in
.get(), flags
, perms
);
8600 r
= _open(in
.get(), flags
, mode
, &fh
, perms
);
8602 // allocate a integer file descriptor
8605 ceph_assert(fd_map
.count(r
) == 0);
8610 tout(cct
) << r
<< std::endl
;
8611 ldout(cct
, 3) << "open exit(" << path
<< ", " << ceph_flags_sys2wire(flags
) << ") = " << r
<< dendl
;
8615 int Client::open(const char *relpath
, int flags
, const UserPerm
& perms
, mode_t mode
)
8617 /* Use default file striping parameters */
8618 return open(relpath
, flags
, perms
, mode
, 0, 0, 0, NULL
);
8621 int Client::lookup_hash(inodeno_t ino
, inodeno_t dirino
, const char *name
,
8622 const UserPerm
& perms
)
8624 std::lock_guard
lock(client_lock
);
8625 ldout(cct
, 3) << __func__
<< " enter(" << ino
<< ", #" << dirino
<< "/" << name
<< ")" << dendl
;
8630 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPHASH
);
8632 req
->set_filepath(path
);
8634 uint32_t h
= ceph_str_hash(CEPH_STR_HASH_RJENKINS
, name
, strlen(name
));
8636 sprintf(f
, "%u", h
);
8637 filepath
path2(dirino
);
8638 path2
.push_dentry(string(f
));
8639 req
->set_filepath2(path2
);
8641 int r
= make_request(req
, perms
, NULL
, NULL
,
8642 rand() % mdsmap
->get_num_in_mds());
8643 ldout(cct
, 3) << __func__
<< " exit(" << ino
<< ", #" << dirino
<< "/" << name
<< ") = " << r
<< dendl
;
8649 * Load inode into local cache.
8651 * If inode pointer is non-NULL, and take a reference on
8652 * the resulting Inode object in one operation, so that caller
8653 * can safely assume inode will still be there after return.
8655 int Client::_lookup_ino(inodeno_t ino
, const UserPerm
& perms
, Inode
**inode
)
8657 ldout(cct
, 8) << __func__
<< " enter(" << ino
<< ")" << dendl
;
8662 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPINO
);
8664 req
->set_filepath(path
);
8666 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
8667 if (r
== 0 && inode
!= NULL
) {
8668 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
8669 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
8670 ceph_assert(p
!= inode_map
.end());
8674 ldout(cct
, 8) << __func__
<< " exit(" << ino
<< ") = " << r
<< dendl
;
8678 int Client::lookup_ino(inodeno_t ino
, const UserPerm
& perms
, Inode
**inode
)
8680 std::lock_guard
lock(client_lock
);
8681 return _lookup_ino(ino
, perms
, inode
);
8685 * Find the parent inode of `ino` and insert it into
8686 * our cache. Conditionally also set `parent` to a referenced
8687 * Inode* if caller provides non-NULL value.
8689 int Client::_lookup_parent(Inode
*ino
, const UserPerm
& perms
, Inode
**parent
)
8691 ldout(cct
, 8) << __func__
<< " enter(" << ino
->ino
<< ")" << dendl
;
8693 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
8694 filepath
path(ino
->ino
);
8695 req
->set_filepath(path
);
8698 int r
= make_request(req
, perms
, &target
, NULL
, rand() % mdsmap
->get_num_in_mds());
8699 // Give caller a reference to the parent ino if they provided a pointer.
8700 if (parent
!= NULL
) {
8702 *parent
= target
.get();
8704 ldout(cct
, 8) << __func__
<< " found parent " << (*parent
)->ino
<< dendl
;
8709 ldout(cct
, 8) << __func__
<< " exit(" << ino
->ino
<< ") = " << r
<< dendl
;
8714 * Populate the parent dentry for `ino`, provided it is
8715 * a child of `parent`.
8717 int Client::_lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
8719 ceph_assert(parent
->is_dir());
8720 ldout(cct
, 3) << __func__
<< " enter(" << ino
->ino
<< ")" << dendl
;
8725 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
8726 req
->set_filepath2(filepath(parent
->ino
));
8727 req
->set_filepath(filepath(ino
->ino
));
8728 req
->set_inode(ino
);
8730 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
8731 ldout(cct
, 3) << __func__
<< " exit(" << ino
->ino
<< ") = " << r
<< dendl
;
8735 int Client::lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
8737 std::lock_guard
lock(client_lock
);
8738 return _lookup_name(ino
, parent
, perms
);
8741 Fh
*Client::_create_fh(Inode
*in
, int flags
, int cmode
, const UserPerm
& perms
)
8744 Fh
*f
= new Fh(in
, flags
, cmode
, perms
);
8746 ldout(cct
, 10) << __func__
<< " " << in
->ino
<< " mode " << cmode
<< dendl
;
8748 if (in
->snapid
!= CEPH_NOSNAP
) {
8749 in
->snap_cap_refs
++;
8750 ldout(cct
, 5) << "open success, fh is " << f
<< " combined IMMUTABLE SNAP caps "
8751 << ccap_string(in
->caps_issued()) << dendl
;
8754 const auto& conf
= cct
->_conf
;
8755 f
->readahead
.set_trigger_requests(1);
8756 f
->readahead
.set_min_readahead_size(conf
->client_readahead_min
);
8757 uint64_t max_readahead
= Readahead::NO_LIMIT
;
8758 if (conf
->client_readahead_max_bytes
) {
8759 max_readahead
= std::min(max_readahead
, (uint64_t)conf
->client_readahead_max_bytes
);
8761 if (conf
->client_readahead_max_periods
) {
8762 max_readahead
= std::min(max_readahead
, in
->layout
.get_period()*(uint64_t)conf
->client_readahead_max_periods
);
8764 f
->readahead
.set_max_readahead_size(max_readahead
);
8765 vector
<uint64_t> alignments
;
8766 alignments
.push_back(in
->layout
.get_period());
8767 alignments
.push_back(in
->layout
.stripe_unit
);
8768 f
->readahead
.set_alignments(alignments
);
8773 int Client::_release_fh(Fh
*f
)
8775 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
8776 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
8777 Inode
*in
= f
->inode
.get();
8778 ldout(cct
, 8) << __func__
<< " " << f
<< " mode " << f
->mode
<< " on " << *in
<< dendl
;
8782 if (in
->snapid
== CEPH_NOSNAP
) {
8783 if (in
->put_open_ref(f
->mode
)) {
8784 _flush(in
, new C_Client_FlushComplete(this, in
));
8788 ceph_assert(in
->snap_cap_refs
> 0);
8789 in
->snap_cap_refs
--;
8792 _release_filelocks(f
);
8794 // Finally, read any async err (i.e. from flushes)
8795 int err
= f
->take_async_err();
8797 ldout(cct
, 1) << __func__
<< " " << f
<< " on inode " << *in
<< " caught async_err = "
8798 << cpp_strerror(err
) << dendl
;
8800 ldout(cct
, 10) << __func__
<< " " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
8808 void Client::_put_fh(Fh
*f
)
8810 int left
= f
->put();
8816 int Client::_open(Inode
*in
, int flags
, mode_t mode
, Fh
**fhp
,
8817 const UserPerm
& perms
)
8819 if (in
->snapid
!= CEPH_NOSNAP
&&
8820 (flags
& (O_WRONLY
| O_RDWR
| O_CREAT
| O_TRUNC
| O_APPEND
))) {
8824 // use normalized flags to generate cmode
8825 int cflags
= ceph_flags_sys2wire(flags
);
8826 if (cct
->_conf
.get_val
<bool>("client_force_lazyio"))
8827 cflags
|= CEPH_O_LAZY
;
8829 int cmode
= ceph_flags_to_mode(cflags
);
8830 int want
= ceph_caps_for_mode(cmode
);
8833 in
->get_open_ref(cmode
); // make note of pending open, since it effects _wanted_ caps.
8835 if ((flags
& O_TRUNC
) == 0 && in
->caps_issued_mask(want
)) {
8837 check_caps(in
, CHECK_CAPS_NODELAY
);
8840 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
8842 in
->make_nosnap_relative_path(path
);
8843 req
->set_filepath(path
);
8844 req
->head
.args
.open
.flags
= cflags
& ~CEPH_O_CREAT
;
8845 req
->head
.args
.open
.mode
= mode
;
8846 req
->head
.args
.open
.pool
= -1;
8847 if (cct
->_conf
->client_debug_getattr_caps
)
8848 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
8850 req
->head
.args
.open
.mask
= 0;
8851 req
->head
.args
.open
.old_size
= in
->size
; // for O_TRUNC
8853 result
= make_request(req
, perms
);
8856 * NFS expects that delegations will be broken on a conflicting open,
8857 * not just when there is actual conflicting access to the file. SMB leases
8858 * and oplocks also have similar semantics.
8860 * Ensure that clients that have delegations enabled will wait on minimal
8861 * caps during open, just to ensure that other clients holding delegations
8862 * return theirs first.
8864 if (deleg_timeout
&& result
== 0) {
8867 if (cmode
& CEPH_FILE_MODE_WR
)
8868 need
|= CEPH_CAP_FILE_WR
;
8869 if (cmode
& CEPH_FILE_MODE_RD
)
8870 need
|= CEPH_CAP_FILE_RD
;
8872 result
= get_caps(in
, need
, want
, &have
, -1);
8874 ldout(cct
, 8) << "Unable to get caps after open of inode " << *in
<<
8875 " . Denying open: " <<
8876 cpp_strerror(result
) << dendl
;
8877 in
->put_open_ref(cmode
);
8879 put_cap_ref(in
, need
);
8887 *fhp
= _create_fh(in
, flags
, cmode
, perms
);
8889 in
->put_open_ref(cmode
);
8897 int Client::_renew_caps(Inode
*in
)
8899 int wanted
= in
->caps_file_wanted();
8900 if (in
->is_any_caps() &&
8901 ((wanted
& CEPH_CAP_ANY_WR
) == 0 || in
->auth_cap
)) {
8902 check_caps(in
, CHECK_CAPS_NODELAY
);
8907 if ((wanted
& CEPH_CAP_FILE_RD
) && (wanted
& CEPH_CAP_FILE_WR
))
8909 else if (wanted
& CEPH_CAP_FILE_RD
)
8911 else if (wanted
& CEPH_CAP_FILE_WR
)
8914 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
8916 in
->make_nosnap_relative_path(path
);
8917 req
->set_filepath(path
);
8918 req
->head
.args
.open
.flags
= flags
;
8919 req
->head
.args
.open
.pool
= -1;
8920 if (cct
->_conf
->client_debug_getattr_caps
)
8921 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
8923 req
->head
.args
.open
.mask
= 0;
8926 // duplicate in case Cap goes away; not sure if that race is a concern?
8927 const UserPerm
*pperm
= in
->get_best_perms();
8931 int ret
= make_request(req
, perms
);
8935 int Client::close(int fd
)
8937 ldout(cct
, 3) << "close enter(" << fd
<< ")" << dendl
;
8938 std::lock_guard
lock(client_lock
);
8939 tout(cct
) << "close" << std::endl
;
8940 tout(cct
) << fd
<< std::endl
;
8945 Fh
*fh
= get_filehandle(fd
);
8948 int err
= _release_fh(fh
);
8951 ldout(cct
, 3) << "close exit(" << fd
<< ")" << dendl
;
8959 loff_t
Client::lseek(int fd
, loff_t offset
, int whence
)
8961 std::lock_guard
lock(client_lock
);
8962 tout(cct
) << "lseek" << std::endl
;
8963 tout(cct
) << fd
<< std::endl
;
8964 tout(cct
) << offset
<< std::endl
;
8965 tout(cct
) << whence
<< std::endl
;
8970 Fh
*f
= get_filehandle(fd
);
8973 #if defined(__linux__) && defined(O_PATH)
8974 if (f
->flags
& O_PATH
)
8977 return _lseek(f
, offset
, whence
);
8980 loff_t
Client::_lseek(Fh
*f
, loff_t offset
, int whence
)
8982 Inode
*in
= f
->inode
.get();
8983 bool whence_check
= false;
8988 whence_check
= true;
8993 whence_check
= true;
8999 whence_check
= true;
9005 int r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
9016 pos
= f
->pos
+ offset
;
9020 pos
= in
->size
+ offset
;
9025 if (offset
< 0 || static_cast<uint64_t>(offset
) >= in
->size
)
9033 if (offset
< 0 || static_cast<uint64_t>(offset
) >= in
->size
)
9040 ldout(cct
, 1) << __func__
<< ": invalid whence value " << whence
<< dendl
;
9050 ldout(cct
, 8) << "_lseek(" << f
<< ", " << offset
<< ", " << whence
<< ") = " << f
->pos
<< dendl
;
9055 void Client::lock_fh_pos(Fh
*f
)
9057 ldout(cct
, 10) << __func__
<< " " << f
<< dendl
;
9059 if (f
->pos_locked
|| !f
->pos_waiters
.empty()) {
9060 ceph::condition_variable cond
;
9061 f
->pos_waiters
.push_back(&cond
);
9062 ldout(cct
, 10) << __func__
<< " BLOCKING on " << f
<< dendl
;
9063 std::unique_lock l
{client_lock
, std::adopt_lock
};
9064 cond
.wait(l
, [f
, me
=&cond
] {
9065 return !f
->pos_locked
&& f
->pos_waiters
.front() == me
;
9068 ldout(cct
, 10) << __func__
<< " UNBLOCKING on " << f
<< dendl
;
9069 ceph_assert(f
->pos_waiters
.front() == &cond
);
9070 f
->pos_waiters
.pop_front();
9073 f
->pos_locked
= true;
9076 void Client::unlock_fh_pos(Fh
*f
)
9078 ldout(cct
, 10) << __func__
<< " " << f
<< dendl
;
9079 f
->pos_locked
= false;
9082 int Client::uninline_data(Inode
*in
, Context
*onfinish
)
9084 if (!in
->inline_data
.length()) {
9085 onfinish
->complete(0);
9090 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (long long unsigned)in
->ino
);
9091 object_t oid
= oid_buf
;
9093 ObjectOperation create_ops
;
9094 create_ops
.create(false);
9096 objecter
->mutate(oid
,
9097 OSDMap::file_to_object_locator(in
->layout
),
9099 in
->snaprealm
->get_snap_context(),
9100 ceph::real_clock::now(),
9104 bufferlist inline_version_bl
;
9105 encode(in
->inline_version
, inline_version_bl
);
9107 ObjectOperation uninline_ops
;
9108 uninline_ops
.cmpxattr("inline_version",
9109 CEPH_OSD_CMPXATTR_OP_GT
,
9110 CEPH_OSD_CMPXATTR_MODE_U64
,
9112 bufferlist inline_data
= in
->inline_data
;
9113 uninline_ops
.write(0, inline_data
, in
->truncate_size
, in
->truncate_seq
);
9114 uninline_ops
.setxattr("inline_version", stringify(in
->inline_version
));
9116 objecter
->mutate(oid
,
9117 OSDMap::file_to_object_locator(in
->layout
),
9119 in
->snaprealm
->get_snap_context(),
9120 ceph::real_clock::now(),
9129 // blocking osd interface
9131 int Client::read(int fd
, char *buf
, loff_t size
, loff_t offset
)
9133 std::lock_guard
lock(client_lock
);
9134 tout(cct
) << "read" << std::endl
;
9135 tout(cct
) << fd
<< std::endl
;
9136 tout(cct
) << size
<< std::endl
;
9137 tout(cct
) << offset
<< std::endl
;
9142 Fh
*f
= get_filehandle(fd
);
9145 #if defined(__linux__) && defined(O_PATH)
9146 if (f
->flags
& O_PATH
)
9150 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9151 size
= std::min(size
, (loff_t
)INT_MAX
);
9152 int r
= _read(f
, offset
, size
, &bl
);
9153 ldout(cct
, 3) << "read(" << fd
<< ", " << (void*)buf
<< ", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
9155 bl
.begin().copy(bl
.length(), buf
);
9161 int Client::preadv(int fd
, const struct iovec
*iov
, int iovcnt
, loff_t offset
)
9165 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, false);
9168 int64_t Client::_read(Fh
*f
, int64_t offset
, uint64_t size
, bufferlist
*bl
)
9171 bool movepos
= false;
9172 std::unique_ptr
<C_SaferCond
> onuninline
;
9174 const auto& conf
= cct
->_conf
;
9175 Inode
*in
= f
->inode
.get();
9177 utime_t start
= ceph_clock_now();
9179 if ((f
->mode
& CEPH_FILE_MODE_RD
) == 0)
9181 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9188 loff_t start_pos
= offset
;
9190 if (in
->inline_version
== 0) {
9191 r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
9195 ceph_assert(in
->inline_version
> 0);
9199 if (f
->mode
& CEPH_FILE_MODE_LAZY
)
9200 want
= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
;
9202 want
= CEPH_CAP_FILE_CACHE
;
9203 r
= get_caps(in
, CEPH_CAP_FILE_RD
, want
, &have
, -1);
9207 if (f
->flags
& O_DIRECT
)
9208 have
&= ~(CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
);
9210 if (in
->inline_version
< CEPH_INLINE_NONE
) {
9211 if (!(have
& CEPH_CAP_FILE_CACHE
)) {
9212 onuninline
.reset(new C_SaferCond("Client::_read_uninline_data flock"));
9213 uninline_data(in
, onuninline
.get());
9215 uint32_t len
= in
->inline_data
.length();
9216 uint64_t endoff
= offset
+ size
;
9217 if (endoff
> in
->size
)
9221 if (endoff
<= len
) {
9222 bl
->substr_of(in
->inline_data
, offset
, endoff
- offset
);
9224 bl
->substr_of(in
->inline_data
, offset
, len
- offset
);
9225 bl
->append_zero(endoff
- len
);
9227 r
= endoff
- offset
;
9228 } else if ((uint64_t)offset
< endoff
) {
9229 bl
->append_zero(endoff
- offset
);
9230 r
= endoff
- offset
;
9238 if (!conf
->client_debug_force_sync_read
&&
9240 (have
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
))) {
9242 if (f
->flags
& O_RSYNC
) {
9243 _flush_range(in
, offset
, size
);
9245 r
= _read_async(f
, offset
, size
, bl
);
9249 if (f
->flags
& O_DIRECT
)
9250 _flush_range(in
, offset
, size
);
9252 bool checkeof
= false;
9253 r
= _read_sync(f
, offset
, size
, bl
, &checkeof
);
9260 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
9263 r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
9268 if ((uint64_t)offset
< in
->size
)
9274 ceph_assert(r
>= 0);
9277 f
->pos
= start_pos
+ r
;
9280 lat
= ceph_clock_now();
9282 logger
->tinc(l_c_read
, lat
);
9288 client_lock
.unlock();
9289 int ret
= onuninline
->wait();
9291 if (ret
>= 0 || ret
== -ECANCELED
) {
9292 in
->inline_data
.clear();
9293 in
->inline_version
= CEPH_INLINE_NONE
;
9294 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9300 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
9308 Client::C_Readahead::C_Readahead(Client
*c
, Fh
*f
) :
9311 f
->readahead
.inc_pending();
9314 Client::C_Readahead::~C_Readahead() {
9315 f
->readahead
.dec_pending();
9319 void Client::C_Readahead::finish(int r
) {
9320 lgeneric_subdout(client
->cct
, client
, 20) << "client." << client
->get_nodeid() << " " << "C_Readahead on " << f
->inode
<< dendl
;
9321 client
->put_cap_ref(f
->inode
.get(), CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
9324 int Client::_read_async(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
)
9326 const auto& conf
= cct
->_conf
;
9327 Inode
*in
= f
->inode
.get();
9329 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
9331 // trim read based on file size?
9332 if (off
>= in
->size
)
9336 if (off
+ len
> in
->size
) {
9337 len
= in
->size
- off
;
9340 ldout(cct
, 10) << " min_bytes=" << f
->readahead
.get_min_readahead_size()
9341 << " max_bytes=" << f
->readahead
.get_max_readahead_size()
9342 << " max_periods=" << conf
->client_readahead_max_periods
<< dendl
;
9344 // read (and possibly block)
9346 C_SaferCond
onfinish("Client::_read_async flock");
9347 r
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
9348 off
, len
, bl
, 0, &onfinish
);
9350 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
9351 client_lock
.unlock();
9352 r
= onfinish
.wait();
9354 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
9357 if(f
->readahead
.get_min_readahead_size() > 0) {
9358 pair
<uint64_t, uint64_t> readahead_extent
= f
->readahead
.update(off
, len
, in
->size
);
9359 if (readahead_extent
.second
> 0) {
9360 ldout(cct
, 20) << "readahead " << readahead_extent
.first
<< "~" << readahead_extent
.second
9361 << " (caller wants " << off
<< "~" << len
<< ")" << dendl
;
9362 Context
*onfinish2
= new C_Readahead(this, f
);
9363 int r2
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
9364 readahead_extent
.first
, readahead_extent
.second
,
9365 NULL
, 0, onfinish2
);
9367 ldout(cct
, 20) << "readahead initiated, c " << onfinish2
<< dendl
;
9368 get_cap_ref(in
, CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
9370 ldout(cct
, 20) << "readahead was no-op, already cached" << dendl
;
9379 int Client::_read_sync(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
,
9382 Inode
*in
= f
->inode
.get();
9387 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
9390 C_SaferCond
onfinish("Client::_read_sync flock");
9394 filer
->read_trunc(in
->ino
, &in
->layout
, in
->snapid
,
9396 in
->truncate_size
, in
->truncate_seq
,
9398 client_lock
.unlock();
9399 int r
= onfinish
.wait();
9402 // if we get ENOENT from OSD, assume 0 bytes returned
9413 bl
->claim_append(tbl
);
9416 if (r
>= 0 && r
< wanted
) {
9417 if (pos
< in
->size
) {
9418 // zero up to known EOF
9419 int64_t some
= in
->size
- pos
;
9422 auto z
= buffer::ptr_node::create(some
);
9424 bl
->push_back(std::move(z
));
9441 * we keep count of uncommitted sync writes on the inode, so that
9444 void Client::_sync_write_commit(Inode
*in
)
9446 ceph_assert(unsafe_sync_write
> 0);
9447 unsafe_sync_write
--;
9449 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9451 ldout(cct
, 15) << __func__
<< " unsafe_sync_write = " << unsafe_sync_write
<< dendl
;
9452 if (unsafe_sync_write
== 0 && unmounting
) {
9453 ldout(cct
, 10) << __func__
<< " -- no more unsafe writes, unmount can proceed" << dendl
;
9454 mount_cond
.notify_all();
9458 int Client::write(int fd
, const char *buf
, loff_t size
, loff_t offset
)
9460 std::lock_guard
lock(client_lock
);
9461 tout(cct
) << "write" << std::endl
;
9462 tout(cct
) << fd
<< std::endl
;
9463 tout(cct
) << size
<< std::endl
;
9464 tout(cct
) << offset
<< std::endl
;
9469 Fh
*fh
= get_filehandle(fd
);
9472 #if defined(__linux__) && defined(O_PATH)
9473 if (fh
->flags
& O_PATH
)
9476 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9477 size
= std::min(size
, (loff_t
)INT_MAX
);
9478 int r
= _write(fh
, offset
, size
, buf
, NULL
, false);
9479 ldout(cct
, 3) << "write(" << fd
<< ", \"...\", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
9483 int Client::pwritev(int fd
, const struct iovec
*iov
, int iovcnt
, int64_t offset
)
9487 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, true);
9490 int64_t Client::_preadv_pwritev_locked(Fh
*fh
, const struct iovec
*iov
,
9491 unsigned iovcnt
, int64_t offset
, bool write
,
9494 #if defined(__linux__) && defined(O_PATH)
9495 if (fh
->flags
& O_PATH
)
9498 loff_t totallen
= 0;
9499 for (unsigned i
= 0; i
< iovcnt
; i
++) {
9500 totallen
+= iov
[i
].iov_len
;
9504 * Some of the API functions take 64-bit size values, but only return
9505 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
9506 * we don't do I/Os larger than the values we can return.
9509 totallen
= std::min(totallen
, (loff_t
)INT_MAX
);
9512 int64_t w
= _write(fh
, offset
, totallen
, NULL
, iov
, iovcnt
);
9513 ldout(cct
, 3) << "pwritev(" << fh
<< ", \"...\", " << totallen
<< ", " << offset
<< ") = " << w
<< dendl
;
9517 int64_t r
= _read(fh
, offset
, totallen
, &bl
);
9518 ldout(cct
, 3) << "preadv(" << fh
<< ", " << offset
<< ") = " << r
<< dendl
;
9522 auto iter
= bl
.cbegin();
9523 for (unsigned j
= 0, resid
= r
; j
< iovcnt
&& resid
> 0; j
++) {
9525 * This piece of code aims to handle the case that bufferlist does not have enough data
9526 * to fill in the iov
9528 const auto round_size
= std::min
<unsigned>(resid
, iov
[j
].iov_len
);
9529 iter
.copy(round_size
, reinterpret_cast<char*>(iov
[j
].iov_base
));
9530 resid
-= round_size
;
9531 /* iter is self-updating */
9537 int Client::_preadv_pwritev(int fd
, const struct iovec
*iov
, unsigned iovcnt
, int64_t offset
, bool write
)
9539 std::lock_guard
lock(client_lock
);
9540 tout(cct
) << fd
<< std::endl
;
9541 tout(cct
) << offset
<< std::endl
;
9546 Fh
*fh
= get_filehandle(fd
);
9549 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, offset
, write
, true);
9552 int64_t Client::_write(Fh
*f
, int64_t offset
, uint64_t size
, const char *buf
,
9553 const struct iovec
*iov
, int iovcnt
)
9557 if ((uint64_t)(offset
+size
) > mdsmap
->get_max_filesize()) //too large!
9560 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
9561 Inode
*in
= f
->inode
.get();
9563 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
9567 ceph_assert(in
->snapid
== CEPH_NOSNAP
);
9569 // was Fh opened as writeable?
9570 if ((f
->mode
& CEPH_FILE_MODE_WR
) == 0)
9573 // use/adjust fd pos?
9577 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
9578 * change out from under us.
9580 if (f
->flags
& O_APPEND
) {
9581 auto r
= _lseek(f
, 0, SEEK_END
);
9593 uint64_t endoff
= offset
+ size
;
9594 if (endoff
> in
->size
&& is_quota_bytes_exceeded(in
, endoff
- in
->size
,
9599 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9601 ldout(cct
, 10) << "cur file size is " << in
->size
<< dendl
;
9604 utime_t start
= ceph_clock_now();
9606 if (in
->inline_version
== 0) {
9607 int r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
9610 ceph_assert(in
->inline_version
> 0);
9613 // copy into fresh buffer (since our write may be resub, async)
9617 bl
.append(buf
, size
);
9619 for (int i
= 0; i
< iovcnt
; i
++) {
9620 if (iov
[i
].iov_len
> 0) {
9621 bl
.append((const char *)iov
[i
].iov_base
, iov
[i
].iov_len
);
9627 uint64_t totalwritten
;
9629 if (f
->mode
& CEPH_FILE_MODE_LAZY
)
9630 want
= CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
;
9632 want
= CEPH_CAP_FILE_BUFFER
;
9633 int r
= get_caps(in
, CEPH_CAP_FILE_WR
|CEPH_CAP_AUTH_SHARED
, want
, &have
, endoff
);
9637 /* clear the setuid/setgid bits, if any */
9638 if (unlikely(in
->mode
& (S_ISUID
|S_ISGID
)) && size
> 0) {
9639 struct ceph_statx stx
= { 0 };
9641 put_cap_ref(in
, CEPH_CAP_AUTH_SHARED
);
9642 r
= __setattrx(in
, &stx
, CEPH_SETATTR_KILL_SGUID
, f
->actor_perms
);
9646 put_cap_ref(in
, CEPH_CAP_AUTH_SHARED
);
9649 if (f
->flags
& O_DIRECT
)
9650 have
&= ~(CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
);
9652 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
9654 std::unique_ptr
<C_SaferCond
> onuninline
= nullptr;
9656 if (in
->inline_version
< CEPH_INLINE_NONE
) {
9657 if (endoff
> cct
->_conf
->client_max_inline_size
||
9658 endoff
> CEPH_INLINE_MAX_SIZE
||
9659 !(have
& CEPH_CAP_FILE_BUFFER
)) {
9660 onuninline
.reset(new C_SaferCond("Client::_write_uninline_data flock"));
9661 uninline_data(in
, onuninline
.get());
9663 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9665 uint32_t len
= in
->inline_data
.length();
9668 in
->inline_data
.begin(endoff
).copy(len
- endoff
, bl
); // XXX
9671 in
->inline_data
.splice(offset
, len
- offset
);
9672 else if (offset
> len
)
9673 in
->inline_data
.append_zero(offset
- len
);
9675 in
->inline_data
.append(bl
);
9676 in
->inline_version
++;
9678 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9684 if (cct
->_conf
->client_oc
&&
9685 (have
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
))) {
9686 // do buffered write
9687 if (!in
->oset
.dirty_or_tx
)
9688 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
9690 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9692 // async, caching, non-blocking.
9693 r
= objectcacher
->file_write(&in
->oset
, &in
->layout
,
9694 in
->snaprealm
->get_snap_context(),
9695 offset
, size
, bl
, ceph::real_clock::now(),
9697 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9702 // flush cached write if O_SYNC is set on file fh
9703 // O_DSYNC == O_SYNC on linux < 2.6.33
9704 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
9705 if ((f
->flags
& O_SYNC
) || (f
->flags
& O_DSYNC
)) {
9706 _flush_range(in
, offset
, size
);
9709 if (f
->flags
& O_DIRECT
)
9710 _flush_range(in
, offset
, size
);
9712 // simple, non-atomic sync write
9713 C_SaferCond
onfinish("Client::_write flock");
9714 unsafe_sync_write
++;
9715 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
); // released by onsafe callback
9717 filer
->write_trunc(in
->ino
, &in
->layout
, in
->snaprealm
->get_snap_context(),
9718 offset
, size
, bl
, ceph::real_clock::now(), 0,
9719 in
->truncate_size
, in
->truncate_seq
,
9721 client_lock
.unlock();
9724 _sync_write_commit(in
);
9727 // if we get here, write was successful, update client metadata
9730 lat
= ceph_clock_now();
9732 logger
->tinc(l_c_wrlat
, lat
);
9739 totalwritten
= size
;
9740 r
= (int64_t)totalwritten
;
9743 if (totalwritten
+ offset
> in
->size
) {
9744 in
->size
= totalwritten
+ offset
;
9745 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9747 if (is_quota_bytes_approaching(in
, f
->actor_perms
)) {
9748 check_caps(in
, CHECK_CAPS_NODELAY
);
9749 } else if (is_max_size_approaching(in
)) {
9753 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", extending file size" << dendl
;
9755 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", leaving file size at " << in
->size
<< dendl
;
9759 in
->mtime
= in
->ctime
= ceph_clock_now();
9761 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9765 if (nullptr != onuninline
) {
9766 client_lock
.unlock();
9767 int uninline_ret
= onuninline
->wait();
9770 if (uninline_ret
>= 0 || uninline_ret
== -ECANCELED
) {
9771 in
->inline_data
.clear();
9772 in
->inline_version
= CEPH_INLINE_NONE
;
9773 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9779 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
9783 int Client::_flush(Fh
*f
)
9785 Inode
*in
= f
->inode
.get();
9786 int err
= f
->take_async_err();
9788 ldout(cct
, 1) << __func__
<< ": " << f
<< " on inode " << *in
<< " caught async_err = "
9789 << cpp_strerror(err
) << dendl
;
9791 ldout(cct
, 10) << __func__
<< ": " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
9797 int Client::truncate(const char *relpath
, loff_t length
, const UserPerm
& perms
)
9799 struct ceph_statx stx
;
9800 stx
.stx_size
= length
;
9801 return setattrx(relpath
, &stx
, CEPH_SETATTR_SIZE
, perms
);
9804 int Client::ftruncate(int fd
, loff_t length
, const UserPerm
& perms
)
9806 std::lock_guard
lock(client_lock
);
9807 tout(cct
) << __func__
<< std::endl
;
9808 tout(cct
) << fd
<< std::endl
;
9809 tout(cct
) << length
<< std::endl
;
9814 Fh
*f
= get_filehandle(fd
);
9817 #if defined(__linux__) && defined(O_PATH)
9818 if (f
->flags
& O_PATH
)
9822 attr
.st_size
= length
;
9823 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_SIZE
, perms
);
9826 int Client::fsync(int fd
, bool syncdataonly
)
9828 std::lock_guard
lock(client_lock
);
9829 tout(cct
) << "fsync" << std::endl
;
9830 tout(cct
) << fd
<< std::endl
;
9831 tout(cct
) << syncdataonly
<< std::endl
;
9836 Fh
*f
= get_filehandle(fd
);
9839 #if defined(__linux__) && defined(O_PATH)
9840 if (f
->flags
& O_PATH
)
9843 int r
= _fsync(f
, syncdataonly
);
9845 // The IOs in this fsync were okay, but maybe something happened
9846 // in the background that we shoudl be reporting?
9847 r
= f
->take_async_err();
9848 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
9849 << ") = 0, async_err = " << r
<< dendl
;
9851 // Assume that an error we encountered during fsync, even reported
9852 // synchronously, would also have applied the error to the Fh, and we
9853 // should clear it here to avoid returning the same error again on next
9855 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
<< ") = "
9857 f
->take_async_err();
9862 int Client::_fsync(Inode
*in
, bool syncdataonly
)
9865 std::unique_ptr
<C_SaferCond
> object_cacher_completion
= nullptr;
9866 ceph_tid_t flush_tid
= 0;
9869 utime_t start
= ceph_clock_now();
9871 ldout(cct
, 8) << "_fsync on " << *in
<< " " << (syncdataonly
? "(dataonly)":"(data+metadata)") << dendl
;
9873 if (cct
->_conf
->client_oc
) {
9874 object_cacher_completion
.reset(new C_SaferCond("Client::_fsync::lock"));
9875 tmp_ref
= in
; // take a reference; C_SaferCond doesn't and _flush won't either
9876 _flush(in
, object_cacher_completion
.get());
9877 ldout(cct
, 15) << "using return-valued form of _fsync" << dendl
;
9880 if (!syncdataonly
&& in
->dirty_caps
) {
9881 check_caps(in
, CHECK_CAPS_NODELAY
|CHECK_CAPS_SYNCHRONOUS
);
9882 if (in
->flushing_caps
)
9883 flush_tid
= last_flush_tid
;
9884 } else ldout(cct
, 10) << "no metadata needs to commit" << dendl
;
9886 if (!syncdataonly
&& !in
->unsafe_ops
.empty()) {
9889 MetaRequest
*req
= in
->unsafe_ops
.back();
9890 ldout(cct
, 15) << "waiting on unsafe requests, last tid " << req
->get_tid() << dendl
;
9893 wait_on_list(req
->waitfor_safe
);
9897 if (nullptr != object_cacher_completion
) { // wait on a real reply instead of guessing
9898 client_lock
.unlock();
9899 ldout(cct
, 15) << "waiting on data to flush" << dendl
;
9900 r
= object_cacher_completion
->wait();
9902 ldout(cct
, 15) << "got " << r
<< " from flush writeback" << dendl
;
9904 // FIXME: this can starve
9905 while (in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] > 0) {
9906 ldout(cct
, 10) << "ino " << in
->ino
<< " has " << in
->cap_refs
[CEPH_CAP_FILE_BUFFER
]
9907 << " uncommitted, waiting" << dendl
;
9908 wait_on_list(in
->waitfor_commit
);
9914 wait_sync_caps(in
, flush_tid
);
9916 ldout(cct
, 10) << "ino " << in
->ino
<< " has no uncommitted writes" << dendl
;
9918 ldout(cct
, 8) << "ino " << in
->ino
<< " failed to commit to disk! "
9919 << cpp_strerror(-r
) << dendl
;
9922 lat
= ceph_clock_now();
9924 logger
->tinc(l_c_fsync
, lat
);
9929 int Client::_fsync(Fh
*f
, bool syncdataonly
)
9931 ldout(cct
, 8) << "_fsync(" << f
<< ", " << (syncdataonly
? "dataonly)":"data+metadata)") << dendl
;
9932 return _fsync(f
->inode
.get(), syncdataonly
);
// fstat(2) analogue: refresh inode attributes (per 'mask') from the MDS
// via _getattr(), then fill *stbuf from the cached inode.
// NOTE(review): the error-guard lines (invalid fd, _getattr failure) and
// the final return are not visible in this extract.
int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
  std::lock_guard lock(client_lock);            // serialize with other client ops
  tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
  tout(cct) << fd << std::endl;
  Fh *f = get_filehandle(fd);                   // map fd -> open file handle
  int r = _getattr(f->inode, mask, perms);      // fetch fresh attrs from MDS
  fill_stat(f->inode, stbuf, NULL);             // copy cached attrs into caller's struct
  ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
// statx-style fstat: only round-trips to the MDS when the requested
// attribute mask is not already covered by this client's issued caps.
// [extract note: guard/return lines are absent from this view]
int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
		   unsigned int want, unsigned int flags)
  std::lock_guard lock(client_lock);
  tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
  tout(cct) << fd << std::endl;
  Fh *f = get_filehandle(fd);
  unsigned mask = statx_to_mask(flags, want);   // translate statx want/flags into a cap mask
  // Caps insufficient to answer locally: ask the MDS.
  if (mask && !f->inode->caps_issued_mask(mask, true)) {
    r = _getattr(f->inode, mask, perms);
    ldout(cct, 3) << "fstatx exit on error!" << dendl;
  fill_statx(f->inode, mask, stx);
  ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
9985 // not written yet, but i want to link!
// Change this client's working directory to 'relpath'; on success the
// new absolute cwd is returned through 'new_cwd' via _getcwd().
// NOTE(review): 'in' (InodeRef) is declared on a line dropped from this
// extract, as are the error-return branches.
int Client::chdir(const char *relpath, std::string &new_cwd,
		  const UserPerm& perms)
  std::lock_guard lock(client_lock);
  tout(cct) << "chdir" << std::endl;
  tout(cct) << relpath << std::endl;
  filepath path(relpath);
  int r = path_walk(path, &in, perms);   // resolve target inode
  if (!(in.get()->is_dir()))             // chdir target must be a directory
  ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
  _getcwd(new_cwd, perms);               // report the new absolute path back
// Rebuild the absolute path of the current working directory by walking
// parent dentries from 'cwd' up to 'root', prepending each dentry name.
// If an ancestor has no locally-known dentry, issue CEPH_MDS_OP_LOOKUPNAME
// to the MDS to recover the parent linkage.  Caller holds client_lock.
// NOTE(review): the outer accumulator 'path' is declared on a line dropped
// from this extract; branch/return lines are likewise absent.
void Client::_getcwd(string& dir, const UserPerm& perms)
  ldout(cct, 10) << __func__ << " " << *cwd << dendl;
  Inode *in = cwd.get();
  while (in != root) {
    ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
    // A cwd or ancester is unlinked
    if (in->dentries.empty()) {
    Dentry *dn = in->get_first_parent();
    // No parent dentry cached: ask the MDS for this inode's name.
    ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
    filepath path(in->ino);
    req->set_filepath(path);
    req->set_inode(in);
    int res = make_request(req, perms);
    path.push_front_dentry(dn->name);   // prepend this component to the outer 'path'
    in = dn->dir->parent_inode;         // climb one level toward root
  dir += path.get_path();
// Public getcwd: take client_lock, then delegate to _getcwd().
void Client::getcwd(string& dir, const UserPerm& perms)
  std::lock_guard l(client_lock);
  _getcwd(dir, perms);
// statvfs analogue: report filesystem space/file counts in 4MB blocks.
// Space comes from RADOS (single data pool queried directly, otherwise
// cluster-wide totals); file count comes from the root's recursive rstat.
// If a byte quota applies at/above the mount root and client_quota_df is
// set, the quota is reported as the filesystem size instead.
// NOTE(review): 'stats' and 'cond' are declared on lines dropped from this
// extract, as are the if/else and return lines around the fragments below.
int Client::statfs(const char *path, struct statvfs *stbuf,
		   const UserPerm& perms)
  std::lock_guard l(client_lock);
  tout(cct) << __func__ << std::endl;
  unsigned long int total_files_on_fs;

  // Query RADOS for space usage.
  const vector<int64_t> &data_pools = mdsmap->get_data_pools();
  if (data_pools.size() == 1) {
    objecter->get_fs_stats(stats, data_pools[0], &cond);
    objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);

  // Drop client_lock while blocking on the objecter reply.
  client_lock.unlock();
  int rval = cond.wait();
  total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs; // recursive file+dir count
  client_lock.lock();

  ldout(cct, 1) << "underlying call to statfs returned error: "
		<< cpp_strerror(rval)

  memset(stbuf, 0, sizeof(*stbuf));

  /*
   * we're going to set a block size of 4MB so we can represent larger
   * FSes without overflowing. Additionally convert the space
   * measurements from KB to bytes while making them in terms of
   * blocks. We use 4MB only because it is big enough, and because it
   * actually *is* the (ceph) default block size.
   */
  const int CEPH_BLOCK_SHIFT = 22;
  stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_files = total_files_on_fs;
  stbuf->f_ffree = 0;
  stbuf->f_favail = -1;
  stbuf->f_fsid = -1;       // ??
  stbuf->f_flag = 0;        // ??
  stbuf->f_namemax = NAME_MAX;

  // Usually quota_root will == root_ancestor, but if the mount root has no
  // quota but we can see a parent of it that does have a quota, we'll
  // respect that one instead.
  ceph_assert(root != nullptr);
  Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);

  // get_quota_root should always give us something
  // because client quotas are always enabled
  ceph_assert(quota_root != nullptr);

  if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
    // Skip the getattr if any sessions are stale, as we don't want to
    // block `df` if this client has e.g. been evicted, or if the MDS cluster
    if (!_any_stale_sessions()) {
      int r = _getattr(quota_root, 0, perms, true);
      // Ignore return value: error getting latest inode metadata is not a good
      // reason to break "df".
      lderr(cct) << "Error in getattr on quota root 0x"
		 << std::hex << quota_root->ino << std::dec
		 << " statfs result may be outdated" << dendl;

    // Special case: if there is a size quota set on the Inode acting
    // as the root for this client mount, then report the quota status
    // as the filesystem statistics.
    const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
    const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
    // It is possible for a quota to be exceeded: arithmetic here must
    // handle case where used > total.
    const fsblkcnt_t free = total > used ? total - used : 0;

    stbuf->f_blocks = total;
    stbuf->f_bfree = free;
    stbuf->f_bavail = free;

    // General case: report the cluster statistics returned from RADOS. Because
    // multiple pools may be used without one filesystem namespace via
    // layouts, this is the most correct thing we can do.
    stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
// Core advisory-lock path shared by fcntl (POSIX) and flock (BSD) locks.
// Translates the caller's struct flock into a CEPH_MDS_OP_SET/GETFILELOCK
// MetaRequest, sends it to the MDS, and on success mirrors the result
// into the client-side lock-state tracking on the inode and file handle.
// NOTE(review): 'lock_cmd', 'path', 'ret', 'bl' and several guard/else/
// return lines are on lines dropped from this extract.
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
  ldout(cct, 10) << __func__ << " ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  // Map POSIX lock type -> ceph lock command.
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;

  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  // Build the MDS request describing the lock change.
  MetaRequest *req = new MetaRequest(op);
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);
  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  // Blocking lock with an interrupt callback registered: allow the
  // request to be aborted by a later 'lock intr' request.
  if (sleep && switch_interrupt_cb) {
    // enable interrupt
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);

  if (op == CEPH_MDS_OP_GETFILELOCK) {
    // Decode the conflicting lock (if any) returned by the MDS back
    // into the caller's struct flock.
    ceph_filelock filelock;
    auto p = bl.cbegin();
    decode(filelock, p);
    if (CEPH_LOCK_SHARED == filelock.type)
      fl->l_type = F_RDLCK;
    else if (CEPH_LOCK_EXCL == filelock.type)
      fl->l_type = F_WRLCK;
      fl->l_type = F_UNLCK;
    fl->l_whence = SEEK_SET;
    fl->l_start = filelock.start;
    fl->l_len = filelock.length;
    fl->l_pid = filelock.pid;
  } else if (op == CEPH_MDS_OP_SETFILELOCK) {
    // Mirror the successful change into the inode's lock state
    // (lazily allocating the per-type state object)...
    ceph_lock_state_t *lock_state;
    if (lock_type == CEPH_LOCK_FCNTL) {
      if (!in->fcntl_locks)
	in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
      lock_state = in->fcntl_locks.get();
    } else if (lock_type == CEPH_LOCK_FLOCK) {
      if (!in->flock_locks)
	in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
      lock_state = in->flock_locks.get();
    _update_lock_state(fl, owner, lock_state);

    // ...and into the file handle's lock state as well.
    if (lock_type == CEPH_LOCK_FCNTL) {
      if (!fh->fcntl_locks)
	fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
      lock_state = fh->fcntl_locks.get();
      if (!fh->flock_locks)
	fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
      lock_state = fh->flock_locks.get();
    _update_lock_state(fl, owner, lock_state);
// Interrupt a pending blocking file-lock request: mark the original
// request aborted (-EINTR), then send a companion *_INTR unlock request
// so the MDS cancels the queued lock.
// NOTE(review): 'lock_type' and the early-return condition preceding
// "return 0" are on lines dropped from this extract.
int Client::_interrupt_filelock(MetaRequest *req)
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  return 0; // haven't sent the request
  Inode *in = req->inode();

  // Pick the interrupt rule matching the original lock family.
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;

  // Build an unlock request that mirrors the original lock's arguments.
  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
// Serialize all held fcntl and flock locks on 'in' into 'bl' as
// (count, lock...) pairs — fcntl locks first, then flock locks.
// [extract note: loop increments/braces and the early return after the
//  "no locks" check are absent from this view]
void Client::_encode_filelocks(Inode *in, bufferlist& bl)
  if (!in->fcntl_locks && !in->flock_locks)

  unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
  encode(nr_fcntl_locks, bl);
  if (nr_fcntl_locks) {
    auto &lock_state = in->fcntl_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
	p != lock_state->held_locks.end();
      encode(p->second, bl);

  unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
  encode(nr_flock_locks, bl);
  if (nr_flock_locks) {
    auto &lock_state = in->flock_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
	p != lock_state->held_locks.end();
      encode(p->second, bl);

  ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
		 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
// Release every lock held through file handle 'fh' (called on close):
// collect held fcntl/flock locks, drop the local state, then issue an
// F_UNLCK SETFILELOCK to the MDS for each — unless the session already
// closed (no caps), in which case the MDS has released them itself.
// NOTE(review): 'fl' declaration, loop increments, and early-return lines
// are on lines dropped from this extract.
void Client::_release_filelocks(Fh *fh)
  if (!fh->fcntl_locks && !fh->flock_locks)

  Inode *in = fh->inode.get();
  ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;

  list<pair<int, ceph_filelock> > to_release;

  // Snapshot and clear the handle's fcntl locks.
  if (fh->fcntl_locks) {
    auto &lock_state = fh->fcntl_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
	p != lock_state->held_locks.end();
      to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
    lock_state.reset();
  // Snapshot and clear the handle's flock locks.
  if (fh->flock_locks) {
    auto &lock_state = fh->flock_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
	p != lock_state->held_locks.end();
      to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
    lock_state.reset();

  if (to_release.empty())
  // mds has already released filelocks if session was closed.
  if (in->caps.empty())

  // Template unlock request; start/len/pid filled per released lock.
  memset(&fl, 0, sizeof(fl));
  fl.l_whence = SEEK_SET;
  fl.l_type = F_UNLCK;

  for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
       p != to_release.end();
    fl.l_start = p->second.start;
    fl.l_len = p->second.length;
    fl.l_pid = p->second.pid;
    _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
		 p->second.owner, true);
// Apply a lock change that the MDS accepted to the client-side
// ceph_lock_state_t mirror: unlocks are removed, other types added.
// NOTE(review): 'lock_cmd' is declared on a line dropped from this
// extract; the stray ";;" below is in the original source.
void Client::_update_lock_state(struct flock *fl, uint64_t owner,
				ceph_lock_state_t *lock_state)
  // Map POSIX lock type -> ceph lock command (default: unlock).
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
    lock_cmd = CEPH_LOCK_UNLOCK;;

  // Build the internal lock record from the caller's flock.
  ceph_filelock filelock;
  filelock.start = fl->l_start;
  filelock.length = fl->l_len;
  filelock.client = 0;
  // see comment in _do_filelock()
  filelock.owner = owner | (1ULL << 63);
  filelock.pid = fl->l_pid;
  filelock.type = lock_cmd;

  if (filelock.type == CEPH_LOCK_UNLOCK) {
    list<ceph_filelock> activated_locks;
    lock_state->remove_lock(filelock, activated_locks);
    bool r = lock_state->add_lock(filelock, false, false, NULL);
// F_GETLK: query the MDS for a conflicting fcntl lock; result is
// written back into *fl by _do_filelock().
int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
  Inode *in = fh->inode.get();
  ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
  int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
// F_SETLK/F_SETLKW: set or clear an fcntl lock via the MDS;
// 'sleep' nonzero means block until the lock can be granted.
int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
  Inode *in = fh->inode.get();
  ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
  int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
  ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
// flock(2): BSD-style whole-file lock via the MDS. LOCK_NB in 'cmd'
// disables blocking.
// NOTE(review): 'fl' declaration and the LOCK_SH/LOCK_EX/LOCK_UN ->
// fl.l_type translation are on lines dropped from this extract.
int Client::_flock(Fh *fh, int cmd, uint64_t owner)
  Inode *in = fh->inode.get();
  ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
  int sleep = !(cmd & LOCK_NB);       // block unless LOCK_NB requested
  memset(&fl, 0, sizeof(fl));
  fl.l_whence = SEEK_SET;
  int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
  ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
// Low-level statfs entry point: delegates to statfs(), which takes
// client_lock itself (so no locking here).
int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
  /* Since the only thing this does is wrap a call to statfs, and
     statfs takes a lock, it doesn't seem we have a need to split it
  return statfs(0, stbuf, perms);
// Register the libcephfs consumer's callbacks (invalidation, interrupt,
// remount, release, umask). Each registered callback also starts the
// finisher/invalidator thread that will deliver it asynchronously.
void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
  std::lock_guard l(client_lock);
  ldout(cct, 10) << __func__ << " cb " << args->handle
		 << " invalidate_ino_cb " << args->ino_cb
		 << " invalidate_dentry_cb " << args->dentry_cb
		 << " switch_interrupt_cb " << args->switch_intr_cb
		 << " remount_cb " << args->remount_cb
  callback_handle = args->handle;
  if (args->ino_cb) {
    ino_invalidate_cb = args->ino_cb;
    async_ino_invalidator.start();
  if (args->dentry_cb) {
    dentry_invalidate_cb = args->dentry_cb;
    async_dentry_invalidator.start();
  if (args->switch_intr_cb) {
    switch_interrupt_cb = args->switch_intr_cb;
    interrupt_finisher.start();
  if (args->remount_cb) {
    remount_cb = args->remount_cb;
    remount_finisher.start();
  if (args->ino_release_cb) {
    ino_release_cb = args->ino_release_cb;
    async_ino_releasor.start();
  if (args->umask_cb)
    umask_cb = args->umask_cb;
// Verify the client has a working way to drop dentries: either the
// registered dentry-invalidate callback, or (failing that) a test
// remount via remount_cb.
// NOTE(review): 'r' declaration and the else/return lines are on lines
// dropped from this extract.
int Client::test_dentry_handling(bool can_invalidate)
  can_invalidate_dentries = can_invalidate;
  if (can_invalidate_dentries) {
    ceph_assert(dentry_invalidate_cb);
    ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
    ceph_assert(remount_cb);
    ldout(cct, 1) << "using remount_cb" << dendl;
    r = _do_remount(false);
// Flush everything this client has outstanding: dirty cached data (if
// the object cacher is enabled), unsafe MDS requests, and dirty caps up
// to the current flush tid; then wait for the data flush to complete
// with client_lock dropped.
// NOTE(review): the cond->wait() call between the two log lines is on a
// line dropped from this extract.
int Client::_sync_fs()
  ldout(cct, 10) << __func__ << dendl;

  // flush file data (object cacher only)
  std::unique_ptr<C_SaferCond> cond = nullptr;
  if (cct->_conf->client_oc) {
    cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
    objectcacher->flush_all(cond.get());

  // capture the cap-flush high-water mark before waiting
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();
  wait_sync_caps(flush_tid);

  if (nullptr != cond) {
    client_lock.unlock();    // don't hold the lock while data flushes
    ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
    ldout(cct, 15) << __func__ << " flush finished" << dendl;
    client_lock.lock();
// Public sync_fs: take client_lock, then delegate to _sync_fs()
// (the delegating return is on a line dropped from this extract).
int Client::sync_fs()
  std::lock_guard l(client_lock);
// Drop all clean objects from the object cacher; returns the count
// released by ObjectCacher::release_all().
int64_t Client::drop_caches()
  std::lock_guard l(client_lock);
  return objectcacher->release_all();
// Toggle CEPH_FILE_MODE_LAZY on an open file handle. The open-ref is
// re-taken under the new mode before the old mode's ref is dropped, so
// the inode's open count stays consistent; enabling lazy IO also kicks
// check_caps().  No-op if the handle is already in the requested state.
// [extract note: the enable/disable if/else and return lines are absent
//  from this view]
int Client::_lazyio(Fh *fh, int enable)
  Inode *in = fh->inode.get();
  ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;

  if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)  // already in requested state

  int orig_mode = fh->mode;
  // enable path
  fh->mode |= CEPH_FILE_MODE_LAZY;
  in->get_open_ref(fh->mode);
  in->put_open_ref(orig_mode);
  check_caps(in, CHECK_CAPS_NODELAY);
  // disable path
  fh->mode &= ~CEPH_FILE_MODE_LAZY;
  in->get_open_ref(fh->mode);
  in->put_open_ref(orig_mode);
// fd-based lazy-IO toggle: resolve the handle, then delegate to _lazyio().
int Client::lazyio(int fd, int enable)
  std::lock_guard l(client_lock);
  Fh *f = get_filehandle(fd);
  return _lazyio(f, enable);
// Low-level lazy-IO toggle on an already-open handle.
int Client::ll_lazyio(Fh *fh, int enable)
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
  tout(cct) << __func__ << std::endl;
  return _lazyio(fh, enable);
// Lazy-IO propagate: push this client's buffered writes in [offset,
// offset+count) out to the cluster.
// NOTE(review): the flush logic after the handle lookup is on lines
// dropped from this extract.
int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
  std::lock_guard l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
		<< ", " << offset << ", " << count << ")" << dendl;
  Fh *f = get_filehandle(fd);
// Lazy-IO synchronize: drop cached data for the inode (_release) and,
// if anything was released, refresh the file size from the MDS so
// subsequent reads see other clients' writes.
int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
  std::lock_guard l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
		<< ", " << offset << ", " << count << ")" << dendl;
  Fh *f = get_filehandle(fd);
  Inode *in = f->inode.get();
  if (_release(in)) {
    int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
10655 // =============================
// Create snapshot 'name' of the directory at 'relpath': resolve the
// dir, check create permission (if client_permissions), then mkdir the
// snap name inside the directory's virtual .snap dir.
// NOTE(review): 'in' (InodeRef) declaration and error-return lines are
// on lines dropped from this extract.
int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
  std::lock_guard l(client_lock);
  filepath path(relpath);
  int r = path_walk(path, &in, perm);
  if (cct->_conf->client_permissions) {
    r = may_create(in.get(), perm);
  Inode *snapdir = open_snapdir(in.get());
  return _mkdir(snapdir, name, 0, perm);
// Remove snapshot 'name' of the directory at 'relpath': resolve the
// dir, check delete permission (if client_permissions), then rmdir the
// snap name inside the directory's virtual .snap dir.
// NOTE(review): 'in' declaration and error-return lines are on lines
// dropped from this extract.
int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
  std::lock_guard l(client_lock);
  filepath path(relpath);
  int r = path_walk(path, &in, perms);
  if (cct->_conf->client_permissions) {
    r = may_delete(in.get(), NULL, perms);
  Inode *snapdir = open_snapdir(in.get());
  return _rmdir(snapdir, name, perms);
10700 // =============================
// Return the cap bits currently issued for the inode behind 'fd'.
int Client::get_caps_issued(int fd) {
  std::lock_guard lock(client_lock);
  Fh *f = get_filehandle(fd);
  return f->inode->caps_issued();
// Return the cap bits currently issued for the inode at 'path'
// ('p' and 'in' are declared on lines dropped from this extract).
int Client::get_caps_issued(const char *path, const UserPerm& perms)
  std::lock_guard lock(client_lock);
  int r = path_walk(p, &in, perms, true);
  return in->caps_issued();
10732 // =========================================
// Return (creating on first use) the virtual ".snap" directory inode
// for 'diri': a synthetic inode with snapid CEPH_SNAPDIR that mirrors
// the parent directory's attributes and is cached in inode_map.
// NOTE(review): 'in' declaration and the if/else structure around the
// cache hit/miss are partially on lines dropped from this extract.
Inode *Client::open_snapdir(Inode *diri)
  vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
  if (!inode_map.count(vino)) {
    // cache miss: synthesize the snapdir inode from the parent dir
    in = new Inode(this, vino, &diri->layout);
    in->ino = diri->ino;
    in->snapid = CEPH_SNAPDIR;
    in->mode = diri->mode;
    in->uid = diri->uid;
    in->gid = diri->gid;
    in->mtime = diri->mtime;
    in->ctime = diri->ctime;
    in->btime = diri->btime;
    in->size = diri->size;
    in->change_attr = diri->change_attr;
    in->dirfragtree.clear();
    in->snapdir_parent = diri;
    diri->flags |= I_SNAPDIR_OPEN;
    inode_map[vino] = in;
    if (use_faked_inos())
      _assign_faked_ino(in);
    ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
    // cache hit: reuse the existing snapdir inode
    in = inode_map[vino];
    ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
// Low-level lookup of 'name' under 'parent': optional permission check
// (skipped for "." / ".." and when FUSE does default permissions), then
// _lookup() and fill *attr / *out from the result.
// NOTE(review): 'r' and 'in' declarations plus error/out-assignment lines
// are on lines dropped from this extract.
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
  std::lock_guard lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << name << std::endl;
  if (!fuse_default_permissions) {
    if (strcmp(name, ".") && strcmp(name, "..")) {   // "."/".." need no lookup perm
      r = may_lookup(parent, perms);
  string dname(name);
  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  fill_stat(in, attr);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name
		<< " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
// Look up an inode by number and ensure it has a primary dentry:
// (1) fetch the inode, (2) fetch its parent, (3) fetch its name from the
// parent — dropping the temporary ll refs (_ll_forget) on error/exit.
// NOTE(review): the out-parameter 'Inode **inode' in the signature,
// 'parent' declaration, and the error-return lines are on lines dropped
// from this extract.
int Client::ll_lookup_inode(
    struct inodeno_t ino,
    const UserPerm& perms,
  ceph_assert(inode != NULL);
  std::lock_guard lock(client_lock);
  ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;

  // Num1: get inode and *inode
  int r = _lookup_ino(ino, perms, inode);
  ceph_assert(*inode != NULL);

  if (!(*inode)->dentries.empty()) {
    ldout(cct, 8) << __func__ << " dentry already present" << dendl;
  if ((*inode)->is_root()) {
    ldout(cct, 8) << "ino is root, no parent" << dendl;

  // Num2: Request the parent inode, so that we can look up the name
  r = _lookup_parent(*inode, perms, &parent);
    _ll_forget(*inode, 1);          // drop our ref on failure
  ceph_assert(parent != NULL);

  // Num3: Finally, get the name (dentry) of the requested inode
  r = _lookup_name(*inode, parent, perms);
    // Unexpected error
    _ll_forget(parent, 1);
    _ll_forget(*inode, 1);
  _ll_forget(parent, 1);            // parent ref no longer needed
// statx variant of ll_lookup: lookup 'name' under 'parent' with the cap
// mask derived from want/flags, filling *stx and *out.
// NOTE(review): 'r' and 'in' declarations plus error/out-assignment lines
// are on lines dropped from this extract.
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
  std::lock_guard lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;
  if (!fuse_default_permissions) {
    r = may_lookup(parent, perms);
  string dname(name);
  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  fill_statx(in, mask, stx);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name
		<< " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
// Walk an arbitrary path 'name' (honoring AT_SYMLINK_NOFOLLOW) and fill
// *stx / *out for the resulting inode.
// NOTE(review): 'rc' and 'in' declarations plus the error branch (which
// zeroes the statx mask) are on lines dropped from this extract.
int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
		    unsigned int want, unsigned int flags, const UserPerm& perms)
  std::lock_guard lock(client_lock);
  filepath fp(name, 0);
  unsigned mask = statx_to_mask(flags, want);
  ldout(cct, 3) << __func__ << " " << name << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << name << std::endl;
  rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
  /* zero out mask, just in case... */
  fill_statx(in, mask, stx);
// Take a low-level (FUSE-visible) reference on 'in'. On the 0->1
// transition, pin the directory's single dentry and count a reference
// against the inode's snapshot id.
// NOTE(review): the actual ll_ref increment is on a line dropped from
// this extract.
void Client::_ll_get(Inode *in)
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    if (in->snapid != CEPH_NOSNAP)
      ll_snap_ref[in->snapid]++;     // track per-snapshot ll references
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
// Drop 'num' low-level references from 'in'. When the count reaches
// zero, unpin the directory's dentry and release the per-snapshot ref.
// NOTE(review): the ll_ref decrement and the return of the remaining
// count are on lines dropped from this extract.
int Client::_ll_put(Inode *in, uint64_t num)
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    if (in->snapid != CEPH_NOSNAP) {
      auto p = ll_snap_ref.find(in->snapid);
      ceph_assert(p != ll_snap_ref.end());
      ceph_assert(p->second > 0);
      if (--p->second == 0)
	ll_snap_ref.erase(p);        // last ll ref on this snapshot
// Drop every outstanding low-level reference on every cached inode
// (used at unmount). InodeRefs collected in 'to_be_put' keep the inodes
// alive until the set is destroyed at scope exit, so _ll_put() cannot
// invalidate the iterator mid-walk.
void Client::_ll_drop_pins()
  ldout(cct, 10) << __func__ << dendl;
  std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
    Inode *in = it->second;
    to_be_put.insert(in);         // hold a strong ref past the _ll_put
    _ll_put(in, in->ll_ref);      // drop all ll refs at once
// Handle a FUSE "forget": drop 'count' low-level refs from 'in'.
// Forgets on the root inode (ino 1) are ignored; an over-count is
// logged and clamped to the refs actually held.
// NOTE(review): the mount-state early return and the final return value
// are on lines dropped from this extract.
bool Client::_ll_forget(Inode *in, uint64_t count)
  inodeno_t ino = in->ino;
  ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  if (ino == 1) return true;  // ignore forget on root.

  if (in->ll_ref < count) {
    // kernel asked us to drop more refs than we hold — clamp and warn
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
  if (_ll_put(in, count) == 0)
// Public forget: take client_lock, then delegate to _ll_forget().
bool Client::ll_forget(Inode *in, uint64_t count)
  std::lock_guard lock(client_lock);
  return _ll_forget(in, count);
// Drop a single low-level reference.
bool Client::ll_put(Inode *in)
  /* ll_forget already takes the lock */
  return ll_forget(in, 1);
// Return the low-level reference count held against snapshot 'snap'
// (the found/not-found returns are on lines dropped from this extract).
int Client::ll_get_snap_ref(snapid_t snap)
  std::lock_guard lock(client_lock);
  auto p = ll_snap_ref.find(snap);
  if (p != ll_snap_ref.end())
// Return the snapshot id of 'in' (the return statement is on a line
// dropped from this extract).
snapid_t Client::ll_get_snapid(Inode *in)
  std::lock_guard lock(client_lock);
// Look up a cached inode by (possibly faked) ino_t; takes a low-level
// ref on hit.  The _ll_get()/return lines are on lines dropped from
// this extract.
Inode *Client::ll_get_inode(ino_t ino)
  std::lock_guard lock(client_lock);
  vinodeno_t vino = _map_faked_ino(ino);   // translate faked ino back to vinodeno
  unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
  if (p == inode_map.end())
  Inode *in = p->second;
// Look up a cached inode by vinodeno_t; takes a low-level ref on hit.
// The _ll_get()/return lines are on lines dropped from this extract.
Inode *Client::ll_get_inode(vinodeno_t vino)
  std::lock_guard lock(client_lock);
  unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
  if (p == inode_map.end())
  Inode *in = p->second;
// Shared getattr helper for the ll_getattr* entry points: log/trace the
// call, then fetch attributes via _getattr(). The branch body for
// snapshot inodes (vino.snapid < CEPH_NOSNAP) is on a line dropped from
// this extract.
int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
  vinodeno_t vino = _get_vino(in);
  ldout(cct, 8) << __func__ << " " << vino << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  if (vino.snapid < CEPH_NOSNAP)
  return _getattr(in, caps, perms);
// Low-level getattr: fetch all inode caps, then fill *attr.
int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
  std::lock_guard lock(client_lock);
  int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
  fill_stat(in, attr);
  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
// statx variant of ll_getattr: only calls into the MDS when the derived
// cap mask isn't already satisfied by issued caps.
// NOTE(review): 'res' is declared on a line dropped from this extract.
int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
			unsigned int flags, const UserPerm& perms)
  std::lock_guard lock(client_lock);
  unsigned mask = statx_to_mask(flags, want);
  if (mask && !in->caps_issued_mask(mask, true))
    res = _ll_getattr(in, mask, perms);
  fill_statx(in, mask, stx);
  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
// Shared setattr helper: trace all incoming statx fields, optionally
// enforce permissions (may_setattr), strip the *_NOW convenience bits,
// and apply the change via __setattrx().
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
  vinodeno_t vino = _get_vino(in);
  ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;
  if (!fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
  // *_NOW bits are resolved to explicit times before reaching here
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
  return __setattrx(in, stx, mask, perms, inp);
// Low-level statx-style setattr: apply the change, assert the target
// inode didn't change identity, and refresh *stx from the result.
int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
  std::lock_guard lock(client_lock);
  InodeRef target(in);    // keep 'in' alive across the setattr
  int res = _ll_setattrx(in, stx, mask, perms, &target);
    ceph_assert(in == target.get());
    fill_statx(in, in->caps_issued(), stx);
  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
// Low-level stat-based setattr: convert to statx, delegate to
// _ll_setattrx(), and refresh *attr on success.
int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
		       const UserPerm& perms)
  struct ceph_statx stx;
  stat_to_statx(attr, &stx);
  std::lock_guard lock(client_lock);
  InodeRef target(in);    // keep 'in' alive across the setattr
  int res = _ll_setattrx(in, &stx, mask, perms, &target);
    ceph_assert(in == target.get());
    fill_stat(in, attr);
  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
// getxattr(2): resolve 'path' following symlinks, then read the xattr
// ('in' is declared on a line dropped from this extract).
int Client::getxattr(const char *path, const char *name, void *value, size_t size,
		     const UserPerm& perms)
  std::lock_guard lock(client_lock);
  int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
  return _getxattr(in, name, value, size, perms);
// lgetxattr(2): like getxattr but does NOT follow a trailing symlink.
int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
		      const UserPerm& perms)
  std::lock_guard lock(client_lock);
  int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
  return _getxattr(in, name, value, size, perms);
// fgetxattr(2): read an xattr via an open file descriptor.
int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
		      const UserPerm& perms)
  std::lock_guard lock(client_lock);
  Fh *f = get_filehandle(fd);
  return _getxattr(f->inode, name, value, size, perms);
// listxattr(2): resolve 'path' following symlinks, then list xattrs.
int Client::listxattr(const char *path, char *list, size_t size,
		      const UserPerm& perms)
  std::lock_guard lock(client_lock);
  int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
  return Client::_listxattr(in.get(), list, size, perms);
// llistxattr(2): like listxattr but does NOT follow a trailing symlink.
int Client::llistxattr(const char *path, char *list, size_t size,
		       const UserPerm& perms)
  std::lock_guard lock(client_lock);
  int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
  return Client::_listxattr(in.get(), list, size, perms);
// flistxattr(2): list xattrs via an open file descriptor.
int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
  std::lock_guard lock(client_lock);
  Fh *f = get_filehandle(fd);
  return Client::_listxattr(f->inode.get(), list, size, perms);
// removexattr(2): resolve 'path' following symlinks, then remove.
int Client::removexattr(const char *path, const char *name,
			const UserPerm& perms)
  std::lock_guard lock(client_lock);
  int r = Client::path_walk(path, &in, perms, true);
  return _removexattr(in, name, perms);
11304 int Client::lremovexattr(const char *path
, const char *name
,
11305 const UserPerm
& perms
)
11307 std::lock_guard
lock(client_lock
);
11313 int r
= Client::path_walk(path
, &in
, perms
, false);
11316 return _removexattr(in
, name
, perms
);
11319 int Client::fremovexattr(int fd
, const char *name
, const UserPerm
& perms
)
11321 std::lock_guard
lock(client_lock
);
11326 Fh
*f
= get_filehandle(fd
);
11329 return _removexattr(f
->inode
, name
, perms
);
11332 int Client::setxattr(const char *path
, const char *name
, const void *value
,
11333 size_t size
, int flags
, const UserPerm
& perms
)
11335 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11337 std::lock_guard
lock(client_lock
);
11343 int r
= Client::path_walk(path
, &in
, perms
, true);
11346 return _setxattr(in
, name
, value
, size
, flags
, perms
);
11349 int Client::lsetxattr(const char *path
, const char *name
, const void *value
,
11350 size_t size
, int flags
, const UserPerm
& perms
)
11352 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11354 std::lock_guard
lock(client_lock
);
11360 int r
= Client::path_walk(path
, &in
, perms
, false);
11363 return _setxattr(in
, name
, value
, size
, flags
, perms
);
11366 int Client::fsetxattr(int fd
, const char *name
, const void *value
, size_t size
,
11367 int flags
, const UserPerm
& perms
)
11369 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11371 std::lock_guard
lock(client_lock
);
11376 Fh
*f
= get_filehandle(fd
);
11379 return _setxattr(f
->inode
, name
, value
, size
, flags
, perms
);
11382 int Client::_getxattr(Inode
*in
, const char *name
, void *value
, size_t size
,
11383 const UserPerm
& perms
)
11387 const VXattr
*vxattr
= _match_vxattr(in
, name
);
11391 // Do a force getattr to get the latest quota before returning
11392 // a value to userspace.
11394 if (vxattr
->flags
& VXATTR_RSTAT
) {
11395 flags
|= CEPH_STAT_RSTAT
;
11397 r
= _getattr(in
, flags
, perms
, true);
11399 // Error from getattr!
11403 // call pointer-to-member function
11405 if (!(vxattr
->exists_cb
&& !(this->*(vxattr
->exists_cb
))(in
))) {
11406 r
= (this->*(vxattr
->getxattr_cb
))(in
, buf
, sizeof(buf
));
11412 if (r
> (int)size
) {
11414 } else if (r
> 0) {
11415 memcpy(value
, buf
, r
);
11421 if (acl_type
== NO_ACL
&& !strncmp(name
, "system.", 7)) {
11426 r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
11430 if (in
->xattrs
.count(n
)) {
11431 r
= in
->xattrs
[n
].length();
11432 if (r
> 0 && size
!= 0) {
11433 if (size
>= (unsigned)r
)
11434 memcpy(value
, in
->xattrs
[n
].c_str(), r
);
11441 ldout(cct
, 8) << "_getxattr(" << in
->ino
<< ", \"" << name
<< "\", " << size
<< ") = " << r
<< dendl
;
11445 int Client::_getxattr(InodeRef
&in
, const char *name
, void *value
, size_t size
,
11446 const UserPerm
& perms
)
11448 if (cct
->_conf
->client_permissions
) {
11449 int r
= xattr_permission(in
.get(), name
, MAY_READ
, perms
);
11453 return _getxattr(in
.get(), name
, value
, size
, perms
);
11456 int Client::ll_getxattr(Inode
*in
, const char *name
, void *value
,
11457 size_t size
, const UserPerm
& perms
)
11459 std::lock_guard
lock(client_lock
);
11464 vinodeno_t vino
= _get_vino(in
);
11466 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
11467 tout(cct
) << __func__
<< std::endl
;
11468 tout(cct
) << vino
.ino
.val
<< std::endl
;
11469 tout(cct
) << name
<< std::endl
;
11471 if (!fuse_default_permissions
) {
11472 int r
= xattr_permission(in
, name
, MAY_READ
, perms
);
11477 return _getxattr(in
, name
, value
, size
, perms
);
11480 int Client::_listxattr(Inode
*in
, char *name
, size_t size
,
11481 const UserPerm
& perms
)
11483 bool len_only
= (size
== 0);
11484 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
11490 for (const auto& p
: in
->xattrs
) {
11491 size_t this_len
= p
.first
.length() + 1;
11496 if (this_len
> size
) {
11501 memcpy(name
, p
.first
.c_str(), this_len
);
11506 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ", " << size
<< ") = " << r
<< dendl
;
11510 int Client::ll_listxattr(Inode
*in
, char *names
, size_t size
,
11511 const UserPerm
& perms
)
11513 std::lock_guard
lock(client_lock
);
11518 vinodeno_t vino
= _get_vino(in
);
11520 ldout(cct
, 3) << __func__
<< " " << vino
<< " size " << size
<< dendl
;
11521 tout(cct
) << __func__
<< std::endl
;
11522 tout(cct
) << vino
.ino
.val
<< std::endl
;
11523 tout(cct
) << size
<< std::endl
;
11525 return _listxattr(in
, names
, size
, perms
);
11528 int Client::_do_setxattr(Inode
*in
, const char *name
, const void *value
,
11529 size_t size
, int flags
, const UserPerm
& perms
)
11532 int xattr_flags
= 0;
11534 xattr_flags
|= CEPH_XATTR_REMOVE
;
11535 if (flags
& XATTR_CREATE
)
11536 xattr_flags
|= CEPH_XATTR_CREATE
;
11537 if (flags
& XATTR_REPLACE
)
11538 xattr_flags
|= CEPH_XATTR_REPLACE
;
11540 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETXATTR
);
11542 in
->make_nosnap_relative_path(path
);
11543 req
->set_filepath(path
);
11544 req
->set_string2(name
);
11545 req
->set_inode(in
);
11546 req
->head
.args
.setxattr
.flags
= xattr_flags
;
11549 assert (value
|| size
== 0);
11550 bl
.append((const char*)value
, size
);
11553 int res
= make_request(req
, perms
);
11556 ldout(cct
, 3) << __func__
<< "(" << in
->ino
<< ", \"" << name
<< "\") = " <<
11561 int Client::_setxattr(Inode
*in
, const char *name
, const void *value
,
11562 size_t size
, int flags
, const UserPerm
& perms
)
11564 if (in
->snapid
!= CEPH_NOSNAP
) {
11568 bool posix_acl_xattr
= false;
11569 if (acl_type
== POSIX_ACL
)
11570 posix_acl_xattr
= !strncmp(name
, "system.", 7);
11572 if (strncmp(name
, "user.", 5) &&
11573 strncmp(name
, "security.", 9) &&
11574 strncmp(name
, "trusted.", 8) &&
11575 strncmp(name
, "ceph.", 5) &&
11577 return -EOPNOTSUPP
;
11579 bool check_realm
= false;
11581 if (posix_acl_xattr
) {
11582 if (!strcmp(name
, ACL_EA_ACCESS
)) {
11583 mode_t new_mode
= in
->mode
;
11585 int ret
= posix_acl_equiv_mode(value
, size
, &new_mode
);
11592 if (new_mode
!= in
->mode
) {
11593 struct ceph_statx stx
;
11594 stx
.stx_mode
= new_mode
;
11595 ret
= _do_setattr(in
, &stx
, CEPH_SETATTR_MODE
, perms
, NULL
);
11600 } else if (!strcmp(name
, ACL_EA_DEFAULT
)) {
11602 if (!S_ISDIR(in
->mode
))
11604 int ret
= posix_acl_check(value
, size
);
11613 return -EOPNOTSUPP
;
11616 const VXattr
*vxattr
= _match_vxattr(in
, name
);
11618 if (vxattr
->readonly
)
11619 return -EOPNOTSUPP
;
11620 if (vxattr
->name
.compare(0, 10, "ceph.quota") == 0 && value
)
11621 check_realm
= true;
11625 int ret
= _do_setxattr(in
, name
, value
, size
, flags
, perms
);
11626 if (ret
>= 0 && check_realm
) {
11627 // check if snaprealm was created for quota inode
11628 if (in
->quota
.is_enable() &&
11629 !(in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
))
11636 int Client::_setxattr(InodeRef
&in
, const char *name
, const void *value
,
11637 size_t size
, int flags
, const UserPerm
& perms
)
11639 if (cct
->_conf
->client_permissions
) {
11640 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
11644 return _setxattr(in
.get(), name
, value
, size
, flags
, perms
);
11647 int Client::_setxattr_check_data_pool(string
& name
, string
& value
, const OSDMap
*osdmap
)
11650 if (name
== "layout") {
11651 string::iterator begin
= value
.begin();
11652 string::iterator end
= value
.end();
11653 keys_and_values
<string::iterator
> p
; // create instance of parser
11654 std::map
<string
, string
> m
; // map to receive results
11655 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
11660 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
11661 if (q
->first
== "pool") {
11666 } else if (name
== "layout.pool") {
11670 if (tmp
.length()) {
11673 pool
= boost::lexical_cast
<unsigned>(tmp
);
11674 if (!osdmap
->have_pg_pool(pool
))
11676 } catch (boost::bad_lexical_cast
const&) {
11677 pool
= osdmap
->lookup_pg_pool_name(tmp
);
11687 void Client::_setxattr_maybe_wait_for_osdmap(const char *name
, const void *value
, size_t size
)
11689 // For setting pool of layout, MetaRequest need osdmap epoch.
11690 // There is a race which create a new data pool but client and mds both don't have.
11691 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11692 if (strcmp(name
, "ceph.file.layout.pool") == 0 || strcmp(name
, "ceph.dir.layout.pool") == 0 ||
11693 strcmp(name
, "ceph.file.layout") == 0 || strcmp(name
, "ceph.dir.layout") == 0) {
11694 string
rest(strstr(name
, "layout"));
11695 string
v((const char*)value
, size
);
11696 int r
= objecter
->with_osdmap([&](const OSDMap
& o
) {
11697 return _setxattr_check_data_pool(rest
, v
, &o
);
11700 if (r
== -ENOENT
) {
11702 objecter
->wait_for_latest_osdmap(&ctx
);
11708 int Client::ll_setxattr(Inode
*in
, const char *name
, const void *value
,
11709 size_t size
, int flags
, const UserPerm
& perms
)
11711 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11713 std::lock_guard
lock(client_lock
);
11718 vinodeno_t vino
= _get_vino(in
);
11720 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
11721 tout(cct
) << __func__
<< std::endl
;
11722 tout(cct
) << vino
.ino
.val
<< std::endl
;
11723 tout(cct
) << name
<< std::endl
;
11725 if (!fuse_default_permissions
) {
11726 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
11730 return _setxattr(in
, name
, value
, size
, flags
, perms
);
11733 int Client::_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
11735 if (in
->snapid
!= CEPH_NOSNAP
) {
11739 // same xattrs supported by kernel client
11740 if (strncmp(name
, "user.", 5) &&
11741 strncmp(name
, "system.", 7) &&
11742 strncmp(name
, "security.", 9) &&
11743 strncmp(name
, "trusted.", 8) &&
11744 strncmp(name
, "ceph.", 5))
11745 return -EOPNOTSUPP
;
11747 const VXattr
*vxattr
= _match_vxattr(in
, name
);
11748 if (vxattr
&& vxattr
->readonly
)
11749 return -EOPNOTSUPP
;
11751 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_RMXATTR
);
11753 in
->make_nosnap_relative_path(path
);
11754 req
->set_filepath(path
);
11755 req
->set_filepath2(name
);
11756 req
->set_inode(in
);
11758 int res
= make_request(req
, perms
);
11761 ldout(cct
, 8) << "_removexattr(" << in
->ino
<< ", \"" << name
<< "\") = " << res
<< dendl
;
11765 int Client::_removexattr(InodeRef
&in
, const char *name
, const UserPerm
& perms
)
11767 if (cct
->_conf
->client_permissions
) {
11768 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
11772 return _removexattr(in
.get(), name
, perms
);
11775 int Client::ll_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
11777 std::lock_guard
lock(client_lock
);
11782 vinodeno_t vino
= _get_vino(in
);
11784 ldout(cct
, 3) << "ll_removexattr " << vino
<< " " << name
<< dendl
;
11785 tout(cct
) << "ll_removexattr" << std::endl
;
11786 tout(cct
) << vino
.ino
.val
<< std::endl
;
11787 tout(cct
) << name
<< std::endl
;
11789 if (!fuse_default_permissions
) {
11790 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
11795 return _removexattr(in
, name
, perms
);
11798 bool Client::_vxattrcb_quota_exists(Inode
*in
)
11800 return in
->quota
.is_enable() &&
11801 in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
;
11803 size_t Client::_vxattrcb_quota(Inode
*in
, char *val
, size_t size
)
11805 return snprintf(val
, size
,
11806 "max_bytes=%lld max_files=%lld",
11807 (long long int)in
->quota
.max_bytes
,
11808 (long long int)in
->quota
.max_files
);
11810 size_t Client::_vxattrcb_quota_max_bytes(Inode
*in
, char *val
, size_t size
)
11812 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_bytes
);
11814 size_t Client::_vxattrcb_quota_max_files(Inode
*in
, char *val
, size_t size
)
11816 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_files
);
11819 bool Client::_vxattrcb_layout_exists(Inode
*in
)
11821 return in
->layout
!= file_layout_t();
11823 size_t Client::_vxattrcb_layout(Inode
*in
, char *val
, size_t size
)
11825 int r
= snprintf(val
, size
,
11826 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
11827 (unsigned long long)in
->layout
.stripe_unit
,
11828 (unsigned long long)in
->layout
.stripe_count
,
11829 (unsigned long long)in
->layout
.object_size
);
11830 objecter
->with_osdmap([&](const OSDMap
& o
) {
11831 if (o
.have_pg_pool(in
->layout
.pool_id
))
11832 r
+= snprintf(val
+ r
, size
- r
, "%s",
11833 o
.get_pool_name(in
->layout
.pool_id
).c_str());
11835 r
+= snprintf(val
+ r
, size
- r
, "%" PRIu64
,
11836 (uint64_t)in
->layout
.pool_id
);
11838 if (in
->layout
.pool_ns
.length())
11839 r
+= snprintf(val
+ r
, size
- r
, " pool_namespace=%s",
11840 in
->layout
.pool_ns
.c_str());
11843 size_t Client::_vxattrcb_layout_stripe_unit(Inode
*in
, char *val
, size_t size
)
11845 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_unit
);
11847 size_t Client::_vxattrcb_layout_stripe_count(Inode
*in
, char *val
, size_t size
)
11849 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_count
);
11851 size_t Client::_vxattrcb_layout_object_size(Inode
*in
, char *val
, size_t size
)
11853 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.object_size
);
11855 size_t Client::_vxattrcb_layout_pool(Inode
*in
, char *val
, size_t size
)
11858 objecter
->with_osdmap([&](const OSDMap
& o
) {
11859 if (o
.have_pg_pool(in
->layout
.pool_id
))
11860 r
= snprintf(val
, size
, "%s", o
.get_pool_name(
11861 in
->layout
.pool_id
).c_str());
11863 r
= snprintf(val
, size
, "%" PRIu64
, (uint64_t)in
->layout
.pool_id
);
11867 size_t Client::_vxattrcb_layout_pool_namespace(Inode
*in
, char *val
, size_t size
)
11869 return snprintf(val
, size
, "%s", in
->layout
.pool_ns
.c_str());
11871 size_t Client::_vxattrcb_dir_entries(Inode
*in
, char *val
, size_t size
)
11873 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->dirstat
.nfiles
+ in
->dirstat
.nsubdirs
));
11875 size_t Client::_vxattrcb_dir_files(Inode
*in
, char *val
, size_t size
)
11877 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nfiles
);
11879 size_t Client::_vxattrcb_dir_subdirs(Inode
*in
, char *val
, size_t size
)
11881 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nsubdirs
);
11883 size_t Client::_vxattrcb_dir_rentries(Inode
*in
, char *val
, size_t size
)
11885 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->rstat
.rfiles
+ in
->rstat
.rsubdirs
));
11887 size_t Client::_vxattrcb_dir_rfiles(Inode
*in
, char *val
, size_t size
)
11889 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rfiles
);
11891 size_t Client::_vxattrcb_dir_rsubdirs(Inode
*in
, char *val
, size_t size
)
11893 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rsubdirs
);
11895 size_t Client::_vxattrcb_dir_rbytes(Inode
*in
, char *val
, size_t size
)
11897 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rbytes
);
11899 size_t Client::_vxattrcb_dir_rctime(Inode
*in
, char *val
, size_t size
)
11901 return snprintf(val
, size
, "%ld.%09ld", (long)in
->rstat
.rctime
.sec(),
11902 (long)in
->rstat
.rctime
.nsec());
11904 bool Client::_vxattrcb_dir_pin_exists(Inode
*in
)
11906 return in
->dir_pin
!= -ENODATA
;
11908 size_t Client::_vxattrcb_dir_pin(Inode
*in
, char *val
, size_t size
)
11910 return snprintf(val
, size
, "%ld", (long)in
->dir_pin
);
11913 bool Client::_vxattrcb_snap_btime_exists(Inode
*in
)
11915 return !in
->snap_btime
.is_zero();
11918 size_t Client::_vxattrcb_snap_btime(Inode
*in
, char *val
, size_t size
)
11920 return snprintf(val
, size
, "%llu.%09lu",
11921 (long long unsigned)in
->snap_btime
.sec(),
11922 (long unsigned)in
->snap_btime
.nsec());
11925 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
11926 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
11928 #define XATTR_NAME_CEPH(_type, _name) \
11930 name: CEPH_XATTR_NAME(_type, _name), \
11931 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11936 #define XATTR_NAME_CEPH2(_type, _name, _flags) \
11938 name: CEPH_XATTR_NAME(_type, _name), \
11939 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11944 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
11946 name: CEPH_XATTR_NAME2(_type, _name, _field), \
11947 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
11949 exists_cb: &Client::_vxattrcb_layout_exists, \
11952 #define XATTR_QUOTA_FIELD(_type, _name) \
11954 name: CEPH_XATTR_NAME(_type, _name), \
11955 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11957 exists_cb: &Client::_vxattrcb_quota_exists, \
11961 const Client::VXattr
Client::_dir_vxattrs
[] = {
11963 name
: "ceph.dir.layout",
11964 getxattr_cb
: &Client::_vxattrcb_layout
,
11966 exists_cb
: &Client::_vxattrcb_layout_exists
,
11969 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_unit
),
11970 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_count
),
11971 XATTR_LAYOUT_FIELD(dir
, layout
, object_size
),
11972 XATTR_LAYOUT_FIELD(dir
, layout
, pool
),
11973 XATTR_LAYOUT_FIELD(dir
, layout
, pool_namespace
),
11974 XATTR_NAME_CEPH(dir
, entries
),
11975 XATTR_NAME_CEPH(dir
, files
),
11976 XATTR_NAME_CEPH(dir
, subdirs
),
11977 XATTR_NAME_CEPH2(dir
, rentries
, VXATTR_RSTAT
),
11978 XATTR_NAME_CEPH2(dir
, rfiles
, VXATTR_RSTAT
),
11979 XATTR_NAME_CEPH2(dir
, rsubdirs
, VXATTR_RSTAT
),
11980 XATTR_NAME_CEPH2(dir
, rbytes
, VXATTR_RSTAT
),
11981 XATTR_NAME_CEPH2(dir
, rctime
, VXATTR_RSTAT
),
11983 name
: "ceph.quota",
11984 getxattr_cb
: &Client::_vxattrcb_quota
,
11986 exists_cb
: &Client::_vxattrcb_quota_exists
,
11989 XATTR_QUOTA_FIELD(quota
, max_bytes
),
11990 XATTR_QUOTA_FIELD(quota
, max_files
),
11992 name
: "ceph.dir.pin",
11993 getxattr_cb
: &Client::_vxattrcb_dir_pin
,
11995 exists_cb
: &Client::_vxattrcb_dir_pin_exists
,
11999 name
: "ceph.snap.btime",
12000 getxattr_cb
: &Client::_vxattrcb_snap_btime
,
12002 exists_cb
: &Client::_vxattrcb_snap_btime_exists
,
12005 { name
: "" } /* Required table terminator */
12008 const Client::VXattr
Client::_file_vxattrs
[] = {
12010 name
: "ceph.file.layout",
12011 getxattr_cb
: &Client::_vxattrcb_layout
,
12013 exists_cb
: &Client::_vxattrcb_layout_exists
,
12016 XATTR_LAYOUT_FIELD(file
, layout
, stripe_unit
),
12017 XATTR_LAYOUT_FIELD(file
, layout
, stripe_count
),
12018 XATTR_LAYOUT_FIELD(file
, layout
, object_size
),
12019 XATTR_LAYOUT_FIELD(file
, layout
, pool
),
12020 XATTR_LAYOUT_FIELD(file
, layout
, pool_namespace
),
12022 name
: "ceph.snap.btime",
12023 getxattr_cb
: &Client::_vxattrcb_snap_btime
,
12025 exists_cb
: &Client::_vxattrcb_snap_btime_exists
,
12028 { name
: "" } /* Required table terminator */
12031 const Client::VXattr
*Client::_get_vxattrs(Inode
*in
)
12034 return _dir_vxattrs
;
12035 else if (in
->is_file())
12036 return _file_vxattrs
;
12040 const Client::VXattr
*Client::_match_vxattr(Inode
*in
, const char *name
)
12042 if (strncmp(name
, "ceph.", 5) == 0) {
12043 const VXattr
*vxattr
= _get_vxattrs(in
);
12045 while (!vxattr
->name
.empty()) {
12046 if (vxattr
->name
== name
)
12055 int Client::ll_readlink(Inode
*in
, char *buf
, size_t buflen
, const UserPerm
& perms
)
12057 std::lock_guard
lock(client_lock
);
12062 vinodeno_t vino
= _get_vino(in
);
12064 ldout(cct
, 3) << "ll_readlink " << vino
<< dendl
;
12065 tout(cct
) << "ll_readlink" << std::endl
;
12066 tout(cct
) << vino
.ino
.val
<< std::endl
;
12068 for (auto dn
: in
->dentries
) {
12072 int r
= _readlink(in
, buf
, buflen
); // FIXME: no permission checking!
12073 ldout(cct
, 3) << "ll_readlink " << vino
<< " = " << r
<< dendl
;
12077 int Client::_mknod(Inode
*dir
, const char *name
, mode_t mode
, dev_t rdev
,
12078 const UserPerm
& perms
, InodeRef
*inp
)
12080 ldout(cct
, 8) << "_mknod(" << dir
->ino
<< " " << name
<< ", 0" << oct
12081 << mode
<< dec
<< ", " << rdev
<< ", uid " << perms
.uid()
12082 << ", gid " << perms
.gid() << ")" << dendl
;
12084 if (strlen(name
) > NAME_MAX
)
12085 return -ENAMETOOLONG
;
12087 if (dir
->snapid
!= CEPH_NOSNAP
) {
12090 if (is_quota_files_exceeded(dir
, perms
)) {
12094 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_MKNOD
);
12097 dir
->make_nosnap_relative_path(path
);
12098 path
.push_dentry(name
);
12099 req
->set_filepath(path
);
12100 req
->set_inode(dir
);
12101 req
->head
.args
.mknod
.rdev
= rdev
;
12102 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12103 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12105 bufferlist xattrs_bl
;
12106 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
12109 req
->head
.args
.mknod
.mode
= mode
;
12110 if (xattrs_bl
.length() > 0)
12111 req
->set_data(xattrs_bl
);
12114 res
= get_or_create(dir
, name
, &de
);
12117 req
->set_dentry(de
);
12119 res
= make_request(req
, perms
, inp
);
12123 ldout(cct
, 8) << "mknod(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
12131 int Client::ll_mknod(Inode
*parent
, const char *name
, mode_t mode
,
12132 dev_t rdev
, struct stat
*attr
, Inode
**out
,
12133 const UserPerm
& perms
)
12135 std::lock_guard
lock(client_lock
);
12140 vinodeno_t vparent
= _get_vino(parent
);
12142 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
<< dendl
;
12143 tout(cct
) << "ll_mknod" << std::endl
;
12144 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12145 tout(cct
) << name
<< std::endl
;
12146 tout(cct
) << mode
<< std::endl
;
12147 tout(cct
) << rdev
<< std::endl
;
12149 if (!fuse_default_permissions
) {
12150 int r
= may_create(parent
, perms
);
12156 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
12158 fill_stat(in
, attr
);
12161 tout(cct
) << attr
->st_ino
<< std::endl
;
12162 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
12163 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
12168 int Client::ll_mknodx(Inode
*parent
, const char *name
, mode_t mode
,
12169 dev_t rdev
, Inode
**out
,
12170 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
12171 const UserPerm
& perms
)
12173 unsigned caps
= statx_to_mask(flags
, want
);
12174 std::lock_guard
lock(client_lock
);
12179 vinodeno_t vparent
= _get_vino(parent
);
12181 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
<< dendl
;
12182 tout(cct
) << "ll_mknodx" << std::endl
;
12183 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12184 tout(cct
) << name
<< std::endl
;
12185 tout(cct
) << mode
<< std::endl
;
12186 tout(cct
) << rdev
<< std::endl
;
12188 if (!fuse_default_permissions
) {
12189 int r
= may_create(parent
, perms
);
12195 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
12197 fill_statx(in
, caps
, stx
);
12200 tout(cct
) << stx
->stx_ino
<< std::endl
;
12201 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
12202 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12207 int Client::_create(Inode
*dir
, const char *name
, int flags
, mode_t mode
,
12208 InodeRef
*inp
, Fh
**fhp
, int stripe_unit
, int stripe_count
,
12209 int object_size
, const char *data_pool
, bool *created
,
12210 const UserPerm
& perms
)
12212 ldout(cct
, 8) << "_create(" << dir
->ino
<< " " << name
<< ", 0" << oct
<<
12213 mode
<< dec
<< ")" << dendl
;
12215 if (strlen(name
) > NAME_MAX
)
12216 return -ENAMETOOLONG
;
12217 if (dir
->snapid
!= CEPH_NOSNAP
) {
12220 if (is_quota_files_exceeded(dir
, perms
)) {
12224 // use normalized flags to generate cmode
12225 int cflags
= ceph_flags_sys2wire(flags
);
12226 if (cct
->_conf
.get_val
<bool>("client_force_lazyio"))
12227 cflags
|= CEPH_O_LAZY
;
12229 int cmode
= ceph_flags_to_mode(cflags
);
12231 int64_t pool_id
= -1;
12232 if (data_pool
&& *data_pool
) {
12233 pool_id
= objecter
->with_osdmap(
12234 std::mem_fn(&OSDMap::lookup_pg_pool_name
), data_pool
);
12237 if (pool_id
> 0xffffffffll
)
12238 return -ERANGE
; // bummer!
12241 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_CREATE
);
12244 dir
->make_nosnap_relative_path(path
);
12245 path
.push_dentry(name
);
12246 req
->set_filepath(path
);
12247 req
->set_inode(dir
);
12248 req
->head
.args
.open
.flags
= cflags
| CEPH_O_CREAT
;
12250 req
->head
.args
.open
.stripe_unit
= stripe_unit
;
12251 req
->head
.args
.open
.stripe_count
= stripe_count
;
12252 req
->head
.args
.open
.object_size
= object_size
;
12253 if (cct
->_conf
->client_debug_getattr_caps
)
12254 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
12256 req
->head
.args
.open
.mask
= 0;
12257 req
->head
.args
.open
.pool
= pool_id
;
12258 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12259 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12262 bufferlist xattrs_bl
;
12263 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
12266 req
->head
.args
.open
.mode
= mode
;
12267 if (xattrs_bl
.length() > 0)
12268 req
->set_data(xattrs_bl
);
12271 res
= get_or_create(dir
, name
, &de
);
12274 req
->set_dentry(de
);
12276 res
= make_request(req
, perms
, inp
, created
);
12281 /* If the caller passed a value in fhp, do the open */
12283 (*inp
)->get_open_ref(cmode
);
12284 *fhp
= _create_fh(inp
->get(), flags
, cmode
, perms
);
12290 ldout(cct
, 8) << "create(" << path
<< ", 0" << oct
<< mode
<< dec
12291 << " layout " << stripe_unit
12292 << ' ' << stripe_count
12293 << ' ' << object_size
12294 <<") = " << res
<< dendl
;
12303 int Client::_mkdir(Inode
*dir
, const char *name
, mode_t mode
, const UserPerm
& perm
,
12306 ldout(cct
, 8) << "_mkdir(" << dir
->ino
<< " " << name
<< ", 0" << oct
12307 << mode
<< dec
<< ", uid " << perm
.uid()
12308 << ", gid " << perm
.gid() << ")" << dendl
;
12310 if (strlen(name
) > NAME_MAX
)
12311 return -ENAMETOOLONG
;
12313 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
12316 if (is_quota_files_exceeded(dir
, perm
)) {
12319 MetaRequest
*req
= new MetaRequest(dir
->snapid
== CEPH_SNAPDIR
?
12320 CEPH_MDS_OP_MKSNAP
: CEPH_MDS_OP_MKDIR
);
12323 dir
->make_nosnap_relative_path(path
);
12324 path
.push_dentry(name
);
12325 req
->set_filepath(path
);
12326 req
->set_inode(dir
);
12327 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12328 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12331 bufferlist xattrs_bl
;
12332 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perm
);
12335 req
->head
.args
.mkdir
.mode
= mode
;
12336 if (xattrs_bl
.length() > 0)
12337 req
->set_data(xattrs_bl
);
12340 res
= get_or_create(dir
, name
, &de
);
12343 req
->set_dentry(de
);
12345 ldout(cct
, 10) << "_mkdir: making request" << dendl
;
12346 res
= make_request(req
, perm
, inp
);
12347 ldout(cct
, 10) << "_mkdir result is " << res
<< dendl
;
12351 ldout(cct
, 8) << "_mkdir(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
12359 int Client::ll_mkdir(Inode
*parent
, const char *name
, mode_t mode
,
12360 struct stat
*attr
, Inode
**out
, const UserPerm
& perm
)
12362 std::lock_guard
lock(client_lock
);
12367 vinodeno_t vparent
= _get_vino(parent
);
12369 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
<< dendl
;
12370 tout(cct
) << "ll_mkdir" << std::endl
;
12371 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12372 tout(cct
) << name
<< std::endl
;
12373 tout(cct
) << mode
<< std::endl
;
12375 if (!fuse_default_permissions
) {
12376 int r
= may_create(parent
, perm
);
12382 int r
= _mkdir(parent
, name
, mode
, perm
, &in
);
12384 fill_stat(in
, attr
);
12387 tout(cct
) << attr
->st_ino
<< std::endl
;
12388 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
12389 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
12394 int Client::ll_mkdirx(Inode
*parent
, const char *name
, mode_t mode
, Inode
**out
,
12395 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
12396 const UserPerm
& perms
)
12398 std::lock_guard
lock(client_lock
);
12403 vinodeno_t vparent
= _get_vino(parent
);
12405 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
<< dendl
;
12406 tout(cct
) << "ll_mkdirx" << std::endl
;
12407 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12408 tout(cct
) << name
<< std::endl
;
12409 tout(cct
) << mode
<< std::endl
;
12411 if (!fuse_default_permissions
) {
12412 int r
= may_create(parent
, perms
);
12418 int r
= _mkdir(parent
, name
, mode
, perms
, &in
);
12420 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
12426 tout(cct
) << stx
->stx_ino
<< std::endl
;
12427 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
12428 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12433 int Client::_symlink(Inode
*dir
, const char *name
, const char *target
,
12434 const UserPerm
& perms
, InodeRef
*inp
)
12436 ldout(cct
, 8) << "_symlink(" << dir
->ino
<< " " << name
<< ", " << target
12437 << ", uid " << perms
.uid() << ", gid " << perms
.gid() << ")"
12440 if (strlen(name
) > NAME_MAX
)
12441 return -ENAMETOOLONG
;
12443 if (dir
->snapid
!= CEPH_NOSNAP
) {
12446 if (is_quota_files_exceeded(dir
, perms
)) {
12450 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SYMLINK
);
12453 dir
->make_nosnap_relative_path(path
);
12454 path
.push_dentry(name
);
12455 req
->set_filepath(path
);
12456 req
->set_inode(dir
);
12457 req
->set_string2(target
);
12458 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12459 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12462 int res
= get_or_create(dir
, name
, &de
);
12465 req
->set_dentry(de
);
12467 res
= make_request(req
, perms
, inp
);
12470 ldout(cct
, 8) << "_symlink(\"" << path
<< "\", \"" << target
<< "\") = " <<
12479 int Client::ll_symlink(Inode
*parent
, const char *name
, const char *value
,
12480 struct stat
*attr
, Inode
**out
, const UserPerm
& perms
)
12482 std::lock_guard
lock(client_lock
);
12487 vinodeno_t vparent
= _get_vino(parent
);
12489 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
<< " -> " << value
12491 tout(cct
) << "ll_symlink" << std::endl
;
12492 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12493 tout(cct
) << name
<< std::endl
;
12494 tout(cct
) << value
<< std::endl
;
12496 if (!fuse_default_permissions
) {
12497 int r
= may_create(parent
, perms
);
12503 int r
= _symlink(parent
, name
, value
, perms
, &in
);
12505 fill_stat(in
, attr
);
12508 tout(cct
) << attr
->st_ino
<< std::endl
;
12509 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
12510 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
12515 int Client::ll_symlinkx(Inode
*parent
, const char *name
, const char *value
,
12516 Inode
**out
, struct ceph_statx
*stx
, unsigned want
,
12517 unsigned flags
, const UserPerm
& perms
)
12519 std::lock_guard
lock(client_lock
);
12524 vinodeno_t vparent
= _get_vino(parent
);
12526 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
<< " -> " << value
12528 tout(cct
) << "ll_symlinkx" << std::endl
;
12529 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12530 tout(cct
) << name
<< std::endl
;
12531 tout(cct
) << value
<< std::endl
;
12533 if (!fuse_default_permissions
) {
12534 int r
= may_create(parent
, perms
);
12540 int r
= _symlink(parent
, name
, value
, perms
, &in
);
12542 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
12545 tout(cct
) << stx
->stx_ino
<< std::endl
;
12546 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
12547 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12552 int Client::_unlink(Inode
*dir
, const char *name
, const UserPerm
& perm
)
12554 ldout(cct
, 8) << "_unlink(" << dir
->ino
<< " " << name
12555 << " uid " << perm
.uid() << " gid " << perm
.gid()
12558 if (dir
->snapid
!= CEPH_NOSNAP
) {
12562 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_UNLINK
);
12565 dir
->make_nosnap_relative_path(path
);
12566 path
.push_dentry(name
);
12567 req
->set_filepath(path
);
12573 int res
= get_or_create(dir
, name
, &de
);
12576 req
->set_dentry(de
);
12577 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12578 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12580 res
= _lookup(dir
, name
, 0, &otherin
, perm
);
12584 in
= otherin
.get();
12585 req
->set_other_inode(in
);
12586 in
->break_all_delegs();
12587 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
12589 req
->set_inode(dir
);
12591 res
= make_request(req
, perm
);
12594 ldout(cct
, 8) << "unlink(" << path
<< ") = " << res
<< dendl
;
12602 int Client::ll_unlink(Inode
*in
, const char *name
, const UserPerm
& perm
)
12604 std::lock_guard
lock(client_lock
);
12609 vinodeno_t vino
= _get_vino(in
);
12611 ldout(cct
, 3) << "ll_unlink " << vino
<< " " << name
<< dendl
;
12612 tout(cct
) << "ll_unlink" << std::endl
;
12613 tout(cct
) << vino
.ino
.val
<< std::endl
;
12614 tout(cct
) << name
<< std::endl
;
12616 if (!fuse_default_permissions
) {
12617 int r
= may_delete(in
, name
, perm
);
12621 return _unlink(in
, name
, perm
);
12624 int Client::_rmdir(Inode
*dir
, const char *name
, const UserPerm
& perms
)
12626 ldout(cct
, 8) << "_rmdir(" << dir
->ino
<< " " << name
<< " uid "
12627 << perms
.uid() << " gid " << perms
.gid() << ")" << dendl
;
12629 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
12633 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_RMSNAP
: CEPH_MDS_OP_RMDIR
;
12634 MetaRequest
*req
= new MetaRequest(op
);
12636 dir
->make_nosnap_relative_path(path
);
12637 path
.push_dentry(name
);
12638 req
->set_filepath(path
);
12639 req
->set_inode(dir
);
12641 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12642 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12643 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
12648 int res
= get_or_create(dir
, name
, &de
);
12651 if (op
== CEPH_MDS_OP_RMDIR
)
12652 req
->set_dentry(de
);
12656 res
= _lookup(dir
, name
, 0, &in
, perms
);
12660 if (op
== CEPH_MDS_OP_RMSNAP
) {
12661 unlink(de
, true, true);
12664 req
->set_other_inode(in
.get());
12666 res
= make_request(req
, perms
);
12669 ldout(cct
, 8) << "rmdir(" << path
<< ") = " << res
<< dendl
;
12677 int Client::ll_rmdir(Inode
*in
, const char *name
, const UserPerm
& perms
)
12679 std::lock_guard
lock(client_lock
);
12684 vinodeno_t vino
= _get_vino(in
);
12686 ldout(cct
, 3) << "ll_rmdir " << vino
<< " " << name
<< dendl
;
12687 tout(cct
) << "ll_rmdir" << std::endl
;
12688 tout(cct
) << vino
.ino
.val
<< std::endl
;
12689 tout(cct
) << name
<< std::endl
;
12691 if (!fuse_default_permissions
) {
12692 int r
= may_delete(in
, name
, perms
);
12697 return _rmdir(in
, name
, perms
);
12700 int Client::_rename(Inode
*fromdir
, const char *fromname
, Inode
*todir
, const char *toname
, const UserPerm
& perm
)
12702 ldout(cct
, 8) << "_rename(" << fromdir
->ino
<< " " << fromname
<< " to "
12703 << todir
->ino
<< " " << toname
12704 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")"
12707 if (fromdir
->snapid
!= todir
->snapid
)
12710 int op
= CEPH_MDS_OP_RENAME
;
12711 if (fromdir
->snapid
!= CEPH_NOSNAP
) {
12712 if (fromdir
== todir
&& fromdir
->snapid
== CEPH_SNAPDIR
)
12713 op
= CEPH_MDS_OP_RENAMESNAP
;
12719 MetaRequest
*req
= new MetaRequest(op
);
12722 fromdir
->make_nosnap_relative_path(from
);
12723 from
.push_dentry(fromname
);
12725 todir
->make_nosnap_relative_path(to
);
12726 to
.push_dentry(toname
);
12727 req
->set_filepath(to
);
12728 req
->set_filepath2(from
);
12731 int res
= get_or_create(fromdir
, fromname
, &oldde
);
12735 res
= get_or_create(todir
, toname
, &de
);
12739 if (op
== CEPH_MDS_OP_RENAME
) {
12740 req
->set_old_dentry(oldde
);
12741 req
->old_dentry_drop
= CEPH_CAP_FILE_SHARED
;
12742 req
->old_dentry_unless
= CEPH_CAP_FILE_EXCL
;
12744 req
->set_dentry(de
);
12745 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12746 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12748 InodeRef oldin
, otherin
;
12749 Inode
*fromdir_root
= nullptr;
12750 Inode
*todir_root
= nullptr;
12752 bool quota_check
= false;
12753 if (fromdir
!= todir
) {
12755 fromdir
->quota
.is_enable() ? fromdir
: get_quota_root(fromdir
, perm
);
12757 todir
->quota
.is_enable() ? todir
: get_quota_root(todir
, perm
);
12759 if (todir_root
->quota
.is_enable() && fromdir_root
!= todir_root
) {
12760 // use CEPH_STAT_RSTAT mask to force send getattr or lookup request
12761 // to auth MDS to get latest rstat for todir_root and source dir
12762 // even if their dentry caches and inode caps are satisfied.
12763 res
= _getattr(todir_root
, CEPH_STAT_RSTAT
, perm
, true);
12767 quota_check
= true;
12768 if (oldde
->inode
&& oldde
->inode
->is_dir()) {
12769 mask
|= CEPH_STAT_RSTAT
;
12774 res
= _lookup(fromdir
, fromname
, mask
, &oldin
, perm
);
12778 Inode
*oldinode
= oldin
.get();
12779 oldinode
->break_all_delegs();
12780 req
->set_old_inode(oldinode
);
12781 req
->old_inode_drop
= CEPH_CAP_LINK_SHARED
;
12784 int64_t old_bytes
, old_files
;
12785 if (oldinode
->is_dir()) {
12786 old_bytes
= oldinode
->rstat
.rbytes
;
12787 old_files
= oldinode
->rstat
.rsize();
12789 old_bytes
= oldinode
->size
;
12793 bool quota_exceed
= false;
12794 if (todir_root
&& todir_root
->quota
.max_bytes
&&
12795 (old_bytes
+ todir_root
->rstat
.rbytes
) >= todir_root
->quota
.max_bytes
) {
12796 ldout(cct
, 10) << "_rename (" << oldinode
->ino
<< " bytes="
12797 << old_bytes
<< ") to (" << todir
->ino
12798 << ") will exceed quota on " << *todir_root
<< dendl
;
12799 quota_exceed
= true;
12802 if (todir_root
&& todir_root
->quota
.max_files
&&
12803 (old_files
+ todir_root
->rstat
.rsize()) >= todir_root
->quota
.max_files
) {
12804 ldout(cct
, 10) << "_rename (" << oldinode
->ino
<< " files="
12805 << old_files
<< ") to (" << todir
->ino
12806 << ") will exceed quota on " << *todir_root
<< dendl
;
12807 quota_exceed
= true;
12810 if (quota_exceed
) {
12811 res
= (oldinode
->is_dir()) ? -EXDEV
: -EDQUOT
;
12816 res
= _lookup(todir
, toname
, 0, &otherin
, perm
);
12820 Inode
*in
= otherin
.get();
12821 req
->set_other_inode(in
);
12822 in
->break_all_delegs();
12824 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
12832 req
->set_inode(todir
);
12834 // renamesnap reply contains no tracedn, so we need to invalidate
12836 unlink(oldde
, true, true);
12837 unlink(de
, true, true);
12839 req
->set_inode(todir
);
12842 res
= make_request(req
, perm
, &target
);
12843 ldout(cct
, 10) << "rename result is " << res
<< dendl
;
12845 // renamed item from our cache
12848 ldout(cct
, 8) << "_rename(" << from
<< ", " << to
<< ") = " << res
<< dendl
;
12856 int Client::ll_rename(Inode
*parent
, const char *name
, Inode
*newparent
,
12857 const char *newname
, const UserPerm
& perm
)
12859 std::lock_guard
lock(client_lock
);
12864 vinodeno_t vparent
= _get_vino(parent
);
12865 vinodeno_t vnewparent
= _get_vino(newparent
);
12867 ldout(cct
, 3) << "ll_rename " << vparent
<< " " << name
<< " to "
12868 << vnewparent
<< " " << newname
<< dendl
;
12869 tout(cct
) << "ll_rename" << std::endl
;
12870 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12871 tout(cct
) << name
<< std::endl
;
12872 tout(cct
) << vnewparent
.ino
.val
<< std::endl
;
12873 tout(cct
) << newname
<< std::endl
;
12875 if (!fuse_default_permissions
) {
12876 int r
= may_delete(parent
, name
, perm
);
12879 r
= may_delete(newparent
, newname
, perm
);
12880 if (r
< 0 && r
!= -ENOENT
)
12884 return _rename(parent
, name
, newparent
, newname
, perm
);
12887 int Client::_link(Inode
*in
, Inode
*dir
, const char *newname
, const UserPerm
& perm
, InodeRef
*inp
)
12889 ldout(cct
, 8) << "_link(" << in
->ino
<< " to " << dir
->ino
<< " " << newname
12890 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")" << dendl
;
12892 if (strlen(newname
) > NAME_MAX
)
12893 return -ENAMETOOLONG
;
12895 if (in
->snapid
!= CEPH_NOSNAP
|| dir
->snapid
!= CEPH_NOSNAP
) {
12898 if (is_quota_files_exceeded(dir
, perm
)) {
12902 in
->break_all_delegs();
12903 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LINK
);
12905 filepath
path(newname
, dir
->ino
);
12906 req
->set_filepath(path
);
12907 filepath
existing(in
->ino
);
12908 req
->set_filepath2(existing
);
12910 req
->set_inode(dir
);
12911 req
->inode_drop
= CEPH_CAP_FILE_SHARED
;
12912 req
->inode_unless
= CEPH_CAP_FILE_EXCL
;
12915 int res
= get_or_create(dir
, newname
, &de
);
12918 req
->set_dentry(de
);
12920 res
= make_request(req
, perm
, inp
);
12921 ldout(cct
, 10) << "link result is " << res
<< dendl
;
12924 ldout(cct
, 8) << "link(" << existing
<< ", " << path
<< ") = " << res
<< dendl
;
12932 int Client::ll_link(Inode
*in
, Inode
*newparent
, const char *newname
,
12933 const UserPerm
& perm
)
12935 std::lock_guard
lock(client_lock
);
12940 vinodeno_t vino
= _get_vino(in
);
12941 vinodeno_t vnewparent
= _get_vino(newparent
);
12943 ldout(cct
, 3) << "ll_link " << vino
<< " to " << vnewparent
<< " " <<
12945 tout(cct
) << "ll_link" << std::endl
;
12946 tout(cct
) << vino
.ino
.val
<< std::endl
;
12947 tout(cct
) << vnewparent
<< std::endl
;
12948 tout(cct
) << newname
<< std::endl
;
12952 if (!fuse_default_permissions
) {
12953 if (S_ISDIR(in
->mode
))
12956 int r
= may_hardlink(in
, perm
);
12960 r
= may_create(newparent
, perm
);
12965 return _link(in
, newparent
, newname
, perm
, &target
);
12968 int Client::ll_num_osds(void)
12970 std::lock_guard
lock(client_lock
);
12971 return objecter
->with_osdmap(std::mem_fn(&OSDMap::get_num_osds
));
12974 int Client::ll_osdaddr(int osd
, uint32_t *addr
)
12976 std::lock_guard
lock(client_lock
);
12979 bool exists
= objecter
->with_osdmap([&](const OSDMap
& o
) {
12980 if (!o
.exists(osd
))
12982 g
= o
.get_addrs(osd
).front();
12987 uint32_t nb_addr
= (g
.in4_addr()).sin_addr
.s_addr
;
12988 *addr
= ntohl(nb_addr
);
12992 uint32_t Client::ll_stripe_unit(Inode
*in
)
12994 std::lock_guard
lock(client_lock
);
12995 return in
->layout
.stripe_unit
;
12998 uint64_t Client::ll_snap_seq(Inode
*in
)
13000 std::lock_guard
lock(client_lock
);
13001 return in
->snaprealm
->seq
;
13004 int Client::ll_file_layout(Inode
*in
, file_layout_t
*layout
)
13006 std::lock_guard
lock(client_lock
);
13007 *layout
= in
->layout
;
13011 int Client::ll_file_layout(Fh
*fh
, file_layout_t
*layout
)
13013 return ll_file_layout(fh
->inode
.get(), layout
);
13016 /* Currently we cannot take advantage of redundancy in reads, since we
13017 would have to go through all possible placement groups (a
13018 potentially quite large number determined by a hash), and use CRUSH
13019 to calculate the appropriate set of OSDs for each placement group,
13020 then index into that. An array with one entry per OSD is much more
13021 tractable and works for demonstration purposes. */
13023 int Client::ll_get_stripe_osd(Inode
*in
, uint64_t blockno
,
13024 file_layout_t
* layout
)
13026 std::lock_guard
lock(client_lock
);
13028 inodeno_t ino
= in
->ino
;
13029 uint32_t object_size
= layout
->object_size
;
13030 uint32_t su
= layout
->stripe_unit
;
13031 uint32_t stripe_count
= layout
->stripe_count
;
13032 uint64_t stripes_per_object
= object_size
/ su
;
13033 uint64_t stripeno
= 0, stripepos
= 0;
13036 stripeno
= blockno
/ stripe_count
; // which horizontal stripe (Y)
13037 stripepos
= blockno
% stripe_count
; // which object in the object set (X)
13039 uint64_t objectsetno
= stripeno
/ stripes_per_object
; // which object set
13040 uint64_t objectno
= objectsetno
* stripe_count
+ stripepos
; // object id
13042 object_t oid
= file_object_t(ino
, objectno
);
13043 return objecter
->with_osdmap([&](const OSDMap
& o
) {
13044 ceph_object_layout olayout
=
13045 o
.file_to_object_layout(oid
, *layout
);
13046 pg_t pg
= (pg_t
)olayout
.ol_pgid
;
13049 o
.pg_to_acting_osds(pg
, &osds
, &primary
);
13054 /* Return the offset of the block, internal to the object */
13056 uint64_t Client::ll_get_internal_offset(Inode
*in
, uint64_t blockno
)
13058 std::lock_guard
lock(client_lock
);
13059 file_layout_t
*layout
=&(in
->layout
);
13060 uint32_t object_size
= layout
->object_size
;
13061 uint32_t su
= layout
->stripe_unit
;
13062 uint64_t stripes_per_object
= object_size
/ su
;
13064 return (blockno
% stripes_per_object
) * su
;
13067 int Client::ll_opendir(Inode
*in
, int flags
, dir_result_t
** dirpp
,
13068 const UserPerm
& perms
)
13070 std::lock_guard
lock(client_lock
);
13075 vinodeno_t vino
= _get_vino(in
);
13077 ldout(cct
, 3) << "ll_opendir " << vino
<< dendl
;
13078 tout(cct
) << "ll_opendir" << std::endl
;
13079 tout(cct
) << vino
.ino
.val
<< std::endl
;
13081 if (!fuse_default_permissions
) {
13082 int r
= may_open(in
, flags
, perms
);
13087 int r
= _opendir(in
, dirpp
, perms
);
13088 tout(cct
) << (unsigned long)*dirpp
<< std::endl
;
13090 ldout(cct
, 3) << "ll_opendir " << vino
<< " = " << r
<< " (" << *dirpp
<< ")"
13095 int Client::ll_releasedir(dir_result_t
*dirp
)
13097 std::lock_guard
lock(client_lock
);
13098 ldout(cct
, 3) << "ll_releasedir " << dirp
<< dendl
;
13099 tout(cct
) << "ll_releasedir" << std::endl
;
13100 tout(cct
) << (unsigned long)dirp
<< std::endl
;
13109 int Client::ll_fsyncdir(dir_result_t
*dirp
)
13111 std::lock_guard
lock(client_lock
);
13112 ldout(cct
, 3) << "ll_fsyncdir " << dirp
<< dendl
;
13113 tout(cct
) << "ll_fsyncdir" << std::endl
;
13114 tout(cct
) << (unsigned long)dirp
<< std::endl
;
13119 return _fsync(dirp
->inode
.get(), false);
13122 int Client::ll_open(Inode
*in
, int flags
, Fh
**fhp
, const UserPerm
& perms
)
13124 ceph_assert(!(flags
& O_CREAT
));
13126 std::lock_guard
lock(client_lock
);
13131 vinodeno_t vino
= _get_vino(in
);
13133 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) << dendl
;
13134 tout(cct
) << "ll_open" << std::endl
;
13135 tout(cct
) << vino
.ino
.val
<< std::endl
;
13136 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
13139 if (!fuse_default_permissions
) {
13140 r
= may_open(in
, flags
, perms
);
13145 r
= _open(in
, flags
, 0, fhp
/* may be NULL */, perms
);
13148 Fh
*fhptr
= fhp
? *fhp
: NULL
;
13150 ll_unclosed_fh_set
.insert(fhptr
);
13152 tout(cct
) << (unsigned long)fhptr
<< std::endl
;
13153 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) <<
13154 " = " << r
<< " (" << fhptr
<< ")" << dendl
;
13158 int Client::_ll_create(Inode
*parent
, const char *name
, mode_t mode
,
13159 int flags
, InodeRef
*in
, int caps
, Fh
**fhp
,
13160 const UserPerm
& perms
)
13164 vinodeno_t vparent
= _get_vino(parent
);
13166 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
13167 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << ", uid " << perms
.uid()
13168 << ", gid " << perms
.gid() << dendl
;
13169 tout(cct
) << "ll_create" << std::endl
;
13170 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13171 tout(cct
) << name
<< std::endl
;
13172 tout(cct
) << mode
<< std::endl
;
13173 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
13175 bool created
= false;
13176 int r
= _lookup(parent
, name
, caps
, in
, perms
);
13178 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
13181 if (r
== -ENOENT
&& (flags
& O_CREAT
)) {
13182 if (!fuse_default_permissions
) {
13183 r
= may_create(parent
, perms
);
13187 r
= _create(parent
, name
, flags
, mode
, in
, fhp
, 0, 0, 0, NULL
, &created
,
13198 ldout(cct
, 20) << "_ll_create created = " << created
<< dendl
;
13200 if (!fuse_default_permissions
) {
13201 r
= may_open(in
->get(), flags
, perms
);
13204 int release_r
= _release_fh(*fhp
);
13205 ceph_assert(release_r
== 0); // during create, no async data ops should have happened
13210 if (*fhp
== NULL
) {
13211 r
= _open(in
->get(), flags
, mode
, fhp
, perms
);
13219 ll_unclosed_fh_set
.insert(*fhp
);
13224 Inode
*inode
= in
->get();
13225 if (use_faked_inos())
13226 ino
= inode
->faked_ino
;
13231 tout(cct
) << (unsigned long)*fhp
<< std::endl
;
13232 tout(cct
) << ino
<< std::endl
;
13233 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
13234 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << " = " << r
<< " (" <<
13235 *fhp
<< " " << hex
<< ino
<< dec
<< ")" << dendl
;
13240 int Client::ll_create(Inode
*parent
, const char *name
, mode_t mode
,
13241 int flags
, struct stat
*attr
, Inode
**outp
, Fh
**fhp
,
13242 const UserPerm
& perms
)
13244 std::lock_guard
lock(client_lock
);
13250 int r
= _ll_create(parent
, name
, mode
, flags
, &in
, CEPH_STAT_CAP_INODE_ALL
,
13255 // passing an Inode in outp requires an additional ref
13260 fill_stat(in
, attr
);
13268 int Client::ll_createx(Inode
*parent
, const char *name
, mode_t mode
,
13269 int oflags
, Inode
**outp
, Fh
**fhp
,
13270 struct ceph_statx
*stx
, unsigned want
, unsigned lflags
,
13271 const UserPerm
& perms
)
13273 unsigned caps
= statx_to_mask(lflags
, want
);
13274 std::lock_guard
lock(client_lock
);
13280 int r
= _ll_create(parent
, name
, mode
, oflags
, &in
, caps
, fhp
, perms
);
13284 // passing an Inode in outp requires an additional ref
13289 fill_statx(in
, caps
, stx
);
13298 loff_t
Client::ll_lseek(Fh
*fh
, loff_t offset
, int whence
)
13300 std::lock_guard
lock(client_lock
);
13301 tout(cct
) << "ll_lseek" << std::endl
;
13302 tout(cct
) << offset
<< std::endl
;
13303 tout(cct
) << whence
<< std::endl
;
13308 return _lseek(fh
, offset
, whence
);
13311 int Client::ll_read(Fh
*fh
, loff_t off
, loff_t len
, bufferlist
*bl
)
13313 std::lock_guard
lock(client_lock
);
13314 ldout(cct
, 3) << "ll_read " << fh
<< " " << fh
->inode
->ino
<< " " << " " << off
<< "~" << len
<< dendl
;
13315 tout(cct
) << "ll_read" << std::endl
;
13316 tout(cct
) << (unsigned long)fh
<< std::endl
;
13317 tout(cct
) << off
<< std::endl
;
13318 tout(cct
) << len
<< std::endl
;
13323 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13324 len
= std::min(len
, (loff_t
)INT_MAX
);
13325 return _read(fh
, off
, len
, bl
);
13328 int Client::ll_read_block(Inode
*in
, uint64_t blockid
,
13332 file_layout_t
* layout
)
13334 std::lock_guard
lock(client_lock
);
13339 vinodeno_t vino
= _get_vino(in
);
13340 object_t oid
= file_object_t(vino
.ino
, blockid
);
13341 C_SaferCond onfinish
;
13344 objecter
->read(oid
,
13345 object_locator_t(layout
->pool_id
),
13350 CEPH_OSD_FLAG_READ
,
13353 client_lock
.unlock();
13354 int r
= onfinish
.wait();
13355 client_lock
.lock();
13358 bl
.begin().copy(bl
.length(), buf
);
13365 /* It appears that the OSD doesn't return success unless the entire
13366 buffer was written, return the write length on success. */
13368 int Client::ll_write_block(Inode
*in
, uint64_t blockid
,
13369 char* buf
, uint64_t offset
,
13370 uint64_t length
, file_layout_t
* layout
,
13371 uint64_t snapseq
, uint32_t sync
)
13373 vinodeno_t vino
= ll_get_vino(in
);
13375 std::unique_ptr
<C_SaferCond
> onsafe
= nullptr;
13380 if (true || sync
) {
13381 /* if write is stable, the epilogue is waiting on
13383 onsafe
.reset(new C_SaferCond("Client::ll_write_block flock"));
13385 object_t oid
= file_object_t(vino
.ino
, blockid
);
13386 SnapContext fakesnap
;
13387 ceph::bufferlist bl
;
13389 bl
.push_back(buffer::copy(buf
, length
));
13392 ldout(cct
, 1) << "ll_block_write for " << vino
.ino
<< "." << blockid
13395 fakesnap
.seq
= snapseq
;
13397 /* lock just in time */
13398 client_lock
.lock();
13400 client_lock
.unlock();
13404 objecter
->write(oid
,
13405 object_locator_t(layout
->pool_id
),
13410 ceph::real_clock::now(),
13414 client_lock
.unlock();
13415 if (nullptr != onsafe
) {
13416 r
= onsafe
->wait();
13426 int Client::ll_commit_blocks(Inode
*in
,
13430 std::lock_guard
lock(client_lock
);
13432 BarrierContext *bctx;
13433 vinodeno_t vino = _get_vino(in);
13434 uint64_t ino = vino.ino;
13436 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
13437 << offset << " to " << length << dendl;
13443 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
13444 if (p != barriers.end()) {
13445 barrier_interval civ(offset, offset + length);
13446 p->second->commit_barrier(civ);
13452 int Client::ll_write(Fh
*fh
, loff_t off
, loff_t len
, const char *data
)
13454 std::lock_guard
lock(client_lock
);
13455 ldout(cct
, 3) << "ll_write " << fh
<< " " << fh
->inode
->ino
<< " " << off
<<
13456 "~" << len
<< dendl
;
13457 tout(cct
) << "ll_write" << std::endl
;
13458 tout(cct
) << (unsigned long)fh
<< std::endl
;
13459 tout(cct
) << off
<< std::endl
;
13460 tout(cct
) << len
<< std::endl
;
13465 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13466 len
= std::min(len
, (loff_t
)INT_MAX
);
13467 int r
= _write(fh
, off
, len
, data
, NULL
, 0);
13468 ldout(cct
, 3) << "ll_write " << fh
<< " " << off
<< "~" << len
<< " = " << r
13473 int64_t Client::ll_writev(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
13475 std::lock_guard
lock(client_lock
);
13478 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, true, false);
13481 int64_t Client::ll_readv(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
13483 std::lock_guard
lock(client_lock
);
13486 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, false, false);
13489 int Client::ll_flush(Fh
*fh
)
13491 std::lock_guard
lock(client_lock
);
13492 ldout(cct
, 3) << "ll_flush " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
13493 tout(cct
) << "ll_flush" << std::endl
;
13494 tout(cct
) << (unsigned long)fh
<< std::endl
;
13502 int Client::ll_fsync(Fh
*fh
, bool syncdataonly
)
13504 std::lock_guard
lock(client_lock
);
13505 ldout(cct
, 3) << "ll_fsync " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
13506 tout(cct
) << "ll_fsync" << std::endl
;
13507 tout(cct
) << (unsigned long)fh
<< std::endl
;
13512 int r
= _fsync(fh
, syncdataonly
);
13514 // If we're returning an error, clear it from the FH
13515 fh
->take_async_err();
13520 int Client::ll_sync_inode(Inode
*in
, bool syncdataonly
)
13522 std::lock_guard
lock(client_lock
);
13523 ldout(cct
, 3) << "ll_sync_inode " << *in
<< " " << dendl
;
13524 tout(cct
) << "ll_sync_inode" << std::endl
;
13525 tout(cct
) << (unsigned long)in
<< std::endl
;
13530 return _fsync(in
, syncdataonly
);
13533 #ifdef FALLOC_FL_PUNCH_HOLE
13535 int Client::_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
13537 if (offset
< 0 || length
<= 0)
13540 if (mode
& ~(FALLOC_FL_KEEP_SIZE
| FALLOC_FL_PUNCH_HOLE
))
13541 return -EOPNOTSUPP
;
13543 if ((mode
& FALLOC_FL_PUNCH_HOLE
) && !(mode
& FALLOC_FL_KEEP_SIZE
))
13544 return -EOPNOTSUPP
;
13546 Inode
*in
= fh
->inode
.get();
13548 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
) &&
13549 !(mode
& FALLOC_FL_PUNCH_HOLE
)) {
13553 if (in
->snapid
!= CEPH_NOSNAP
)
13556 if ((fh
->mode
& CEPH_FILE_MODE_WR
) == 0)
13559 uint64_t size
= offset
+ length
;
13560 if (!(mode
& (FALLOC_FL_PUNCH_HOLE
| FALLOC_FL_KEEP_SIZE
)) &&
13562 is_quota_bytes_exceeded(in
, size
- in
->size
, fh
->actor_perms
)) {
13567 int r
= get_caps(in
, CEPH_CAP_FILE_WR
, CEPH_CAP_FILE_BUFFER
, &have
, -1);
13571 std::unique_ptr
<C_SaferCond
> onuninline
= nullptr;
13572 if (mode
& FALLOC_FL_PUNCH_HOLE
) {
13573 if (in
->inline_version
< CEPH_INLINE_NONE
&&
13574 (have
& CEPH_CAP_FILE_BUFFER
)) {
13576 auto inline_iter
= in
->inline_data
.cbegin();
13577 int len
= in
->inline_data
.length();
13578 if (offset
< len
) {
13580 inline_iter
.copy(offset
, bl
);
13582 if (offset
+ size
> len
)
13583 size
= len
- offset
;
13585 bl
.append_zero(size
);
13586 if (offset
+ size
< len
) {
13587 inline_iter
+= size
;
13588 inline_iter
.copy(len
- offset
- size
, bl
);
13590 in
->inline_data
= bl
;
13591 in
->inline_version
++;
13593 in
->mtime
= in
->ctime
= ceph_clock_now();
13595 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13597 if (in
->inline_version
< CEPH_INLINE_NONE
) {
13598 onuninline
.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
13599 uninline_data(in
, onuninline
.get());
13602 C_SaferCond
onfinish("Client::_punch_hole flock");
13604 unsafe_sync_write
++;
13605 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
13607 _invalidate_inode_cache(in
, offset
, length
);
13608 filer
->zero(in
->ino
, &in
->layout
,
13609 in
->snaprealm
->get_snap_context(),
13611 ceph::real_clock::now(),
13612 0, true, &onfinish
);
13613 in
->mtime
= in
->ctime
= ceph_clock_now();
13615 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13617 client_lock
.unlock();
13619 client_lock
.lock();
13620 _sync_write_commit(in
);
13622 } else if (!(mode
& FALLOC_FL_KEEP_SIZE
)) {
13623 uint64_t size
= offset
+ length
;
13624 if (size
> in
->size
) {
13626 in
->mtime
= in
->ctime
= ceph_clock_now();
13628 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13630 if (is_quota_bytes_approaching(in
, fh
->actor_perms
)) {
13631 check_caps(in
, CHECK_CAPS_NODELAY
);
13632 } else if (is_max_size_approaching(in
)) {
13638 if (nullptr != onuninline
) {
13639 client_lock
.unlock();
13640 int ret
= onuninline
->wait();
13641 client_lock
.lock();
13643 if (ret
>= 0 || ret
== -ECANCELED
) {
13644 in
->inline_data
.clear();
13645 in
->inline_version
= CEPH_INLINE_NONE
;
13646 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13652 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
13657 int Client::_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
13659 return -EOPNOTSUPP
;
13665 int Client::ll_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
13667 std::lock_guard
lock(client_lock
);
13668 ldout(cct
, 3) << __func__
<< " " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
13669 tout(cct
) << __func__
<< " " << mode
<< " " << offset
<< " " << length
<< std::endl
;
13670 tout(cct
) << (unsigned long)fh
<< std::endl
;
13675 return _fallocate(fh
, mode
, offset
, length
);
13678 int Client::fallocate(int fd
, int mode
, loff_t offset
, loff_t length
)
13680 std::lock_guard
lock(client_lock
);
13681 tout(cct
) << __func__
<< " " << " " << fd
<< mode
<< " " << offset
<< " " << length
<< std::endl
;
13686 Fh
*fh
= get_filehandle(fd
);
13689 #if defined(__linux__) && defined(O_PATH)
13690 if (fh
->flags
& O_PATH
)
13693 return _fallocate(fh
, mode
, offset
, length
);
13696 int Client::ll_release(Fh
*fh
)
13698 std::lock_guard
lock(client_lock
);
13703 ldout(cct
, 3) << __func__
<< " (fh)" << fh
<< " " << fh
->inode
->ino
<< " " <<
13705 tout(cct
) << __func__
<< " (fh)" << std::endl
;
13706 tout(cct
) << (unsigned long)fh
<< std::endl
;
13708 if (ll_unclosed_fh_set
.count(fh
))
13709 ll_unclosed_fh_set
.erase(fh
);
13710 return _release_fh(fh
);
13713 int Client::ll_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
13715 std::lock_guard
lock(client_lock
);
13717 ldout(cct
, 3) << "ll_getlk (fh)" << fh
<< " " << fh
->inode
->ino
<< dendl
;
13718 tout(cct
) << "ll_getk (fh)" << (unsigned long)fh
<< std::endl
;
13723 return _getlk(fh
, fl
, owner
);
13726 int Client::ll_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
13728 std::lock_guard
lock(client_lock
);
13730 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
13731 tout(cct
) << __func__
<< " (fh)" << (unsigned long)fh
<< std::endl
;
13736 return _setlk(fh
, fl
, owner
, sleep
);
13739 int Client::ll_flock(Fh
*fh
, int cmd
, uint64_t owner
)
13741 std::lock_guard
lock(client_lock
);
13743 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
13744 tout(cct
) << __func__
<< " (fh)" << (unsigned long)fh
<< std::endl
;
13749 return _flock(fh
, cmd
, owner
);
13752 int Client::set_deleg_timeout(uint32_t timeout
)
13754 std::lock_guard
lock(client_lock
);
13757 * The whole point is to prevent blacklisting so we must time out the
13758 * delegation before the session autoclose timeout kicks in.
13760 if (timeout
>= mdsmap
->get_session_autoclose())
13763 deleg_timeout
= timeout
;
13767 int Client::ll_delegation(Fh
*fh
, unsigned cmd
, ceph_deleg_cb_t cb
, void *priv
)
13771 std::lock_guard
lock(client_lock
);
13776 Inode
*inode
= fh
->inode
.get();
13779 case CEPH_DELEGATION_NONE
:
13780 inode
->unset_deleg(fh
);
13785 ret
= inode
->set_deleg(fh
, cmd
, cb
, priv
);
13786 } catch (std::bad_alloc
&) {
13794 class C_Client_RequestInterrupt
: public Context
{
13799 C_Client_RequestInterrupt(Client
*c
, MetaRequest
*r
) : client(c
), req(r
) {
13802 void finish(int r
) override
{
13803 std::lock_guard
l(client
->client_lock
);
13804 ceph_assert(req
->head
.op
== CEPH_MDS_OP_SETFILELOCK
);
13805 client
->_interrupt_filelock(req
);
13806 client
->put_request(req
);
13810 void Client::ll_interrupt(void *d
)
13812 MetaRequest
*req
= static_cast<MetaRequest
*>(d
);
13813 ldout(cct
, 3) << __func__
<< " tid " << req
->get_tid() << dendl
;
13814 tout(cct
) << __func__
<< " tid " << req
->get_tid() << std::endl
;
13815 interrupt_finisher
.queue(new C_Client_RequestInterrupt(this, req
));
13818 // =========================================
13821 // expose file layouts
13823 int Client::describe_layout(const char *relpath
, file_layout_t
*lp
,
13824 const UserPerm
& perms
)
13826 std::lock_guard
lock(client_lock
);
13831 filepath
path(relpath
);
13833 int r
= path_walk(path
, &in
, perms
);
13839 ldout(cct
, 3) << __func__
<< "(" << relpath
<< ") = 0" << dendl
;
13843 int Client::fdescribe_layout(int fd
, file_layout_t
*lp
)
13845 std::lock_guard
lock(client_lock
);
13850 Fh
*f
= get_filehandle(fd
);
13853 Inode
*in
= f
->inode
.get();
13857 ldout(cct
, 3) << __func__
<< "(" << fd
<< ") = 0" << dendl
;
13861 int64_t Client::get_default_pool_id()
13863 std::lock_guard
lock(client_lock
);
13868 /* first data pool is the default */
13869 return mdsmap
->get_first_data_pool();
13874 int64_t Client::get_pool_id(const char *pool_name
)
13876 std::lock_guard
lock(client_lock
);
13881 return objecter
->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name
),
13885 string
Client::get_pool_name(int64_t pool
)
13887 std::lock_guard
lock(client_lock
);
13892 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
13893 return o
.have_pg_pool(pool
) ? o
.get_pool_name(pool
) : string();
13897 int Client::get_pool_replication(int64_t pool
)
13899 std::lock_guard
lock(client_lock
);
13904 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
13905 return o
.have_pg_pool(pool
) ? o
.get_pg_pool(pool
)->get_size() : -ENOENT
;
13909 int Client::get_file_extent_osds(int fd
, loff_t off
, loff_t
*len
, vector
<int>& osds
)
13911 std::lock_guard
lock(client_lock
);
13916 Fh
*f
= get_filehandle(fd
);
13919 Inode
*in
= f
->inode
.get();
13921 vector
<ObjectExtent
> extents
;
13922 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, 1, in
->truncate_size
, extents
);
13923 ceph_assert(extents
.size() == 1);
13925 objecter
->with_osdmap([&](const OSDMap
& o
) {
13926 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
13927 o
.pg_to_acting_osds(pg
, osds
);
13934 * Return the remainder of the extent (stripe unit)
13936 * If length = 1 is passed to Striper::file_to_extents we get a single
13937 * extent back, but its length is one so we still need to compute the length
13938 * to the end of the stripe unit.
13940 * If length = su then we may get 1 or 2 objects back in the extents vector
13941 * which would have to be examined. Even then, the offsets are local to the
13942 * object, so matching up to the file offset is extra work.
13944 * It seems simpler to stick with length = 1 and manually compute the
13948 uint64_t su
= in
->layout
.stripe_unit
;
13949 *len
= su
- (off
% su
);
13955 int Client::get_osd_crush_location(int id
, vector
<pair
<string
, string
> >& path
)
13957 std::lock_guard
lock(client_lock
);
13964 return objecter
->with_osdmap([&](const OSDMap
& o
) {
13965 return o
.crush
->get_full_location_ordered(id
, path
);
13969 int Client::get_file_stripe_address(int fd
, loff_t offset
,
13970 vector
<entity_addr_t
>& address
)
13972 std::lock_guard
lock(client_lock
);
13977 Fh
*f
= get_filehandle(fd
);
13980 Inode
*in
= f
->inode
.get();
13983 vector
<ObjectExtent
> extents
;
13984 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, 1,
13985 in
->truncate_size
, extents
);
13986 ceph_assert(extents
.size() == 1);
13988 // now we have the object and its 'layout'
13989 return objecter
->with_osdmap([&](const OSDMap
& o
) {
13990 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
13992 o
.pg_to_acting_osds(pg
, osds
);
13995 for (unsigned i
= 0; i
< osds
.size(); i
++) {
13996 entity_addr_t addr
= o
.get_addrs(osds
[i
]).front();
13997 address
.push_back(addr
);
14003 int Client::get_osd_addr(int osd
, entity_addr_t
& addr
)
14005 std::lock_guard
lock(client_lock
);
14010 return objecter
->with_osdmap([&](const OSDMap
& o
) {
14011 if (!o
.exists(osd
))
14014 addr
= o
.get_addrs(osd
).front();
14019 int Client::enumerate_layout(int fd
, vector
<ObjectExtent
>& result
,
14020 loff_t length
, loff_t offset
)
14022 std::lock_guard
lock(client_lock
);
14027 Fh
*f
= get_filehandle(fd
);
14030 Inode
*in
= f
->inode
.get();
14032 // map to a list of extents
14033 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, length
, in
->truncate_size
, result
);
14035 ldout(cct
, 3) << __func__
<< "(" << fd
<< ", " << length
<< ", " << offset
<< ") = 0" << dendl
;
14040 /* find an osd with the same ip. -ENXIO if none. */
14041 int Client::get_local_osd()
14043 std::lock_guard
lock(client_lock
);
14048 objecter
->with_osdmap([this](const OSDMap
& o
) {
14049 if (o
.get_epoch() != local_osd_epoch
) {
14050 local_osd
= o
.find_osd_on_ip(messenger
->get_myaddrs().front());
14051 local_osd_epoch
= o
.get_epoch();
14062 // ===============================
14064 void Client::ms_handle_connect(Connection
*con
)
14066 ldout(cct
, 10) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
14069 bool Client::ms_handle_reset(Connection
*con
)
14071 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
14075 void Client::ms_handle_remote_reset(Connection
*con
)
14077 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
14078 std::lock_guard
l(client_lock
);
14079 switch (con
->get_peer_type()) {
14080 case CEPH_ENTITY_TYPE_MDS
:
14082 // kludge to figure out which mds this is; fixme with a Connection* state
14083 mds_rank_t mds
= MDS_RANK_NONE
;
14084 MetaSession
*s
= NULL
;
14085 for (auto &p
: mds_sessions
) {
14086 if (mdsmap
->get_addrs(p
.first
) == con
->get_peer_addrs()) {
14092 assert (s
!= NULL
);
14093 switch (s
->state
) {
14094 case MetaSession::STATE_CLOSING
:
14095 ldout(cct
, 1) << "reset from mds we were closing; we'll call that closed" << dendl
;
14096 _closed_mds_session(s
);
14099 case MetaSession::STATE_OPENING
:
14101 ldout(cct
, 1) << "reset from mds we were opening; retrying" << dendl
;
14102 list
<Context
*> waiters
;
14103 waiters
.swap(s
->waiting_for_open
);
14104 _closed_mds_session(s
);
14105 MetaSession
*news
= _get_or_open_mds_session(mds
);
14106 news
->waiting_for_open
.swap(waiters
);
14110 case MetaSession::STATE_OPEN
:
14112 objecter
->maybe_request_map(); /* to check if we are blacklisted */
14113 const auto& conf
= cct
->_conf
;
14114 if (conf
->client_reconnect_stale
) {
14115 ldout(cct
, 1) << "reset from mds we were open; close mds session for reconnect" << dendl
;
14116 _closed_mds_session(s
);
14118 ldout(cct
, 1) << "reset from mds we were open; mark session as stale" << dendl
;
14119 s
->state
= MetaSession::STATE_STALE
;
14124 case MetaSession::STATE_NEW
:
14125 case MetaSession::STATE_CLOSED
:
14135 bool Client::ms_handle_refused(Connection
*con
)
14137 ldout(cct
, 1) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
14141 Inode
*Client::get_quota_root(Inode
*in
, const UserPerm
& perms
)
14143 Inode
*quota_in
= root_ancestor
;
14144 SnapRealm
*realm
= in
->snaprealm
;
14146 ldout(cct
, 10) << __func__
<< " realm " << realm
->ino
<< dendl
;
14147 if (realm
->ino
!= in
->ino
) {
14148 auto p
= inode_map
.find(vinodeno_t(realm
->ino
, CEPH_NOSNAP
));
14149 if (p
== inode_map
.end())
14152 if (p
->second
->quota
.is_enable()) {
14153 quota_in
= p
->second
;
14157 realm
= realm
->pparent
;
14159 ldout(cct
, 10) << __func__
<< " " << in
->vino() << " -> " << quota_in
->vino() << dendl
;
14164 * Traverse quota ancestors of the Inode, return true
14165 * if any of them passes the passed function
14167 bool Client::check_quota_condition(Inode
*in
, const UserPerm
& perms
,
14168 std::function
<bool (const Inode
&in
)> test
)
14171 ceph_assert(in
!= NULL
);
14176 if (in
== root_ancestor
) {
14177 // We're done traversing, drop out
14180 // Continue up the tree
14181 in
= get_quota_root(in
, perms
);
14188 bool Client::is_quota_files_exceeded(Inode
*in
, const UserPerm
& perms
)
14190 return check_quota_condition(in
, perms
,
14191 [](const Inode
&in
) {
14192 return in
.quota
.max_files
&& in
.rstat
.rsize() >= in
.quota
.max_files
;
14196 bool Client::is_quota_bytes_exceeded(Inode
*in
, int64_t new_bytes
,
14197 const UserPerm
& perms
)
14199 return check_quota_condition(in
, perms
,
14200 [&new_bytes
](const Inode
&in
) {
14201 return in
.quota
.max_bytes
&& (in
.rstat
.rbytes
+ new_bytes
)
14202 > in
.quota
.max_bytes
;
14206 bool Client::is_quota_bytes_approaching(Inode
*in
, const UserPerm
& perms
)
14208 ceph_assert(in
->size
>= in
->reported_size
);
14209 const uint64_t size
= in
->size
- in
->reported_size
;
14210 return check_quota_condition(in
, perms
,
14211 [&size
](const Inode
&in
) {
14212 if (in
.quota
.max_bytes
) {
14213 if (in
.rstat
.rbytes
>= in
.quota
.max_bytes
) {
14217 const uint64_t space
= in
.quota
.max_bytes
- in
.rstat
.rbytes
;
14218 return (space
>> 4) < size
;
14232 int Client::check_pool_perm(Inode
*in
, int need
)
14234 if (!cct
->_conf
->client_check_pool_perm
)
14237 int64_t pool_id
= in
->layout
.pool_id
;
14238 std::string pool_ns
= in
->layout
.pool_ns
;
14239 std::pair
<int64_t, std::string
> perm_key(pool_id
, pool_ns
);
14242 auto it
= pool_perms
.find(perm_key
);
14243 if (it
== pool_perms
.end())
14245 if (it
->second
== POOL_CHECKING
) {
14246 // avoid concurrent checkings
14247 wait_on_list(waiting_for_pool_perm
);
14250 ceph_assert(have
& POOL_CHECKED
);
14256 if (in
->snapid
!= CEPH_NOSNAP
) {
14257 // pool permission check needs to write to the first object. But for snapshot,
14258 // head of the first object may have alread been deleted. To avoid creating
14259 // orphan object, skip the check for now.
14263 pool_perms
[perm_key
] = POOL_CHECKING
;
14266 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (unsigned long long)in
->ino
);
14267 object_t oid
= oid_buf
;
14269 SnapContext nullsnapc
;
14271 C_SaferCond rd_cond
;
14272 ObjectOperation rd_op
;
14273 rd_op
.stat(NULL
, (ceph::real_time
*)nullptr, NULL
);
14275 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), rd_op
,
14276 nullsnapc
, ceph::real_clock::now(), 0, &rd_cond
);
14278 C_SaferCond wr_cond
;
14279 ObjectOperation wr_op
;
14280 wr_op
.create(true);
14282 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), wr_op
,
14283 nullsnapc
, ceph::real_clock::now(), 0, &wr_cond
);
14285 client_lock
.unlock();
14286 int rd_ret
= rd_cond
.wait();
14287 int wr_ret
= wr_cond
.wait();
14288 client_lock
.lock();
14290 bool errored
= false;
14292 if (rd_ret
== 0 || rd_ret
== -ENOENT
)
14294 else if (rd_ret
!= -EPERM
) {
14295 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14296 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
14300 if (wr_ret
== 0 || wr_ret
== -EEXIST
)
14301 have
|= POOL_WRITE
;
14302 else if (wr_ret
!= -EPERM
) {
14303 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14304 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
14309 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
14310 // Raise EIO because actual error code might be misleading for
14311 // userspace filesystem user.
14312 pool_perms
.erase(perm_key
);
14313 signal_cond_list(waiting_for_pool_perm
);
14317 pool_perms
[perm_key
] = have
| POOL_CHECKED
;
14318 signal_cond_list(waiting_for_pool_perm
);
14321 if ((need
& CEPH_CAP_FILE_RD
) && !(have
& POOL_READ
)) {
14322 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14323 << " need " << ccap_string(need
) << ", but no read perm" << dendl
;
14326 if ((need
& CEPH_CAP_FILE_WR
) && !(have
& POOL_WRITE
)) {
14327 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14328 << " need " << ccap_string(need
) << ", but no write perm" << dendl
;
14335 int Client::_posix_acl_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
14337 if (acl_type
== POSIX_ACL
) {
14338 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
14339 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
14341 return posix_acl_permits(access_acl
, in
->uid
, in
->gid
, perms
, want
);
14347 int Client::_posix_acl_chmod(Inode
*in
, mode_t mode
, const UserPerm
& perms
)
14349 if (acl_type
== NO_ACL
)
14352 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
14356 if (acl_type
== POSIX_ACL
) {
14357 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
14358 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
14359 bufferptr
acl(access_acl
.c_str(), access_acl
.length());
14360 r
= posix_acl_access_chmod(acl
, mode
);
14363 r
= _do_setxattr(in
, ACL_EA_ACCESS
, acl
.c_str(), acl
.length(), 0, perms
);
14369 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< " result=" << r
<< dendl
;
14373 int Client::_posix_acl_create(Inode
*dir
, mode_t
*mode
, bufferlist
& xattrs_bl
,
14374 const UserPerm
& perms
)
14376 if (acl_type
== NO_ACL
)
14379 if (S_ISLNK(*mode
))
14382 int r
= _getattr(dir
, CEPH_STAT_CAP_XATTR
, perms
, dir
->xattr_version
== 0);
14386 if (acl_type
== POSIX_ACL
) {
14387 if (dir
->xattrs
.count(ACL_EA_DEFAULT
)) {
14388 map
<string
, bufferptr
> xattrs
;
14390 const bufferptr
& default_acl
= dir
->xattrs
[ACL_EA_DEFAULT
];
14391 bufferptr
acl(default_acl
.c_str(), default_acl
.length());
14392 r
= posix_acl_inherit_mode(acl
, mode
);
14397 r
= posix_acl_equiv_mode(acl
.c_str(), acl
.length(), mode
);
14401 xattrs
[ACL_EA_ACCESS
] = acl
;
14404 if (S_ISDIR(*mode
))
14405 xattrs
[ACL_EA_DEFAULT
] = dir
->xattrs
[ACL_EA_DEFAULT
];
14409 encode(xattrs
, xattrs_bl
);
14412 *mode
&= ~umask_cb(callback_handle
);
14417 ldout(cct
, 10) << __func__
<< " dir ino " << dir
->ino
<< " result=" << r
<< dendl
;
14421 void Client::set_filer_flags(int flags
)
14423 std::lock_guard
l(client_lock
);
14424 ceph_assert(flags
== 0 ||
14425 flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
14426 objecter
->add_global_op_flags(flags
);
14429 void Client::clear_filer_flags(int flags
)
14431 std::lock_guard
l(client_lock
);
14432 ceph_assert(flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
14433 objecter
->clear_global_op_flag(flags
);
14436 // called before mount
14437 void Client::set_uuid(const std::string
& uuid
)
14439 std::lock_guard
l(client_lock
);
14440 assert(initialized
);
14441 assert(!uuid
.empty());
14443 metadata
["uuid"] = uuid
;
14447 // called before mount. 0 means infinite
14448 void Client::set_session_timeout(unsigned timeout
)
14450 std::lock_guard
l(client_lock
);
14451 assert(initialized
);
14453 metadata
["timeout"] = stringify(timeout
);
14456 // called before mount
14457 int Client::start_reclaim(const std::string
& uuid
, unsigned flags
,
14458 const std::string
& fs_name
)
14460 std::lock_guard
l(client_lock
);
14468 auto it
= metadata
.find("uuid");
14469 if (it
!= metadata
.end() && it
->second
== uuid
)
14473 int r
= subscribe_mdsmap(fs_name
);
14475 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
14479 if (metadata
.empty())
14480 populate_metadata("");
14482 while (mdsmap
->get_epoch() == 0)
14483 wait_on_list(waiting_for_mdsmap
);
14486 for (unsigned mds
= 0; mds
< mdsmap
->get_num_in_mds(); ) {
14487 if (!mdsmap
->is_up(mds
)) {
14488 ldout(cct
, 10) << "mds." << mds
<< " not active, waiting for new mdsmap" << dendl
;
14489 wait_on_list(waiting_for_mdsmap
);
14493 MetaSession
*session
;
14494 if (!have_open_session(mds
)) {
14495 session
= _get_or_open_mds_session(mds
);
14496 if (session
->state
!= MetaSession::STATE_OPENING
) {
14500 ldout(cct
, 10) << "waiting for session to mds." << mds
<< " to open" << dendl
;
14501 wait_on_context_list(session
->waiting_for_open
);
14502 if (rejected_by_mds
.count(mds
))
14507 session
= &mds_sessions
.at(mds
);
14508 if (!session
->mds_features
.test(CEPHFS_FEATURE_RECLAIM_CLIENT
))
14509 return -EOPNOTSUPP
;
14511 if (session
->reclaim_state
== MetaSession::RECLAIM_NULL
||
14512 session
->reclaim_state
== MetaSession::RECLAIMING
) {
14513 session
->reclaim_state
= MetaSession::RECLAIMING
;
14514 auto m
= make_message
<MClientReclaim
>(uuid
, flags
);
14515 session
->con
->send_message2(std::move(m
));
14516 wait_on_list(waiting_for_reclaim
);
14517 } else if (session
->reclaim_state
== MetaSession::RECLAIM_FAIL
) {
14518 return reclaim_errno
? : -ENOTRECOVERABLE
;
14524 // didn't find target session in any mds
14525 if (reclaim_target_addrs
.empty()) {
14526 if (flags
& CEPH_RECLAIM_RESET
)
14528 return -ENOTRECOVERABLE
;
14531 if (flags
& CEPH_RECLAIM_RESET
)
14534 // use blacklist to check if target session was killed
14535 // (config option mds_session_blacklist_on_evict needs to be true)
14537 if (!objecter
->wait_for_map(reclaim_osd_epoch
, &cond
)) {
14538 ldout(cct
, 10) << __func__
<< ": waiting for OSD epoch " << reclaim_osd_epoch
<< dendl
;
14539 client_lock
.unlock();
14541 client_lock
.lock();
14544 bool blacklisted
= objecter
->with_osdmap(
14545 [this](const OSDMap
&osd_map
) -> bool {
14546 return osd_map
.is_blacklisted(reclaim_target_addrs
);
14549 return -ENOTRECOVERABLE
;
14551 metadata
["reclaiming_uuid"] = uuid
;
14555 void Client::finish_reclaim()
14557 auto it
= metadata
.find("reclaiming_uuid");
14558 if (it
== metadata
.end()) {
14559 for (auto &p
: mds_sessions
)
14560 p
.second
.reclaim_state
= MetaSession::RECLAIM_NULL
;
14564 for (auto &p
: mds_sessions
) {
14565 p
.second
.reclaim_state
= MetaSession::RECLAIM_NULL
;
14566 auto m
= make_message
<MClientReclaim
>("", MClientReclaim::FLAG_FINISH
);
14567 p
.second
.con
->send_message2(std::move(m
));
14570 metadata
["uuid"] = it
->second
;
14571 metadata
.erase(it
);
14574 void Client::handle_client_reclaim_reply(const MConstRef
<MClientReclaimReply
>& reply
)
14576 mds_rank_t from
= mds_rank_t(reply
->get_source().num());
14577 ldout(cct
, 10) << __func__
<< " " << *reply
<< " from mds." << from
<< dendl
;
14579 MetaSession
*session
= _get_mds_session(from
, reply
->get_connection().get());
14581 ldout(cct
, 10) << " discarding reclaim reply from sessionless mds." << from
<< dendl
;
14585 if (reply
->get_result() >= 0) {
14586 session
->reclaim_state
= MetaSession::RECLAIM_OK
;
14587 if (reply
->get_epoch() > reclaim_osd_epoch
)
14588 reclaim_osd_epoch
= reply
->get_epoch();
14589 if (!reply
->get_addrs().empty())
14590 reclaim_target_addrs
= reply
->get_addrs();
14592 session
->reclaim_state
= MetaSession::RECLAIM_FAIL
;
14593 reclaim_errno
= reply
->get_result();
14596 signal_cond_list(waiting_for_reclaim
);
14600 * This is included in cap release messages, to cause
14601 * the MDS to wait until this OSD map epoch. It is necessary
14602 * in corner cases where we cancel RADOS ops, so that
14603 * nobody else tries to do IO to the same objects in
14604 * the same epoch as the cancelled ops.
14606 void Client::set_cap_epoch_barrier(epoch_t e
)
14608 ldout(cct
, 5) << __func__
<< " epoch = " << e
<< dendl
;
14609 cap_epoch_barrier
= e
;
14612 const char** Client::get_tracked_conf_keys() const
14614 static const char* keys
[] = {
14615 "client_cache_size",
14616 "client_cache_mid",
14618 "client_deleg_timeout",
14619 "client_deleg_break_on_open",
14625 void Client::handle_conf_change(const ConfigProxy
& conf
,
14626 const std::set
<std::string
> &changed
)
14628 std::lock_guard
lock(client_lock
);
14630 if (changed
.count("client_cache_mid")) {
14631 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
14633 if (changed
.count("client_acl_type")) {
14635 if (cct
->_conf
->client_acl_type
== "posix_acl")
14636 acl_type
= POSIX_ACL
;
14640 void intrusive_ptr_add_ref(Inode
*in
)
14645 void intrusive_ptr_release(Inode
*in
)
14647 in
->client
->put_inode(in
);
14650 mds_rank_t
Client::_get_random_up_mds() const
14652 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
14654 std::set
<mds_rank_t
> up
;
14655 mdsmap
->get_up_mds_set(up
);
14658 return MDS_RANK_NONE
;
14659 std::set
<mds_rank_t
>::const_iterator p
= up
.begin();
14660 for (int n
= rand() % up
.size(); n
; n
--)
14666 StandaloneClient::StandaloneClient(Messenger
*m
, MonClient
*mc
)
14667 : Client(m
, mc
, new Objecter(m
->cct
, m
, mc
, NULL
, 0, 0))
14669 monclient
->set_messenger(m
);
14670 objecter
->set_client_incarnation(0);
14673 StandaloneClient::~StandaloneClient()
14676 objecter
= nullptr;
14679 int StandaloneClient::init()
14684 client_lock
.lock();
14685 ceph_assert(!is_initialized());
14687 messenger
->add_dispatcher_tail(objecter
);
14688 messenger
->add_dispatcher_tail(this);
14690 monclient
->set_want_keys(CEPH_ENTITY_TYPE_MDS
| CEPH_ENTITY_TYPE_OSD
);
14691 int r
= monclient
->init();
14693 // need to do cleanup because we're in an intermediate init state
14695 client_lock
.unlock();
14696 objecter
->shutdown();
14697 objectcacher
->stop();
14698 monclient
->shutdown();
14703 client_lock
.unlock();
14709 void StandaloneClient::shutdown()
14711 Client::shutdown();
14712 objecter
->shutdown();
14713 monclient
->shutdown();