1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 */
18 #include <sys/types.h>
23 #include <sys/param.h>
26 #include <sys/utsname.h>
29 #include <boost/lexical_cast.hpp>
30 #include <boost/fusion/include/std_pair.hpp>
32 #if defined(__FreeBSD__)
33 #define XATTR_CREATE 0x1
34 #define XATTR_REPLACE 0x2
36 #include <sys/xattr.h>
39 #if defined(__linux__)
40 #include <linux/falloc.h>
43 #include <sys/statvfs.h>
45 #include "common/config.h"
46 #include "common/version.h"
48 #include "mon/MonClient.h"
50 #include "messages/MClientCaps.h"
51 #include "messages/MClientLease.h"
52 #include "messages/MClientQuota.h"
53 #include "messages/MClientReclaim.h"
54 #include "messages/MClientReclaimReply.h"
55 #include "messages/MClientReconnect.h"
56 #include "messages/MClientReply.h"
57 #include "messages/MClientRequest.h"
58 #include "messages/MClientRequestForward.h"
59 #include "messages/MClientSession.h"
60 #include "messages/MClientSnap.h"
61 #include "messages/MCommandReply.h"
62 #include "messages/MFSMap.h"
63 #include "messages/MFSMapUser.h"
64 #include "messages/MMDSMap.h"
65 #include "messages/MOSDMap.h"
67 #include "mds/flock.h"
68 #include "mds/cephfs_features.h"
69 #include "osd/OSDMap.h"
70 #include "osdc/Filer.h"
72 #include "common/Cond.h"
73 #include "common/perf_counters.h"
74 #include "common/admin_socket.h"
75 #include "common/errno.h"
76 #include "include/str_list.h"
78 #define dout_subsys ceph_subsys_client
80 #include "include/lru.h"
81 #include "include/compat.h"
82 #include "include/stringify.h"
87 #include "Delegation.h"
89 #include "ClientSnapRealm.h"
91 #include "MetaSession.h"
92 #include "MetaRequest.h"
93 #include "ObjecterWriteback.h"
94 #include "posix_acl.h"
96 #include "include/ceph_assert.h"
97 #include "include/stat.h"
99 #include "include/cephfs/ceph_ll_client.h"
101 #if HAVE_GETGROUPLIST
108 #define dout_prefix *_dout << "client." << whoami << " "
110 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
112 // FreeBSD fails to define this
116 // Darwin fails to define this
125 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
127 void client_flush_set_callback(void *p
, ObjectCacher::ObjectSet
*oset
)
129 Client
*client
= static_cast<Client
*>(p
);
130 client
->flush_set_callback(oset
);
136 Client::CommandHook::CommandHook(Client
*client
) :
141 int Client::CommandHook::call(
142 std::string_view command
,
143 const cmdmap_t
& cmdmap
,
148 f
->open_object_section("result");
150 std::lock_guard l
{m_client
->client_lock
};
151 if (command
== "mds_requests")
152 m_client
->dump_mds_requests(f
);
153 else if (command
== "mds_sessions")
154 m_client
->dump_mds_sessions(f
);
155 else if (command
== "dump_cache")
156 m_client
->dump_cache(f
);
157 else if (command
== "kick_stale_sessions")
158 m_client
->_kick_stale_sessions();
159 else if (command
== "status")
160 m_client
->dump_status(f
);
162 ceph_abort_msg("bad command registered");
171 dir_result_t::dir_result_t(Inode
*in
, const UserPerm
& perms
)
172 : inode(in
), offset(0), next_offset(2),
173 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
177 void Client::_reset_faked_inos()
180 free_faked_inos
.clear();
181 free_faked_inos
.insert(start
, (uint32_t)-1 - start
+ 1);
182 last_used_faked_ino
= 0;
183 last_used_faked_root
= 0;
184 _use_faked_inos
= sizeof(ino_t
) < 8 || cct
->_conf
->client_use_faked_inos
;
187 void Client::_assign_faked_ino(Inode
*in
)
189 if (0 == last_used_faked_ino
)
190 last_used_faked_ino
= last_used_faked_ino
+ 2048; // start(1024)~2048 reserved for _assign_faked_root
191 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
192 if (it
== free_faked_inos
.end() && last_used_faked_ino
> 0) {
193 last_used_faked_ino
= 2048;
194 it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
196 ceph_assert(it
!= free_faked_inos
.end());
197 if (last_used_faked_ino
< it
.get_start()) {
198 ceph_assert(it
.get_len() > 0);
199 last_used_faked_ino
= it
.get_start();
201 ++last_used_faked_ino
;
202 ceph_assert(it
.get_start() + it
.get_len() > last_used_faked_ino
);
204 in
->faked_ino
= last_used_faked_ino
;
205 free_faked_inos
.erase(in
->faked_ino
);
206 faked_ino_map
[in
->faked_ino
] = in
->vino();
210 * In the faked mode, if you export multiple subdirectories,
211 * you will see that the inode numbers of the exported subdirectories
212 * are the same. so we distinguish the mount point by reserving
213 * the "fake ids" between "1024~2048" and combining the last
214 * 10bits(0x3ff) of the "root inodes".
216 void Client::_assign_faked_root(Inode
*in
)
218 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
219 if (it
== free_faked_inos
.end() && last_used_faked_root
> 0) {
220 last_used_faked_root
= 0;
221 it
= free_faked_inos
.lower_bound(last_used_faked_root
+ 1);
223 assert(it
!= free_faked_inos
.end());
224 vinodeno_t inode_info
= in
->vino();
225 uint64_t inode_num
= (uint64_t)inode_info
.ino
;
226 ldout(cct
, 10) << "inode_num " << inode_num
<< "inode_num & 0x3ff=" << (inode_num
& 0x3ff)<< dendl
;
227 last_used_faked_root
= it
.get_start() + (inode_num
& 0x3ff); // 0x3ff mask and get_start will not exceed 2048
228 assert(it
.get_start() + it
.get_len() > last_used_faked_root
);
230 in
->faked_ino
= last_used_faked_root
;
231 free_faked_inos
.erase(in
->faked_ino
);
232 faked_ino_map
[in
->faked_ino
] = in
->vino();
235 void Client::_release_faked_ino(Inode
*in
)
237 free_faked_inos
.insert(in
->faked_ino
);
238 faked_ino_map
.erase(in
->faked_ino
);
241 vinodeno_t
Client::_map_faked_ino(ino_t ino
)
246 else if (faked_ino_map
.count(ino
))
247 vino
= faked_ino_map
[ino
];
249 vino
= vinodeno_t(0, CEPH_NOSNAP
);
250 ldout(cct
, 10) << __func__
<< " " << ino
<< " -> " << vino
<< dendl
;
254 vinodeno_t
Client::map_faked_ino(ino_t ino
)
256 std::lock_guard
lock(client_lock
);
257 return _map_faked_ino(ino
);
// Client constructor: wires the messenger/monclient/objecter into the
// dispatcher, finishers and ObjectCacher, and reads mount/cache config.
// NOTE(review): this chunk is extraction-garbled and several original
// source lines are missing; the surviving text is preserved verbatim.
262 Client::Client(Messenger
*m
, MonClient
*mc
, Objecter
*objecter_
)
263 : Dispatcher(m
->cct
),
264 timer(m
->cct
, client_lock
),
268 whoami(mc
->get_global_id()),
269 async_ino_invalidator(m
->cct
),
270 async_dentry_invalidator(m
->cct
),
271 interrupt_finisher(m
->cct
),
272 remount_finisher(m
->cct
),
273 async_ino_releasor(m
->cct
),
274 objecter_finisher(m
->cct
),
275 m_command_hook(this),
280 user_id
= cct
->_conf
->client_mount_uid
;
281 group_id
= cct
->_conf
->client_mount_gid
;
282 fuse_default_permissions
= cct
->_conf
.get_val
<bool>(
283 "fuse_default_permissions");
285 if (cct
->_conf
->client_acl_type
== "posix_acl")
286 acl_type
= POSIX_ACL
;
288 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
291 free_fd_set
.insert(10, 1<<30);
293 mdsmap
.reset(new MDSMap
);
296 writeback_handler
.reset(new ObjecterWriteback(objecter
, &objecter_finisher
,
298 objectcacher
.reset(new ObjectCacher(cct
, "libcephfs", *writeback_handler
, client_lock
,
299 client_flush_set_callback
, // all commit callback
301 cct
->_conf
->client_oc_size
,
302 cct
->_conf
->client_oc_max_objects
,
303 cct
->_conf
->client_oc_max_dirty
,
304 cct
->_conf
->client_oc_target_dirty
,
305 cct
->_conf
->client_oc_max_dirty_age
,
// Destructor body (the ~Client() signature line was lost in
// extraction).  Asserts client_lock is not already held, then takes it
// so inode teardown can safely call into the ObjectCacher.
312 ceph_assert(ceph_mutex_is_not_locked(client_lock
));
314 // It is necessary to hold client_lock, because any inode destruction
315 // may call into ObjectCacher, which asserts that it's lock (which is
316 // client_lock) is held.
317 std::lock_guard l
{client_lock
};
// Force-close all open file handles and directories, then verify the
// dentry LRU and inode map have fully drained (only root and its
// parents may remain before the final erase).
// NOTE(review): extraction-garbled chunk with missing lines; text
// preserved verbatim.
321 void Client::tear_down_cache()
324 for (ceph::unordered_map
<int, Fh
*>::iterator it
= fd_map
.begin();
328 ldout(cct
, 1) << __func__
<< " forcing close of fh " << it
->first
<< " ino " << fh
->inode
->ino
<< dendl
;
333 while (!opened_dirs
.empty()) {
334 dir_result_t
*dirp
= *opened_dirs
.begin();
335 ldout(cct
, 1) << __func__
<< " forcing close of dir " << dirp
<< " ino " << dirp
->inode
->ino
<< dendl
;
344 ceph_assert(lru
.lru_get_size() == 0);
347 ceph_assert(inode_map
.size() <= 1 + root_parents
.size());
348 if (root
&& inode_map
.size() == 1 + root_parents
.size()) {
352 while (!root_parents
.empty())
353 root_parents
.erase(root_parents
.begin());
358 ceph_assert(inode_map
.empty());
361 inodeno_t
Client::get_root_ino()
363 std::lock_guard
l(client_lock
);
364 if (use_faked_inos())
365 return root
->faked_ino
;
370 Inode
*Client::get_root()
372 std::lock_guard
l(client_lock
);
// Recursively dump one inode (and, for directories, its dentries and
// child inodes) into the formatter, tracking visited inodes in 'did'.
// NOTE(review): extraction-garbled chunk with missing lines; text
// preserved verbatim.
380 void Client::dump_inode(Formatter
*f
, Inode
*in
, set
<Inode
*>& did
, bool disconnected
)
383 in
->make_long_path(path
);
384 ldout(cct
, 1) << "dump_inode: "
385 << (disconnected
? "DISCONNECTED ":"")
386 << "inode " << in
->ino
388 << " ref " << in
->get_num_ref()
392 f
->open_object_section("inode");
393 f
->dump_stream("path") << path
;
395 f
->dump_int("disconnected", 1);
402 ldout(cct
, 1) << " dir " << in
->dir
<< " size " << in
->dir
->dentries
.size() << dendl
;
403 for (ceph::unordered_map
<string
, Dentry
*>::iterator it
= in
->dir
->dentries
.begin();
404 it
!= in
->dir
->dentries
.end();
406 ldout(cct
, 1) << " " << in
->ino
<< " dn " << it
->first
<< " " << it
->second
<< " ref " << it
->second
->ref
<< dendl
;
408 f
->open_object_section("dentry");
412 if (it
->second
->inode
)
413 dump_inode(f
, it
->second
->inode
.get(), did
, false);
// Dump the whole metadata cache: first the tree reachable from root,
// then a second pass over inode_map for anything disconnected.
// NOTE(review): extraction-garbled chunk with missing lines; text
// preserved verbatim.
418 void Client::dump_cache(Formatter
*f
)
422 ldout(cct
, 1) << __func__
<< dendl
;
425 f
->open_array_section("cache");
428 dump_inode(f
, root
, did
, true);
430 // make a second pass to catch anything disconnected
431 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
432 it
!= inode_map
.end();
434 if (did
.count(it
->second
))
436 dump_inode(f
, it
->second
, did
, true);
// Dump overall client status (metadata kv pairs, cache counters,
// identity/address, epochs, blacklist state) into the formatter.
// Caller must hold client_lock (asserted).
// NOTE(review): extraction-garbled chunk with missing lines; text
// preserved verbatim.
443 void Client::dump_status(Formatter
*f
)
445 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
447 ldout(cct
, 1) << __func__
<< dendl
;
449 const epoch_t osd_epoch
450 = objecter
->with_osdmap(std::mem_fn(&OSDMap::get_epoch
));
453 f
->open_object_section("metadata");
454 for (const auto& kv
: metadata
)
455 f
->dump_string(kv
.first
.c_str(), kv
.second
);
458 f
->dump_int("dentry_count", lru
.lru_get_size());
459 f
->dump_int("dentry_pinned_count", lru
.lru_get_num_pinned());
460 f
->dump_int("id", get_nodeid().v
);
461 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr_legacy());
462 f
->dump_object("inst", inst
);
463 f
->dump_object("addr", inst
.addr
);
464 f
->dump_stream("inst_str") << inst
.name
<< " " << inst
.addr
.get_legacy_str();
465 f
->dump_string("addr_str", inst
.addr
.get_legacy_str());
466 f
->dump_int("inode_count", inode_map
.size());
467 f
->dump_int("mds_epoch", mdsmap
->get_epoch());
468 f
->dump_int("osd_epoch", osd_epoch
);
469 f
->dump_int("osd_epoch_barrier", cap_epoch_barrier
);
470 f
->dump_bool("blacklisted", blacklisted
);
474 void Client::_pre_init()
478 objecter_finisher
.start();
479 filer
.reset(new Filer(objecter
, &objecter_finisher
));
480 objecter
->enable_blacklist_events();
482 objectcacher
->start();
// Client::init() body (the signature line was lost in extraction):
// under client_lock, assert not-yet-initialized and register this
// Client as a messenger dispatcher.
489 std::lock_guard l
{client_lock
};
490 ceph_assert(!initialized
);
491 messenger
->add_dispatcher_tail(this);
// Finish initialization: create and register perf counters, register
// as a config observer, and register the admin-socket commands
// (mds_requests, mds_sessions, dump_cache, kick_stale_sessions,
// status), logging any registration errors.
// NOTE(review): extraction-garbled chunk with missing lines; text
// preserved verbatim.
497 void Client::_finish_init()
500 std::lock_guard l
{client_lock
};
502 PerfCountersBuilder
plb(cct
, "client", l_c_first
, l_c_last
);
503 plb
.add_time_avg(l_c_reply
, "reply", "Latency of receiving a reply on metadata request");
504 plb
.add_time_avg(l_c_lat
, "lat", "Latency of processing a metadata request");
505 plb
.add_time_avg(l_c_wrlat
, "wrlat", "Latency of a file data write operation");
506 plb
.add_time_avg(l_c_read
, "rdlat", "Latency of a file data read operation");
507 plb
.add_time_avg(l_c_fsync
, "fsync", "Latency of a file sync operation");
508 logger
.reset(plb
.create_perf_counters());
509 cct
->get_perfcounters_collection()->add(logger
.get());
512 cct
->_conf
.add_observer(this);
514 AdminSocket
* admin_socket
= cct
->get_admin_socket();
515 int ret
= admin_socket
->register_command("mds_requests",
517 "show in-progress mds requests");
519 lderr(cct
) << "error registering admin socket command: "
520 << cpp_strerror(-ret
) << dendl
;
522 ret
= admin_socket
->register_command("mds_sessions",
524 "show mds session state");
526 lderr(cct
) << "error registering admin socket command: "
527 << cpp_strerror(-ret
) << dendl
;
529 ret
= admin_socket
->register_command("dump_cache",
531 "show in-memory metadata cache contents");
533 lderr(cct
) << "error registering admin socket command: "
534 << cpp_strerror(-ret
) << dendl
;
536 ret
= admin_socket
->register_command("kick_stale_sessions",
538 "kick sessions that were remote reset");
540 lderr(cct
) << "error registering admin socket command: "
541 << cpp_strerror(-ret
) << dendl
;
543 ret
= admin_socket
->register_command("status",
545 "show overall client status");
547 lderr(cct
) << "error registering admin socket command: "
548 << cpp_strerror(-ret
) << dendl
;
551 std::lock_guard l
{client_lock
};
// Orderly shutdown: deregister config observer and admin commands,
// drain and stop each optional callback finisher, stop the
// ObjectCacher outside client_lock (it joins), then stop the objecter
// finisher and remove perf counters.
// NOTE(review): extraction-garbled chunk with missing lines; text
// preserved verbatim.
555 void Client::shutdown()
557 ldout(cct
, 1) << __func__
<< dendl
;
559 // If we were not mounted, but were being used for sending
560 // MDS commands, we may have sessions that need closing.
562 std::lock_guard l
{client_lock
};
565 cct
->_conf
.remove_observer(this);
567 cct
->get_admin_socket()->unregister_commands(&m_command_hook
);
569 if (ino_invalidate_cb
) {
570 ldout(cct
, 10) << "shutdown stopping cache invalidator finisher" << dendl
;
571 async_ino_invalidator
.wait_for_empty();
572 async_ino_invalidator
.stop();
575 if (dentry_invalidate_cb
) {
576 ldout(cct
, 10) << "shutdown stopping dentry invalidator finisher" << dendl
;
577 async_dentry_invalidator
.wait_for_empty();
578 async_dentry_invalidator
.stop();
581 if (switch_interrupt_cb
) {
582 ldout(cct
, 10) << "shutdown stopping interrupt finisher" << dendl
;
583 interrupt_finisher
.wait_for_empty();
584 interrupt_finisher
.stop();
588 ldout(cct
, 10) << "shutdown stopping remount finisher" << dendl
;
589 remount_finisher
.wait_for_empty();
590 remount_finisher
.stop();
593 if (ino_release_cb
) {
594 ldout(cct
, 10) << "shutdown stopping inode release finisher" << dendl
;
595 async_ino_releasor
.wait_for_empty();
596 async_ino_releasor
.stop();
599 objectcacher
->stop(); // outside of client_lock! this does a join.
601 std::lock_guard l
{client_lock
};
602 ceph_assert(initialized
);
606 objecter_finisher
.wait_for_empty();
607 objecter_finisher
.stop();
610 cct
->get_perfcounters_collection()->remove(logger
.get());
616 // ===================
617 // metadata cache stuff
// Trim the dentry LRU down to client_cache_size (always trim fully
// while unmounting), optionally invalidating the kernel dcache, and
// drop root/root_parents once everything else is gone.
// NOTE(review): extraction-garbled chunk with missing lines; text
// preserved verbatim.
619 void Client::trim_cache(bool trim_kernel_dcache
)
621 uint64_t max
= cct
->_conf
->client_cache_size
;
622 ldout(cct
, 20) << "trim_cache size " << lru
.lru_get_size() << " max " << max
<< dendl
;
624 while (lru
.lru_get_size() != last
) {
625 last
= lru
.lru_get_size();
627 if (!unmounting
&& lru
.lru_get_size() <= max
) break;
630 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_get_next_expire());
637 if (trim_kernel_dcache
&& lru
.lru_get_size() > max
)
638 _invalidate_kernel_dcache();
641 if (lru
.lru_get_size() == 0 && root
&& root
->get_num_ref() == 0 && inode_map
.size() == 1 + root_parents
.size()) {
642 ldout(cct
, 15) << "trim_cache trimmed root " << root
<< dendl
;
646 while (!root_parents
.empty())
647 root_parents
.erase(root_parents
.begin());
// Before reconnecting to an MDS, expire every dentry from the LRU
// except those whose inode (or parent dir inode) holds caps from that
// MDS; skipped dentries are re-inserted at the LRU midpoint.
// NOTE(review): extraction-garbled chunk with missing lines; text
// preserved verbatim.
653 void Client::trim_cache_for_reconnect(MetaSession
*s
)
655 mds_rank_t mds
= s
->mds_num
;
656 ldout(cct
, 20) << __func__
<< " mds." << mds
<< dendl
;
659 list
<Dentry
*> skipped
;
660 while (lru
.lru_get_size() > 0) {
661 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_expire());
665 if ((dn
->inode
&& dn
->inode
->caps
.count(mds
)) ||
666 dn
->dir
->parent_inode
->caps
.count(mds
)) {
670 skipped
.push_back(dn
);
673 for(list
<Dentry
*>::iterator p
= skipped
.begin(); p
!= skipped
.end(); ++p
)
674 lru
.lru_insert_mid(*p
);
676 ldout(cct
, 20) << __func__
<< " mds." << mds
677 << " trimmed " << trimmed
<< " dentries" << dendl
;
679 if (s
->caps
.size() > 0)
680 _invalidate_kernel_dcache();
// Unlink a single dentry being trimmed from the LRU, bumping the
// parent directory's release count and clearing its
// I_COMPLETE/I_DIR_ORDERED state first.
// NOTE(review): extraction-garbled chunk with missing lines; text
// preserved verbatim.
683 void Client::trim_dentry(Dentry
*dn
)
685 ldout(cct
, 15) << "trim_dentry unlinking dn " << dn
->name
687 << std::hex
<< dn
->dir
->parent_inode
->ino
<< std::dec
690 Inode
*diri
= dn
->dir
->parent_inode
;
691 diri
->dir_release_count
++;
692 clear_dir_complete_and_ordered(diri
, true);
694 unlink(dn
, false, false); // drop dir, drop dentry
// Apply size/truncation state from the MDS to a cached inode: adopt a
// newer truncate_seq (or larger size at equal seq), invalidate cached
// data beyond the truncation point, shrink inline data, and propagate
// truncate_size into the inode and its ObjectCacher oset.
// NOTE(review): extraction-garbled chunk with missing lines; text
// preserved verbatim.
698 void Client::update_inode_file_size(Inode
*in
, int issued
, uint64_t size
,
699 uint64_t truncate_seq
, uint64_t truncate_size
)
701 uint64_t prior_size
= in
->size
;
703 if (truncate_seq
> in
->truncate_seq
||
704 (truncate_seq
== in
->truncate_seq
&& size
> in
->size
)) {
705 ldout(cct
, 10) << "size " << in
->size
<< " -> " << size
<< dendl
;
707 in
->reported_size
= size
;
708 if (truncate_seq
!= in
->truncate_seq
) {
709 ldout(cct
, 10) << "truncate_seq " << in
->truncate_seq
<< " -> "
710 << truncate_seq
<< dendl
;
711 in
->truncate_seq
= truncate_seq
;
712 in
->oset
.truncate_seq
= truncate_seq
;
714 // truncate cached file data
715 if (prior_size
> size
) {
716 _invalidate_inode_cache(in
, truncate_size
, prior_size
- truncate_size
);
720 // truncate inline data
721 if (in
->inline_version
< CEPH_INLINE_NONE
) {
722 uint32_t len
= in
->inline_data
.length();
724 in
->inline_data
.splice(size
, len
- size
);
727 if (truncate_seq
>= in
->truncate_seq
&&
728 in
->truncate_size
!= truncate_size
) {
730 ldout(cct
, 10) << "truncate_size " << in
->truncate_size
<< " -> "
731 << truncate_size
<< dendl
;
732 in
->truncate_size
= truncate_size
;
733 in
->oset
.truncate_size
= truncate_size
;
735 ldout(cct
, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl
;
// Merge ctime/mtime/atime from the MDS into a cached inode, arbitrated
// by time_warp_seq: with exclusive-ish caps held, only move times
// forward (or adopt MDS values on a newer warp seq); without such
// caps, take the MDS values wholesale, warning if the local warp seq
// is ahead.
// NOTE(review): extraction-garbled chunk with missing lines; text
// preserved verbatim.
740 void Client::update_inode_file_time(Inode
*in
, int issued
, uint64_t time_warp_seq
,
741 utime_t ctime
, utime_t mtime
, utime_t atime
)
743 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << ccap_string(issued
)
744 << " ctime " << ctime
<< " mtime " << mtime
<< dendl
;
746 if (time_warp_seq
> in
->time_warp_seq
)
747 ldout(cct
, 10) << " mds time_warp_seq " << time_warp_seq
748 << " is higher than local time_warp_seq "
749 << in
->time_warp_seq
<< dendl
;
752 // be careful with size, mtime, atime
753 if (issued
& (CEPH_CAP_FILE_EXCL
|
755 CEPH_CAP_FILE_BUFFER
|
757 CEPH_CAP_XATTR_EXCL
)) {
758 ldout(cct
, 30) << "Yay have enough caps to look at our times" << dendl
;
759 if (ctime
> in
->ctime
)
761 if (time_warp_seq
> in
->time_warp_seq
) {
762 //the mds updated times, so take those!
765 in
->time_warp_seq
= time_warp_seq
;
766 } else if (time_warp_seq
== in
->time_warp_seq
) {
768 if (mtime
> in
->mtime
)
770 if (atime
> in
->atime
)
772 } else if (issued
& CEPH_CAP_FILE_EXCL
) {
773 //ignore mds values as we have a higher seq
776 ldout(cct
, 30) << "Don't have enough caps, just taking mds' time values" << dendl
;
777 if (time_warp_seq
>= in
->time_warp_seq
) {
781 in
->time_warp_seq
= time_warp_seq
;
785 ldout(cct
, 0) << "WARNING: " << *in
<< " mds time_warp_seq "
786 << time_warp_seq
<< " is lower than local time_warp_seq "
792 void Client::_fragmap_remove_non_leaves(Inode
*in
)
794 for (map
<frag_t
,int>::iterator p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
795 if (!in
->dirfragtree
.is_leaf(p
->first
))
796 in
->fragmap
.erase(p
++);
801 void Client::_fragmap_remove_stopped_mds(Inode
*in
, mds_rank_t mds
)
803 for (auto p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
804 if (p
->second
== mds
)
805 in
->fragmap
.erase(p
++);
// Insert or refresh a cached inode from an MDS InodeStat: create the
// Inode (assigning faked inos/roots as configured), then, gated by
// which caps are issued vs newly granted, merge auth/link/file
// metadata, xattrs, inline data and version; finally add/update the
// cap (or snap_caps for snapshots) and mark empty dirs
// I_COMPLETE|I_DIR_ORDERED.
// NOTE(review): extraction-garbled chunk with many missing lines; text
// preserved verbatim.
810 Inode
* Client::add_update_inode(InodeStat
*st
, utime_t from
,
811 MetaSession
*session
,
812 const UserPerm
& request_perms
)
815 bool was_new
= false;
816 if (inode_map
.count(st
->vino
)) {
817 in
= inode_map
[st
->vino
];
818 ldout(cct
, 12) << __func__
<< " had " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
820 in
= new Inode(this, st
->vino
, &st
->layout
);
821 inode_map
[st
->vino
] = in
;
823 if (use_faked_inos())
824 _assign_faked_ino(in
);
828 if (use_faked_inos())
829 _assign_faked_root(root
);
832 } else if (!mounted
) {
833 root_parents
[root_ancestor
] = in
;
838 in
->ino
= st
->vino
.ino
;
839 in
->snapid
= st
->vino
.snapid
;
840 in
->mode
= st
->mode
& S_IFMT
;
845 if (in
->is_symlink())
846 in
->symlink
= st
->symlink
;
848 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
849 bool new_version
= false;
850 if (in
->version
== 0 ||
851 ((st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
) &&
852 (in
->version
& ~1) < st
->version
))
856 in
->caps_issued(&issued
);
857 issued
|= in
->caps_dirty();
858 int new_issued
= ~issued
& (int)st
->cap
.caps
;
860 if ((new_version
|| (new_issued
& CEPH_CAP_AUTH_SHARED
)) &&
861 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
865 in
->btime
= st
->btime
;
866 in
->snap_btime
= st
->snap_btime
;
869 if ((new_version
|| (new_issued
& CEPH_CAP_LINK_SHARED
)) &&
870 !(issued
& CEPH_CAP_LINK_EXCL
)) {
871 in
->nlink
= st
->nlink
;
874 if (new_version
|| (new_issued
& CEPH_CAP_ANY_RD
)) {
875 update_inode_file_time(in
, issued
, st
->time_warp_seq
,
876 st
->ctime
, st
->mtime
, st
->atime
);
880 (new_issued
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
))) {
881 in
->layout
= st
->layout
;
882 update_inode_file_size(in
, issued
, st
->size
, st
->truncate_seq
, st
->truncate_size
);
886 if (new_version
|| (new_issued
& CEPH_CAP_FILE_SHARED
)) {
887 in
->dirstat
= st
->dirstat
;
889 // dir_layout/rstat/quota are not tracked by capability, update them only if
890 // the inode stat is from auth mds
891 if (new_version
|| (st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
)) {
892 in
->dir_layout
= st
->dir_layout
;
893 ldout(cct
, 20) << " dir hash is " << (int)in
->dir_layout
.dl_dir_hash
<< dendl
;
894 in
->rstat
= st
->rstat
;
895 in
->quota
= st
->quota
;
896 in
->dir_pin
= st
->dir_pin
;
898 // move me if/when version reflects fragtree changes.
899 if (in
->dirfragtree
!= st
->dirfragtree
) {
900 in
->dirfragtree
= st
->dirfragtree
;
901 _fragmap_remove_non_leaves(in
);
905 if ((in
->xattr_version
== 0 || !(issued
& CEPH_CAP_XATTR_EXCL
)) &&
906 st
->xattrbl
.length() &&
907 st
->xattr_version
> in
->xattr_version
) {
908 auto p
= st
->xattrbl
.cbegin();
909 decode(in
->xattrs
, p
);
910 in
->xattr_version
= st
->xattr_version
;
913 if (st
->inline_version
> in
->inline_version
) {
914 in
->inline_data
= st
->inline_data
;
915 in
->inline_version
= st
->inline_version
;
918 /* always take a newer change attr */
919 if (st
->change_attr
> in
->change_attr
)
920 in
->change_attr
= st
->change_attr
;
922 if (st
->version
> in
->version
)
923 in
->version
= st
->version
;
926 ldout(cct
, 12) << __func__
<< " adding " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
929 return in
; // as with readdir returning indoes in different snaprealms (no caps!)
931 if (in
->snapid
== CEPH_NOSNAP
) {
932 add_update_cap(in
, session
, st
->cap
.cap_id
, st
->cap
.caps
, st
->cap
.wanted
,
933 st
->cap
.seq
, st
->cap
.mseq
, inodeno_t(st
->cap
.realm
),
934 st
->cap
.flags
, request_perms
);
935 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
936 in
->max_size
= st
->max_size
;
937 in
->rstat
= st
->rstat
;
940 // setting I_COMPLETE needs to happen after adding the cap
942 (st
->cap
.caps
& CEPH_CAP_FILE_SHARED
) &&
943 (issued
& CEPH_CAP_FILE_EXCL
) == 0 &&
944 in
->dirstat
.nfiles
== 0 &&
945 in
->dirstat
.nsubdirs
== 0) {
946 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in
<< dendl
;
947 in
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
949 ldout(cct
, 10) << " dir is open on empty dir " << in
->ino
<< " with "
950 << in
->dir
->dentries
.size() << " entries, marking all dentries null" << dendl
;
951 in
->dir
->readdir_cache
.clear();
952 for (const auto& p
: in
->dir
->dentries
) {
953 unlink(p
.second
, true, true); // keep dir, keep dentry
955 if (in
->dir
->dentries
.empty())
960 in
->snap_caps
|= st
->cap
.caps
;
// NOTE(review): extraction-garbled chunk with missing lines; text
// preserved verbatim.  Reuses an existing dentry when its vino
// matches, unlinks it on mismatch, relinks (clearing the complete/
// ordered state of the affected parent dirs) and refreshes the lease.
968 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
970 Dentry
*Client::insert_dentry_inode(Dir
*dir
, const string
& dname
, LeaseStat
*dlease
,
971 Inode
*in
, utime_t from
, MetaSession
*session
,
975 if (dir
->dentries
.count(dname
))
976 dn
= dir
->dentries
[dname
];
978 ldout(cct
, 12) << __func__
<< " '" << dname
<< "' vino " << in
->vino()
979 << " in dir " << dir
->parent_inode
->vino() << " dn " << dn
982 if (dn
&& dn
->inode
) {
983 if (dn
->inode
->vino() == in
->vino()) {
985 ldout(cct
, 12) << " had dentry " << dname
986 << " with correct vino " << dn
->inode
->vino()
989 ldout(cct
, 12) << " had dentry " << dname
990 << " with WRONG vino " << dn
->inode
->vino()
992 unlink(dn
, true, true); // keep dir, keep dentry
996 if (!dn
|| !dn
->inode
) {
997 InodeRef
tmp_ref(in
);
999 if (old_dentry
->dir
!= dir
) {
1000 Inode
*old_diri
= old_dentry
->dir
->parent_inode
;
1001 old_diri
->dir_ordered_count
++;
1002 clear_dir_complete_and_ordered(old_diri
, false);
1004 unlink(old_dentry
, dir
== old_dentry
->dir
, false); // drop dentry, keep dir open if its the same dir
1006 Inode
*diri
= dir
->parent_inode
;
1007 diri
->dir_ordered_count
++;
1008 clear_dir_complete_and_ordered(diri
, false);
1009 dn
= link(dir
, dname
, in
, dn
);
1012 update_dentry_lease(dn
, dlease
, from
, session
);
// Refresh a dentry's MDS lease (ttl/mds/seq/gen) when the reply grants
// a valid lease with a later expiry, and stamp the parent dir's
// shared_gen onto the dentry.
// NOTE(review): extraction-garbled chunk with missing lines; text
// preserved verbatim.
1016 void Client::update_dentry_lease(Dentry
*dn
, LeaseStat
*dlease
, utime_t from
, MetaSession
*session
)
1018 utime_t dttl
= from
;
1019 dttl
+= (float)dlease
->duration_ms
/ 1000.0;
1023 if (dlease
->mask
& CEPH_LEASE_VALID
) {
1024 if (dttl
> dn
->lease_ttl
) {
1025 ldout(cct
, 10) << "got dentry lease on " << dn
->name
1026 << " dur " << dlease
->duration_ms
<< "ms ttl " << dttl
<< dendl
;
1027 dn
->lease_ttl
= dttl
;
1028 dn
->lease_mds
= session
->mds_num
;
1029 dn
->lease_seq
= dlease
->seq
;
1030 dn
->lease_gen
= session
->cap_gen
;
1033 dn
->cap_shared_gen
= dn
->dir
->parent_inode
->shared_gen
;
// NOTE(review): extraction-garbled chunk with missing lines; text
// preserved verbatim.  Records which MDS is authoritative for a
// dirfrag (erasing the entry when auth < 0), forces the frag to a
// dirfragtree leaf if needed, and notes replication state.
1038 * update MDS location cache for a single inode
1040 void Client::update_dir_dist(Inode
*in
, DirStat
*dst
)
1043 ldout(cct
, 20) << "got dirfrag map for " << in
->ino
<< " frag " << dst
->frag
<< " to mds " << dst
->auth
<< dendl
;
1044 if (dst
->auth
>= 0) {
1045 in
->fragmap
[dst
->frag
] = dst
->auth
;
1047 in
->fragmap
.erase(dst
->frag
);
1049 if (!in
->dirfragtree
.is_leaf(dst
->frag
)) {
1050 in
->dirfragtree
.force_to_leaf(cct
, dst
->frag
);
1051 _fragmap_remove_non_leaves(in
);
1055 in
->dir_replicated
= !dst
->dist
.empty(); // FIXME that's just one frag!
// Clear a directory inode's I_COMPLETE (and/or I_DIR_ORDERED) flags
// and flush its readdir cache, logging which state was dropped.
// NOTE(review): extraction-garbled chunk with missing lines; text
// preserved verbatim.
1058 void Client::clear_dir_complete_and_ordered(Inode
*diri
, bool complete
)
1060 if (diri
->flags
& I_COMPLETE
) {
1062 ldout(cct
, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
1063 diri
->flags
&= ~(I_COMPLETE
| I_DIR_ORDERED
);
1065 if (diri
->flags
& I_DIR_ORDERED
) {
1066 ldout(cct
, 10) << " clearing I_DIR_ORDERED on " << *diri
<< dendl
;
1067 diri
->flags
&= ~I_DIR_ORDERED
;
1071 diri
->dir
->readdir_cache
.clear();
// NOTE(review): extraction-garbled chunk with many missing lines; text
// preserved verbatim.  Decodes the readdir/lssnap extra buffer: sets
// up frag/offset/hash-order bookkeeping for the dir_result_t, then for
// each returned dentry updates the inode, (re)links the dentry,
// refreshes its lease, assigns its fpos offset, and maintains the
// per-directory readdir cache and the dirp result buffer.
1076 * insert results from readdir or lssnap into the metadata cache.
1078 void Client::insert_readdir_results(MetaRequest
*request
, MetaSession
*session
, Inode
*diri
) {
1080 auto& reply
= request
->reply
;
1081 ConnectionRef con
= request
->reply
->get_connection();
1083 if(session
->mds_features
.test(CEPHFS_FEATURE_REPLY_ENCODING
)) {
1084 features
= (uint64_t)-1;
1087 features
= con
->get_features();
1090 dir_result_t
*dirp
= request
->dirp
;
1093 // the extra buffer list is only set for readdir and lssnap replies
1094 auto p
= reply
->get_extra_bl().cbegin();
1097 if (request
->head
.op
== CEPH_MDS_OP_LSSNAP
) {
1099 diri
= open_snapdir(diri
);
1102 // only open dir if we're actually adding stuff to it!
1103 Dir
*dir
= diri
->open_dir();
1107 DirStat
dst(p
, features
);
1113 bool end
= ((unsigned)flags
& CEPH_READDIR_FRAG_END
);
1114 bool hash_order
= ((unsigned)flags
& CEPH_READDIR_HASH_ORDER
);
1116 frag_t fg
= (unsigned)request
->head
.args
.readdir
.frag
;
1117 unsigned readdir_offset
= dirp
->next_offset
;
1118 string readdir_start
= dirp
->last_name
;
1119 ceph_assert(!readdir_start
.empty() || readdir_offset
== 2);
1121 unsigned last_hash
= 0;
1123 if (!readdir_start
.empty()) {
1124 last_hash
= ceph_frag_value(diri
->hash_dentry_name(readdir_start
));
1125 } else if (flags
& CEPH_READDIR_OFFSET_HASH
) {
1126 /* mds understands offset_hash */
1127 last_hash
= (unsigned)request
->head
.args
.readdir
.offset_hash
;
1131 if (fg
!= dst
.frag
) {
1132 ldout(cct
, 10) << "insert_trace got new frag " << fg
<< " -> " << dst
.frag
<< dendl
;
1136 readdir_start
.clear();
1137 dirp
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
, false);
1141 ldout(cct
, 10) << __func__
<< " " << numdn
<< " readdir items, end=" << end
1142 << ", hash_order=" << hash_order
1143 << ", readdir_start " << readdir_start
1144 << ", last_hash " << last_hash
1145 << ", next_offset " << readdir_offset
<< dendl
;
1147 if (diri
->snapid
!= CEPH_SNAPDIR
&&
1148 fg
.is_leftmost() && readdir_offset
== 2 &&
1149 !(hash_order
&& last_hash
)) {
1150 dirp
->release_count
= diri
->dir_release_count
;
1151 dirp
->ordered_count
= diri
->dir_ordered_count
;
1152 dirp
->start_shared_gen
= diri
->shared_gen
;
1153 dirp
->cache_index
= 0;
1156 dirp
->buffer_frag
= fg
;
1158 _readdir_drop_dirp_buffer(dirp
);
1159 dirp
->buffer
.reserve(numdn
);
1163 for (unsigned i
=0; i
<numdn
; i
++) {
1165 dlease
.decode(p
, features
);
1166 InodeStat
ist(p
, features
);
1168 ldout(cct
, 15) << "" << i
<< ": '" << dname
<< "'" << dendl
;
1170 Inode
*in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1173 if (diri
->dir
->dentries
.count(dname
)) {
1174 Dentry
*olddn
= diri
->dir
->dentries
[dname
];
1175 if (olddn
->inode
!= in
) {
1176 // replace incorrect dentry
1177 unlink(olddn
, true, true); // keep dir, dentry
1178 dn
= link(dir
, dname
, in
, olddn
);
1179 ceph_assert(dn
== olddn
);
1187 dn
= link(dir
, dname
, in
, NULL
);
1190 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
1192 unsigned hash
= ceph_frag_value(diri
->hash_dentry_name(dname
));
1193 if (hash
!= last_hash
)
1196 dn
->offset
= dir_result_t::make_fpos(hash
, readdir_offset
++, true);
1198 dn
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
++, false);
1200 // add to readdir cache
1201 if (dirp
->release_count
== diri
->dir_release_count
&&
1202 dirp
->ordered_count
== diri
->dir_ordered_count
&&
1203 dirp
->start_shared_gen
== diri
->shared_gen
) {
1204 if (dirp
->cache_index
== dir
->readdir_cache
.size()) {
1206 ceph_assert(!dirp
->inode
->is_complete_and_ordered());
1207 dir
->readdir_cache
.reserve(dirp
->cache_index
+ numdn
);
1209 dir
->readdir_cache
.push_back(dn
);
1210 } else if (dirp
->cache_index
< dir
->readdir_cache
.size()) {
1211 if (dirp
->inode
->is_complete_and_ordered())
1212 ceph_assert(dir
->readdir_cache
[dirp
->cache_index
] == dn
);
1214 dir
->readdir_cache
[dirp
->cache_index
] = dn
;
1216 ceph_abort_msg("unexpected readdir buffer idx");
1218 dirp
->cache_index
++;
1220 // add to cached result list
1221 dirp
->buffer
.push_back(dir_result_t::dentry(dn
->offset
, dname
, in
));
1222 ldout(cct
, 15) << __func__
<< " " << hex
<< dn
->offset
<< dec
<< ": '" << dname
<< "' -> " << in
->ino
<< dendl
;
1226 dirp
->last_name
= dname
;
1228 dirp
->next_offset
= 2;
1230 dirp
->next_offset
= readdir_offset
;
1232 if (dir
->is_empty())
1239 * insert a trace from a MDS reply into the cache.
1241 Inode
* Client::insert_trace(MetaRequest
*request
, MetaSession
*session
)
1243 auto& reply
= request
->reply
;
1244 int op
= request
->get_op();
1246 ldout(cct
, 10) << "insert_trace from " << request
->sent_stamp
<< " mds." << session
->mds_num
1247 << " is_target=" << (int)reply
->head
.is_target
1248 << " is_dentry=" << (int)reply
->head
.is_dentry
1251 auto p
= reply
->get_trace_bl().cbegin();
1252 if (request
->got_unsafe
) {
1253 ldout(cct
, 10) << "insert_trace -- already got unsafe; ignoring" << dendl
;
1254 ceph_assert(p
.end());
1259 ldout(cct
, 10) << "insert_trace -- no trace" << dendl
;
1261 Dentry
*d
= request
->dentry();
1263 Inode
*diri
= d
->dir
->parent_inode
;
1264 diri
->dir_release_count
++;
1265 clear_dir_complete_and_ordered(diri
, true);
1268 if (d
&& reply
->get_result() == 0) {
1269 if (op
== CEPH_MDS_OP_RENAME
) {
1271 Dentry
*od
= request
->old_dentry();
1272 ldout(cct
, 10) << " unlinking rename src dn " << od
<< " for traceless reply" << dendl
;
1274 unlink(od
, true, true); // keep dir, dentry
1275 } else if (op
== CEPH_MDS_OP_RMDIR
||
1276 op
== CEPH_MDS_OP_UNLINK
) {
1278 ldout(cct
, 10) << " unlinking unlink/rmdir dn " << d
<< " for traceless reply" << dendl
;
1279 unlink(d
, true, true); // keep dir, dentry
1285 ConnectionRef con
= request
->reply
->get_connection();
1287 if (session
->mds_features
.test(CEPHFS_FEATURE_REPLY_ENCODING
)) {
1288 features
= (uint64_t)-1;
1291 features
= con
->get_features();
1293 ldout(cct
, 10) << " features 0x" << hex
<< features
<< dec
<< dendl
;
1296 SnapRealm
*realm
= NULL
;
1297 if (reply
->snapbl
.length())
1298 update_snap_trace(reply
->snapbl
, &realm
);
1300 ldout(cct
, 10) << " hrm "
1301 << " is_target=" << (int)reply
->head
.is_target
1302 << " is_dentry=" << (int)reply
->head
.is_dentry
1311 if (reply
->head
.is_dentry
) {
1312 dirst
.decode(p
, features
);
1313 dst
.decode(p
, features
);
1315 dlease
.decode(p
, features
);
1319 if (reply
->head
.is_target
) {
1320 ist
.decode(p
, features
);
1321 if (cct
->_conf
->client_debug_getattr_caps
) {
1322 unsigned wanted
= 0;
1323 if (op
== CEPH_MDS_OP_GETATTR
|| op
== CEPH_MDS_OP_LOOKUP
)
1324 wanted
= request
->head
.args
.getattr
.mask
;
1325 else if (op
== CEPH_MDS_OP_OPEN
|| op
== CEPH_MDS_OP_CREATE
)
1326 wanted
= request
->head
.args
.open
.mask
;
1328 if ((wanted
& CEPH_CAP_XATTR_SHARED
) &&
1329 !(ist
.xattr_version
> 0 && ist
.xattrbl
.length() > 0))
1330 ceph_abort_msg("MDS reply does not contain xattrs");
1333 in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1338 if (reply
->head
.is_dentry
) {
1339 diri
= add_update_inode(&dirst
, request
->sent_stamp
, session
,
1341 update_dir_dist(diri
, &dst
); // dir stat info is attached to ..
1344 Dir
*dir
= diri
->open_dir();
1345 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
,
1346 (op
== CEPH_MDS_OP_RENAME
) ? request
->old_dentry() : NULL
);
1349 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1350 dn
= diri
->dir
->dentries
[dname
];
1352 diri
->dir_ordered_count
++;
1353 clear_dir_complete_and_ordered(diri
, false);
1354 unlink(dn
, true, true); // keep dir, dentry
1357 if (dlease
.duration_ms
> 0) {
1359 Dir
*dir
= diri
->open_dir();
1360 dn
= link(dir
, dname
, NULL
, NULL
);
1362 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
1365 } else if (op
== CEPH_MDS_OP_LOOKUPSNAP
||
1366 op
== CEPH_MDS_OP_MKSNAP
) {
1367 ldout(cct
, 10) << " faking snap lookup weirdness" << dendl
;
1368 // fake it for snap lookup
1369 vinodeno_t vino
= ist
.vino
;
1370 vino
.snapid
= CEPH_SNAPDIR
;
1371 ceph_assert(inode_map
.count(vino
));
1372 diri
= inode_map
[vino
];
1374 string dname
= request
->path
.last_dentry();
1377 dlease
.duration_ms
= 0;
1380 Dir
*dir
= diri
->open_dir();
1381 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
);
1383 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1384 Dentry
*dn
= diri
->dir
->dentries
[dname
];
1386 unlink(dn
, true, true); // keep dir, dentry
1392 if (op
== CEPH_MDS_OP_READDIR
||
1393 op
== CEPH_MDS_OP_LSSNAP
) {
1394 insert_readdir_results(request
, session
, in
);
1395 } else if (op
== CEPH_MDS_OP_LOOKUPNAME
) {
1396 // hack: return parent inode instead
1400 if (request
->dentry() == NULL
&& in
!= request
->inode()) {
1401 // pin the target inode if its parent dentry is not pinned
1402 request
->set_other_inode(in
);
1407 put_snap_realm(realm
);
1409 request
->target
= in
;
1415 mds_rank_t
Client::choose_target_mds(MetaRequest
*req
, Inode
** phash_diri
)
1417 mds_rank_t mds
= MDS_RANK_NONE
;
1419 bool is_hash
= false;
1424 if (req
->resend_mds
>= 0) {
1425 mds
= req
->resend_mds
;
1426 req
->resend_mds
= -1;
1427 ldout(cct
, 10) << __func__
<< " resend_mds specified as mds." << mds
<< dendl
;
1431 if (cct
->_conf
->client_use_random_mds
)
1437 ldout(cct
, 20) << __func__
<< " starting with req->inode " << *in
<< dendl
;
1438 if (req
->path
.depth()) {
1439 hash
= in
->hash_dentry_name(req
->path
[0]);
1440 ldout(cct
, 20) << __func__
<< " inode dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1441 << " on " << req
->path
[0]
1442 << " => " << hash
<< dendl
;
1447 in
= de
->inode
.get();
1448 ldout(cct
, 20) << __func__
<< " starting with req->dentry inode " << *in
<< dendl
;
1450 in
= de
->dir
->parent_inode
;
1451 hash
= in
->hash_dentry_name(de
->name
);
1452 ldout(cct
, 20) << __func__
<< " dentry dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1453 << " on " << de
->name
1454 << " => " << hash
<< dendl
;
1459 if (in
->snapid
!= CEPH_NOSNAP
) {
1460 ldout(cct
, 10) << __func__
<< " " << *in
<< " is snapped, using nonsnap parent" << dendl
;
1461 while (in
->snapid
!= CEPH_NOSNAP
) {
1462 if (in
->snapid
== CEPH_SNAPDIR
)
1463 in
= in
->snapdir_parent
.get();
1464 else if (!in
->dentries
.empty())
1465 /* In most cases there will only be one dentry, so getting it
1466 * will be the correct action. If there are multiple hard links,
1467 * I think the MDS should be able to redirect as needed*/
1468 in
= in
->get_first_parent()->dir
->parent_inode
;
1470 ldout(cct
, 10) << "got unlinked inode, can't look at parent" << dendl
;
1477 ldout(cct
, 20) << __func__
<< " " << *in
<< " is_hash=" << is_hash
1478 << " hash=" << hash
<< dendl
;
1480 if (is_hash
&& S_ISDIR(in
->mode
) && !in
->fragmap
.empty()) {
1481 frag_t fg
= in
->dirfragtree
[hash
];
1482 if (in
->fragmap
.count(fg
)) {
1483 mds
= in
->fragmap
[fg
];
1486 } else if (in
->auth_cap
) {
1487 mds
= in
->auth_cap
->session
->mds_num
;
1490 ldout(cct
, 10) << __func__
<< " from dirfragtree hash" << dendl
;
1495 if (in
->auth_cap
&& req
->auth_is_best()) {
1496 mds
= in
->auth_cap
->session
->mds_num
;
1497 } else if (!in
->caps
.empty()) {
1498 mds
= in
->caps
.begin()->second
.session
->mds_num
;
1502 ldout(cct
, 10) << __func__
<< " from caps on inode " << *in
<< dendl
;
1509 mds
= _get_random_up_mds();
1510 ldout(cct
, 10) << "did not get mds through better means, so chose random mds " << mds
<< dendl
;
1514 ldout(cct
, 20) << "mds is " << mds
<< dendl
;
1519 void Client::connect_mds_targets(mds_rank_t mds
)
1521 ldout(cct
, 10) << __func__
<< " for mds." << mds
<< dendl
;
1522 ceph_assert(mds_sessions
.count(mds
));
1523 const MDSMap::mds_info_t
& info
= mdsmap
->get_mds_info(mds
);
1524 for (set
<mds_rank_t
>::const_iterator q
= info
.export_targets
.begin();
1525 q
!= info
.export_targets
.end();
1527 if (mds_sessions
.count(*q
) == 0 &&
1528 mdsmap
->is_clientreplay_or_active_or_stopping(*q
)) {
1529 ldout(cct
, 10) << "check_mds_sessions opening mds." << mds
1530 << " export target mds." << *q
<< dendl
;
1531 _open_mds_session(*q
);
1536 void Client::dump_mds_sessions(Formatter
*f
)
1538 f
->dump_int("id", get_nodeid().v
);
1539 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr_legacy());
1540 f
->dump_object("inst", inst
);
1541 f
->dump_stream("inst_str") << inst
;
1542 f
->dump_stream("addr_str") << inst
.addr
;
1543 f
->open_array_section("sessions");
1544 for (const auto &p
: mds_sessions
) {
1545 f
->open_object_section("session");
1550 f
->dump_int("mdsmap_epoch", mdsmap
->get_epoch());
1552 void Client::dump_mds_requests(Formatter
*f
)
1554 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
1555 p
!= mds_requests
.end();
1557 f
->open_object_section("request");
1563 int Client::verify_reply_trace(int r
, MetaSession
*session
,
1564 MetaRequest
*request
, const MConstRef
<MClientReply
>& reply
,
1565 InodeRef
*ptarget
, bool *pcreated
,
1566 const UserPerm
& perms
)
1568 // check whether this request actually did the create, and set created flag
1569 bufferlist extra_bl
;
1570 inodeno_t created_ino
;
1571 bool got_created_ino
= false;
1572 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
;
1574 extra_bl
= reply
->get_extra_bl();
1575 if (extra_bl
.length() >= 8) {
1576 if (session
->mds_features
.test(CEPHFS_FEATURE_DELEG_INO
)) {
1577 struct openc_response_t ocres
;
1579 decode(ocres
, extra_bl
);
1580 created_ino
= ocres
.created_ino
;
1582 * The userland cephfs client doesn't have a way to do an async create
1583 * (yet), so just discard delegated_inos for now. Eventually we should
1584 * store them and use them in create calls, even if they are synchronous,
1585 * if only for testing purposes.
1587 ldout(cct
, 10) << "delegated_inos: " << ocres
.delegated_inos
<< dendl
;
1589 // u64 containing number of created ino
1590 decode(created_ino
, extra_bl
);
1592 ldout(cct
, 10) << "make_request created ino " << created_ino
<< dendl
;
1593 got_created_ino
= true;
1597 *pcreated
= got_created_ino
;
1599 if (request
->target
) {
1600 *ptarget
= request
->target
;
1601 ldout(cct
, 20) << "make_request target is " << *ptarget
->get() << dendl
;
1603 if (got_created_ino
&& (p
= inode_map
.find(vinodeno_t(created_ino
, CEPH_NOSNAP
))) != inode_map
.end()) {
1604 (*ptarget
) = p
->second
;
1605 ldout(cct
, 20) << "make_request created, target is " << *ptarget
->get() << dendl
;
1607 // we got a traceless reply, and need to look up what we just
1608 // created. for now, do this by name. someday, do this by the
1609 // ino... which we know! FIXME.
1611 Dentry
*d
= request
->dentry();
1614 ldout(cct
, 10) << "make_request got traceless reply, looking up #"
1615 << d
->dir
->parent_inode
->ino
<< "/" << d
->name
1616 << " got_ino " << got_created_ino
1617 << " ino " << created_ino
1619 r
= _do_lookup(d
->dir
->parent_inode
, d
->name
, request
->regetattr_mask
,
1622 // if the dentry is not linked, just do our best. see #5021.
1623 ceph_abort_msg("how did this happen? i want logs!");
1626 Inode
*in
= request
->inode();
1627 ldout(cct
, 10) << "make_request got traceless reply, forcing getattr on #"
1628 << in
->ino
<< dendl
;
1629 r
= _getattr(in
, request
->regetattr_mask
, perms
, true);
1633 // verify ino returned in reply and trace_dist are the same
1634 if (got_created_ino
&&
1635 created_ino
.val
!= target
->ino
.val
) {
1636 ldout(cct
, 5) << "create got ino " << created_ino
<< " but then failed on lookup; EINTR?" << dendl
;
1640 ptarget
->swap(target
);
1652 * Blocking helper to make an MDS request.
1654 * If the ptarget flag is set, behavior changes slightly: the caller
1655 * expects to get a pointer to the inode we are creating or operating
1656 * on. As a result, we will follow up any traceless mutation reply
1657 * with a getattr or lookup to transparently handle a traceless reply
1658 * from the MDS (as when the MDS restarts and the client has to replay
1661 * @param request the MetaRequest to execute
1662 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1663 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1664 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1665 * @param use_mds [optional] prefer a specific mds (-1 for default)
1666 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1668 int Client::make_request(MetaRequest
*request
,
1669 const UserPerm
& perms
,
1670 InodeRef
*ptarget
, bool *pcreated
,
1676 // assign a unique tid
1677 ceph_tid_t tid
= ++last_tid
;
1678 request
->set_tid(tid
);
1681 request
->op_stamp
= ceph_clock_now();
1684 mds_requests
[tid
] = request
->get();
1685 if (oldest_tid
== 0 && request
->get_op() != CEPH_MDS_OP_SETFILELOCK
)
1688 request
->set_caller_perms(perms
);
1690 if (cct
->_conf
->client_inject_fixed_oldest_tid
) {
1691 ldout(cct
, 20) << __func__
<< " injecting fixed oldest_client_tid(1)" << dendl
;
1692 request
->set_oldest_client_tid(1);
1694 request
->set_oldest_client_tid(oldest_tid
);
1699 request
->resend_mds
= use_mds
;
1701 MetaSession
*session
= NULL
;
1703 if (request
->aborted())
1707 request
->abort(-EBLACKLISTED
);
1712 ceph::condition_variable caller_cond
;
1713 request
->caller_cond
= &caller_cond
;
1716 Inode
*hash_diri
= NULL
;
1717 mds_rank_t mds
= choose_target_mds(request
, &hash_diri
);
1718 int mds_state
= (mds
== MDS_RANK_NONE
) ? MDSMap::STATE_NULL
: mdsmap
->get_state(mds
);
1719 if (mds_state
!= MDSMap::STATE_ACTIVE
&& mds_state
!= MDSMap::STATE_STOPPING
) {
1720 if (mds_state
== MDSMap::STATE_NULL
&& mds
>= mdsmap
->get_max_mds()) {
1722 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, remove it from fragmap" << dendl
;
1723 _fragmap_remove_stopped_mds(hash_diri
, mds
);
1725 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, trying a random mds" << dendl
;
1726 request
->resend_mds
= _get_random_up_mds();
1729 ldout(cct
, 10) << " target mds." << mds
<< " not active, waiting for new mdsmap" << dendl
;
1730 wait_on_list(waiting_for_mdsmap
);
1736 if (!have_open_session(mds
)) {
1737 session
= _get_or_open_mds_session(mds
);
1738 if (session
->state
== MetaSession::STATE_REJECTED
) {
1739 request
->abort(-EPERM
);
1743 if (session
->state
== MetaSession::STATE_OPENING
) {
1744 ldout(cct
, 10) << "waiting for session to mds." << mds
<< " to open" << dendl
;
1745 wait_on_context_list(session
->waiting_for_open
);
1749 if (!have_open_session(mds
))
1752 session
= &mds_sessions
.at(mds
);
1756 send_request(request
, session
);
1759 ldout(cct
, 20) << "awaiting reply|forward|kick on " << &caller_cond
<< dendl
;
1760 request
->kick
= false;
1761 std::unique_lock l
{client_lock
, std::adopt_lock
};
1762 caller_cond
.wait(l
, [request
] {
1763 return (request
->reply
|| // reply
1764 request
->resend_mds
>= 0 || // forward
1768 request
->caller_cond
= nullptr;
1770 // did we get a reply?
1775 if (!request
->reply
) {
1776 ceph_assert(request
->aborted());
1777 ceph_assert(!request
->got_unsafe
);
1778 r
= request
->get_abort_code();
1779 request
->item
.remove_myself();
1780 unregister_request(request
);
1781 put_request(request
);
1786 auto reply
= std::move(request
->reply
);
1787 r
= reply
->get_result();
1789 request
->success
= true;
1791 // kick dispatcher (we've got it!)
1792 ceph_assert(request
->dispatch_cond
);
1793 request
->dispatch_cond
->notify_all();
1794 ldout(cct
, 20) << "sendrecv kickback on tid " << tid
<< " " << request
->dispatch_cond
<< dendl
;
1795 request
->dispatch_cond
= 0;
1797 if (r
>= 0 && ptarget
)
1798 r
= verify_reply_trace(r
, session
, request
, reply
, ptarget
, pcreated
, perms
);
1801 *pdirbl
= reply
->get_extra_bl();
1804 utime_t lat
= ceph_clock_now();
1805 lat
-= request
->sent_stamp
;
1806 ldout(cct
, 20) << "lat " << lat
<< dendl
;
1807 logger
->tinc(l_c_lat
, lat
);
1808 logger
->tinc(l_c_reply
, lat
);
1810 put_request(request
);
1814 void Client::unregister_request(MetaRequest
*req
)
1816 mds_requests
.erase(req
->tid
);
1817 if (req
->tid
== oldest_tid
) {
1818 map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.upper_bound(oldest_tid
);
1820 if (p
== mds_requests
.end()) {
1824 if (p
->second
->get_op() != CEPH_MDS_OP_SETFILELOCK
) {
1825 oldest_tid
= p
->first
;
1834 void Client::put_request(MetaRequest
*request
)
1836 if (request
->_put()) {
1838 if (request
->success
)
1839 op
= request
->get_op();
1841 request
->take_other_inode(&other_in
);
1845 (op
== CEPH_MDS_OP_RMDIR
||
1846 op
== CEPH_MDS_OP_RENAME
||
1847 op
== CEPH_MDS_OP_RMSNAP
)) {
1848 _try_to_trim_inode(other_in
.get(), false);
1853 int Client::encode_inode_release(Inode
*in
, MetaRequest
*req
,
1854 mds_rank_t mds
, int drop
,
1855 int unless
, int force
)
1857 ldout(cct
, 20) << __func__
<< " enter(in:" << *in
<< ", req:" << req
1858 << " mds:" << mds
<< ", drop:" << drop
<< ", unless:" << unless
1859 << ", force:" << force
<< ")" << dendl
;
1861 auto it
= in
->caps
.find(mds
);
1862 if (it
!= in
->caps
.end()) {
1863 Cap
&cap
= it
->second
;
1864 drop
&= ~(in
->dirty_caps
| get_caps_used(in
));
1865 if ((drop
& cap
.issued
) &&
1866 !(unless
& cap
.issued
)) {
1867 ldout(cct
, 25) << "dropping caps " << ccap_string(drop
) << dendl
;
1868 cap
.issued
&= ~drop
;
1869 cap
.implemented
&= ~drop
;
1875 cap
.wanted
= in
->caps_wanted();
1876 if (&cap
== in
->auth_cap
&&
1877 !(cap
.wanted
& CEPH_CAP_ANY_FILE_WR
)) {
1878 in
->requested_max_size
= 0;
1879 ldout(cct
, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl
;
1881 ceph_mds_request_release rel
;
1883 rel
.cap_id
= cap
.cap_id
;
1885 rel
.issue_seq
= cap
.issue_seq
;
1886 rel
.mseq
= cap
.mseq
;
1887 rel
.caps
= cap
.implemented
;
1888 rel
.wanted
= cap
.wanted
;
1891 req
->cap_releases
.push_back(MClientRequest::Release(rel
,""));
1894 ldout(cct
, 25) << __func__
<< " exit(in:" << *in
<< ") released:"
1895 << released
<< dendl
;
1899 void Client::encode_dentry_release(Dentry
*dn
, MetaRequest
*req
,
1900 mds_rank_t mds
, int drop
, int unless
)
1902 ldout(cct
, 20) << __func__
<< " enter(dn:"
1903 << dn
<< ")" << dendl
;
1906 released
= encode_inode_release(dn
->dir
->parent_inode
, req
,
1907 mds
, drop
, unless
, 1);
1908 if (released
&& dn
->lease_mds
== mds
) {
1909 ldout(cct
, 25) << "preemptively releasing dn to mds" << dendl
;
1910 auto& rel
= req
->cap_releases
.back();
1911 rel
.item
.dname_len
= dn
->name
.length();
1912 rel
.item
.dname_seq
= dn
->lease_seq
;
1913 rel
.dname
= dn
->name
;
1915 ldout(cct
, 25) << __func__
<< " exit(dn:"
1916 << dn
<< ")" << dendl
;
1921 * This requires the MClientRequest *request member to be set.
1922 * It will error out horribly without one.
1923 * Additionally, if you set any *drop member, you'd better have
1924 * set the corresponding dentry!
1926 void Client::encode_cap_releases(MetaRequest
*req
, mds_rank_t mds
)
1928 ldout(cct
, 20) << __func__
<< " enter (req: "
1929 << req
<< ", mds: " << mds
<< ")" << dendl
;
1930 if (req
->inode_drop
&& req
->inode())
1931 encode_inode_release(req
->inode(), req
,
1932 mds
, req
->inode_drop
,
1935 if (req
->old_inode_drop
&& req
->old_inode())
1936 encode_inode_release(req
->old_inode(), req
,
1937 mds
, req
->old_inode_drop
,
1938 req
->old_inode_unless
);
1939 if (req
->other_inode_drop
&& req
->other_inode())
1940 encode_inode_release(req
->other_inode(), req
,
1941 mds
, req
->other_inode_drop
,
1942 req
->other_inode_unless
);
1944 if (req
->dentry_drop
&& req
->dentry())
1945 encode_dentry_release(req
->dentry(), req
,
1946 mds
, req
->dentry_drop
,
1947 req
->dentry_unless
);
1949 if (req
->old_dentry_drop
&& req
->old_dentry())
1950 encode_dentry_release(req
->old_dentry(), req
,
1951 mds
, req
->old_dentry_drop
,
1952 req
->old_dentry_unless
);
1953 ldout(cct
, 25) << __func__
<< " exit (req: "
1954 << req
<< ", mds " << mds
<<dendl
;
1957 bool Client::have_open_session(mds_rank_t mds
)
1959 const auto &it
= mds_sessions
.find(mds
);
1960 return it
!= mds_sessions
.end() &&
1961 (it
->second
.state
== MetaSession::STATE_OPEN
||
1962 it
->second
.state
== MetaSession::STATE_STALE
);
1965 MetaSession
*Client::_get_mds_session(mds_rank_t mds
, Connection
*con
)
1967 const auto &it
= mds_sessions
.find(mds
);
1968 if (it
== mds_sessions
.end() || it
->second
.con
!= con
) {
1975 MetaSession
*Client::_get_or_open_mds_session(mds_rank_t mds
)
1977 auto it
= mds_sessions
.find(mds
);
1978 return it
== mds_sessions
.end() ? _open_mds_session(mds
) : &it
->second
;
1982 * Populate a map of strings with client-identifying metadata,
1983 * such as the hostname. Call this once at initialization.
1985 void Client::populate_metadata(const std::string
&mount_root
)
1991 metadata
["hostname"] = u
.nodename
;
1992 ldout(cct
, 20) << __func__
<< " read hostname '" << u
.nodename
<< "'" << dendl
;
1994 ldout(cct
, 1) << __func__
<< " failed to read hostname (" << cpp_strerror(r
) << ")" << dendl
;
1997 metadata
["pid"] = stringify(getpid());
1999 // Ceph entity id (the '0' in "client.0")
2000 metadata
["entity_id"] = cct
->_conf
->name
.get_id();
2002 // Our mount position
2003 if (!mount_root
.empty()) {
2004 metadata
["root"] = mount_root
;
2008 metadata
["ceph_version"] = pretty_version_to_str();
2009 metadata
["ceph_sha1"] = git_version_to_str();
2011 // Apply any metadata from the user's configured overrides
2012 std::vector
<std::string
> tokens
;
2013 get_str_vec(cct
->_conf
->client_metadata
, ",", tokens
);
2014 for (const auto &i
: tokens
) {
2015 auto eqpos
= i
.find("=");
2016 // Throw out anything that isn't of the form "<str>=<str>"
2017 if (eqpos
== 0 || eqpos
== std::string::npos
|| eqpos
== i
.size()) {
2018 lderr(cct
) << "Invalid metadata keyval pair: '" << i
<< "'" << dendl
;
2021 metadata
[i
.substr(0, eqpos
)] = i
.substr(eqpos
+ 1);
2026 * Optionally add or override client metadata fields.
2028 void Client::update_metadata(std::string
const &k
, std::string
const &v
)
2030 std::lock_guard
l(client_lock
);
2031 ceph_assert(initialized
);
2033 auto it
= metadata
.find(k
);
2034 if (it
!= metadata
.end()) {
2035 ldout(cct
, 1) << __func__
<< " warning, overriding metadata field '" << k
2036 << "' from '" << it
->second
<< "' to '" << v
<< "'" << dendl
;
2042 MetaSession
*Client::_open_mds_session(mds_rank_t mds
)
2044 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
2045 auto addrs
= mdsmap
->get_addrs(mds
);
2046 auto em
= mds_sessions
.emplace(std::piecewise_construct
,
2047 std::forward_as_tuple(mds
),
2048 std::forward_as_tuple(mds
, messenger
->connect_to_mds(addrs
), addrs
));
2049 ceph_assert(em
.second
); /* not already present */
2050 MetaSession
*session
= &em
.first
->second
;
2052 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REQUEST_OPEN
);
2053 m
->metadata
= metadata
;
2054 m
->supported_features
= feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED
);
2055 session
->con
->send_message2(std::move(m
));
2059 void Client::_close_mds_session(MetaSession
*s
)
2061 ldout(cct
, 2) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2062 s
->state
= MetaSession::STATE_CLOSING
;
2063 s
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
2066 void Client::_closed_mds_session(MetaSession
*s
, int err
, bool rejected
)
2068 ldout(cct
, 5) << __func__
<< " mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2069 if (rejected
&& s
->state
!= MetaSession::STATE_CLOSING
)
2070 s
->state
= MetaSession::STATE_REJECTED
;
2072 s
->state
= MetaSession::STATE_CLOSED
;
2073 s
->con
->mark_down();
2074 signal_context_list(s
->waiting_for_open
);
2075 mount_cond
.notify_all();
2076 remove_session_caps(s
, err
);
2077 kick_requests_closed(s
);
2078 mds_ranks_closing
.erase(s
->mds_num
);
2079 if (s
->state
== MetaSession::STATE_CLOSED
)
2080 mds_sessions
.erase(s
->mds_num
);
2083 void Client::handle_client_session(const MConstRef
<MClientSession
>& m
)
2085 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2086 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << from
<< dendl
;
2088 MetaSession
*session
= _get_mds_session(from
, m
->get_connection().get());
2090 ldout(cct
, 10) << " discarding session message from sessionless mds " << m
->get_source_inst() << dendl
;
2094 switch (m
->get_op()) {
2095 case CEPH_SESSION_OPEN
:
2097 feature_bitset_t
missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED
);
2098 missing_features
-= m
->supported_features
;
2099 if (!missing_features
.empty()) {
2100 lderr(cct
) << "mds." << from
<< " lacks required features '"
2101 << missing_features
<< "', closing session " << dendl
;
2102 _close_mds_session(session
);
2103 _closed_mds_session(session
, -EPERM
, true);
2106 session
->mds_features
= std::move(m
->supported_features
);
2108 renew_caps(session
);
2109 session
->state
= MetaSession::STATE_OPEN
;
2111 mount_cond
.notify_all();
2113 connect_mds_targets(from
);
2114 signal_context_list(session
->waiting_for_open
);
2118 case CEPH_SESSION_CLOSE
:
2119 _closed_mds_session(session
);
2122 case CEPH_SESSION_RENEWCAPS
:
2123 if (session
->cap_renew_seq
== m
->get_seq()) {
2124 bool was_stale
= ceph_clock_now() >= session
->cap_ttl
;
2126 session
->last_cap_renew_request
+ mdsmap
->get_session_timeout();
2128 wake_up_session_caps(session
, false);
2132 case CEPH_SESSION_STALE
:
2133 // invalidate session caps/leases
2135 session
->cap_ttl
= ceph_clock_now();
2136 session
->cap_ttl
-= 1;
2137 renew_caps(session
);
2140 case CEPH_SESSION_RECALL_STATE
:
2141 trim_caps(session
, m
->get_max_caps());
2144 case CEPH_SESSION_FLUSHMSG
:
2145 /* flush cap release */
2146 if (auto& m
= session
->release
; m
) {
2147 session
->con
->send_message2(std::move(m
));
2149 session
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_FLUSHMSG_ACK
, m
->get_seq()));
2152 case CEPH_SESSION_FORCE_RO
:
2153 force_session_readonly(session
);
2156 case CEPH_SESSION_REJECT
:
2158 std::string_view error_str
;
2159 auto it
= m
->metadata
.find("error_string");
2160 if (it
!= m
->metadata
.end())
2161 error_str
= it
->second
;
2163 error_str
= "unknown error";
2164 lderr(cct
) << "mds." << from
<< " rejected us (" << error_str
<< ")" << dendl
;
2166 _closed_mds_session(session
, -EPERM
, true);
2175 bool Client::_any_stale_sessions() const
2177 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
2179 for (const auto &p
: mds_sessions
) {
2180 if (p
.second
.state
== MetaSession::STATE_STALE
) {
2188 void Client::_kick_stale_sessions()
2190 ldout(cct
, 1) << __func__
<< dendl
;
2192 for (auto it
= mds_sessions
.begin(); it
!= mds_sessions
.end(); ) {
2193 MetaSession
&s
= it
->second
;
2194 if (s
.state
== MetaSession::STATE_REJECTED
) {
2195 mds_sessions
.erase(it
++);
2199 if (s
.state
== MetaSession::STATE_STALE
)
2200 _closed_mds_session(&s
);
2204 void Client::send_request(MetaRequest
*request
, MetaSession
*session
,
2205 bool drop_cap_releases
)
2208 mds_rank_t mds
= session
->mds_num
;
2209 ldout(cct
, 10) << __func__
<< " rebuilding request " << request
->get_tid()
2210 << " for mds." << mds
<< dendl
;
2211 auto r
= build_client_request(request
);
2212 if (request
->dentry()) {
2213 r
->set_dentry_wanted();
2215 if (request
->got_unsafe
) {
2216 r
->set_replayed_op();
2217 if (request
->target
)
2218 r
->head
.ino
= request
->target
->ino
;
2220 encode_cap_releases(request
, mds
);
2221 if (drop_cap_releases
) // we haven't send cap reconnect yet, drop cap releases
2222 request
->cap_releases
.clear();
2224 r
->releases
.swap(request
->cap_releases
);
2226 r
->set_mdsmap_epoch(mdsmap
->get_epoch());
2227 if (r
->head
.op
== CEPH_MDS_OP_SETXATTR
) {
2228 objecter
->with_osdmap([r
](const OSDMap
& o
) {
2229 r
->set_osdmap_epoch(o
.get_epoch());
2233 if (request
->mds
== -1) {
2234 request
->sent_stamp
= ceph_clock_now();
2235 ldout(cct
, 20) << __func__
<< " set sent_stamp to " << request
->sent_stamp
<< dendl
;
2239 Inode
*in
= request
->inode();
2241 auto it
= in
->caps
.find(mds
);
2242 if (it
!= in
->caps
.end()) {
2243 request
->sent_on_mseq
= it
->second
.mseq
;
2247 session
->requests
.push_back(&request
->item
);
2249 ldout(cct
, 10) << __func__
<< " " << *r
<< " to mds." << mds
<< dendl
;
2250 session
->con
->send_message2(std::move(r
));
2253 ref_t
<MClientRequest
> Client::build_client_request(MetaRequest
*request
)
2255 auto req
= make_message
<MClientRequest
>(request
->get_op());
2256 req
->set_tid(request
->tid
);
2257 req
->set_stamp(request
->op_stamp
);
2258 memcpy(&req
->head
, &request
->head
, sizeof(ceph_mds_request_head
));
2260 // if the filepath's haven't been set, set them!
2261 if (request
->path
.empty()) {
2262 Inode
*in
= request
->inode();
2263 Dentry
*de
= request
->dentry();
2265 in
->make_nosnap_relative_path(request
->path
);
2268 de
->inode
->make_nosnap_relative_path(request
->path
);
2270 de
->dir
->parent_inode
->make_nosnap_relative_path(request
->path
);
2271 request
->path
.push_dentry(de
->name
);
2273 else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2274 << " No path, inode, or appropriately-endowed dentry given!"
2276 } else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2277 << " No path, inode, or dentry given!"
2280 req
->set_filepath(request
->get_filepath());
2281 req
->set_filepath2(request
->get_filepath2());
2282 req
->set_data(request
->data
);
2283 req
->set_retry_attempt(request
->retry_attempt
++);
2284 req
->head
.num_fwd
= request
->num_fwd
;
2286 int gid_count
= request
->perms
.get_gids(&_gids
);
2287 req
->set_gid_list(gid_count
, _gids
);
2293 void Client::handle_client_request_forward(const MConstRef
<MClientRequestForward
>& fwd
)
2295 mds_rank_t mds
= mds_rank_t(fwd
->get_source().num());
2296 MetaSession
*session
= _get_mds_session(mds
, fwd
->get_connection().get());
2300 ceph_tid_t tid
= fwd
->get_tid();
2302 if (mds_requests
.count(tid
) == 0) {
2303 ldout(cct
, 10) << __func__
<< " no pending request on tid " << tid
<< dendl
;
2307 MetaRequest
*request
= mds_requests
[tid
];
2308 ceph_assert(request
);
2310 // reset retry counter
2311 request
->retry_attempt
= 0;
2313 // request not forwarded, or dest mds has no session.
2315 ldout(cct
, 10) << __func__
<< " tid " << tid
2316 << " fwd " << fwd
->get_num_fwd()
2317 << " to mds." << fwd
->get_dest_mds()
2318 << ", resending to " << fwd
->get_dest_mds()
2322 request
->item
.remove_myself();
2323 request
->num_fwd
= fwd
->get_num_fwd();
2324 request
->resend_mds
= fwd
->get_dest_mds();
2325 request
->caller_cond
->notify_all();
2328 bool Client::is_dir_operation(MetaRequest
*req
)
2330 int op
= req
->get_op();
2331 if (op
== CEPH_MDS_OP_MKNOD
|| op
== CEPH_MDS_OP_LINK
||
2332 op
== CEPH_MDS_OP_UNLINK
|| op
== CEPH_MDS_OP_RENAME
||
2333 op
== CEPH_MDS_OP_MKDIR
|| op
== CEPH_MDS_OP_RMDIR
||
2334 op
== CEPH_MDS_OP_SYMLINK
|| op
== CEPH_MDS_OP_CREATE
)
2339 void Client::handle_client_reply(const MConstRef
<MClientReply
>& reply
)
2341 mds_rank_t mds_num
= mds_rank_t(reply
->get_source().num());
2342 MetaSession
*session
= _get_mds_session(mds_num
, reply
->get_connection().get());
2347 ceph_tid_t tid
= reply
->get_tid();
2348 bool is_safe
= reply
->is_safe();
2350 if (mds_requests
.count(tid
) == 0) {
2351 lderr(cct
) << __func__
<< " no pending request on tid " << tid
2352 << " safe is:" << is_safe
<< dendl
;
2355 MetaRequest
*request
= mds_requests
.at(tid
);
2357 ldout(cct
, 20) << __func__
<< " got a reply. Safe:" << is_safe
2358 << " tid " << tid
<< dendl
;
2360 if (request
->got_unsafe
&& !is_safe
) {
2361 //duplicate response
2362 ldout(cct
, 0) << "got a duplicate reply on tid " << tid
<< " from mds "
2363 << mds_num
<< " safe:" << is_safe
<< dendl
;
2367 if (-ESTALE
== reply
->get_result()) { // see if we can get to proper MDS
2368 ldout(cct
, 20) << "got ESTALE on tid " << request
->tid
2369 << " from mds." << request
->mds
<< dendl
;
2370 request
->send_to_auth
= true;
2371 request
->resend_mds
= choose_target_mds(request
);
2372 Inode
*in
= request
->inode();
2373 std::map
<mds_rank_t
, Cap
>::const_iterator it
;
2374 if (request
->resend_mds
>= 0 &&
2375 request
->resend_mds
== request
->mds
&&
2377 (it
= in
->caps
.find(request
->resend_mds
)) != in
->caps
.end() ||
2378 request
->sent_on_mseq
== it
->second
.mseq
)) {
2379 ldout(cct
, 20) << "have to return ESTALE" << dendl
;
2381 request
->caller_cond
->notify_all();
2386 ceph_assert(!request
->reply
);
2387 request
->reply
= reply
;
2388 insert_trace(request
, session
);
2390 // Handle unsafe reply
2392 request
->got_unsafe
= true;
2393 session
->unsafe_requests
.push_back(&request
->unsafe_item
);
2394 if (is_dir_operation(request
)) {
2395 Inode
*dir
= request
->inode();
2397 dir
->unsafe_ops
.push_back(&request
->unsafe_dir_item
);
2399 if (request
->target
) {
2400 InodeRef
&in
= request
->target
;
2401 in
->unsafe_ops
.push_back(&request
->unsafe_target_item
);
2405 // Only signal the caller once (on the first reply):
2406 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2407 if (!is_safe
|| !request
->got_unsafe
) {
2408 ceph::condition_variable cond
;
2409 request
->dispatch_cond
= &cond
;
2412 ldout(cct
, 20) << __func__
<< " signalling caller " << (void*)request
->caller_cond
<< dendl
;
2413 request
->caller_cond
->notify_all();
2415 // wake for kick back
2416 std::unique_lock l
{client_lock
, std::adopt_lock
};
2417 cond
.wait(l
, [tid
, request
, &cond
, this] {
2418 if (request
->dispatch_cond
) {
2419 ldout(cct
, 20) << "handle_client_reply awaiting kickback on tid "
2420 << tid
<< " " << &cond
<< dendl
;
2422 return !request
->dispatch_cond
;
2428 // the filesystem change is committed to disk
2429 // we're done, clean up
2430 if (request
->got_unsafe
) {
2431 request
->unsafe_item
.remove_myself();
2432 request
->unsafe_dir_item
.remove_myself();
2433 request
->unsafe_target_item
.remove_myself();
2434 signal_cond_list(request
->waitfor_safe
);
2436 request
->item
.remove_myself();
2437 unregister_request(request
);
2440 mount_cond
.notify_all();
2443 void Client::_handle_full_flag(int64_t pool
)
2445 ldout(cct
, 1) << __func__
<< ": FULL: cancelling outstanding operations "
2446 << "on " << pool
<< dendl
;
2447 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2448 // to do this rather than blocking, because otherwise when we fill up we
2449 // potentially lock caps forever on files with dirty pages, and we need
2450 // to be able to release those caps to the MDS so that it can delete files
2451 // and free up space.
2452 epoch_t cancelled_epoch
= objecter
->op_cancel_writes(-ENOSPC
, pool
);
2454 // For all inodes with layouts in this pool and a pending flush write op
2455 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2456 // from ObjectCacher so that it doesn't re-issue the write in response to
2457 // the ENOSPC error.
2458 // Fortunately since we're cancelling everything in a given pool, we don't
2459 // need to know which ops belong to which ObjectSet, we can just blow all
2460 // the un-flushed cached data away and mark any dirty inodes' async_err
2461 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2462 // affecting this pool, and all the objectsets we're purging were also
2464 for (unordered_map
<vinodeno_t
,Inode
*>::iterator i
= inode_map
.begin();
2465 i
!= inode_map
.end(); ++i
)
2467 Inode
*inode
= i
->second
;
2468 if (inode
->oset
.dirty_or_tx
2469 && (pool
== -1 || inode
->layout
.pool_id
== pool
)) {
2470 ldout(cct
, 4) << __func__
<< ": FULL: inode 0x" << std::hex
<< i
->first
<< std::dec
2471 << " has dirty objects, purging and setting ENOSPC" << dendl
;
2472 objectcacher
->purge_set(&inode
->oset
);
2473 inode
->set_async_err(-ENOSPC
);
2477 if (cancelled_epoch
!= (epoch_t
)-1) {
2478 set_cap_epoch_barrier(cancelled_epoch
);
2482 void Client::handle_osd_map(const MConstRef
<MOSDMap
>& m
)
2484 std::set
<entity_addr_t
> new_blacklists
;
2485 objecter
->consume_blacklist_events(&new_blacklists
);
2487 const auto myaddrs
= messenger
->get_myaddrs();
2488 bool new_blacklist
= false;
2489 bool prenautilus
= objecter
->with_osdmap(
2490 [&](const OSDMap
& o
) {
2491 return o
.require_osd_release
< ceph_release_t::nautilus
;
2494 for (auto a
: myaddrs
.v
) {
2495 // blacklist entries are always TYPE_ANY for nautilus+
2496 a
.set_type(entity_addr_t::TYPE_ANY
);
2497 if (new_blacklists
.count(a
)) {
2498 new_blacklist
= true;
2502 // ...except pre-nautilus, they were TYPE_LEGACY
2503 a
.set_type(entity_addr_t::TYPE_LEGACY
);
2504 if (new_blacklists
.count(a
)) {
2505 new_blacklist
= true;
2511 if (new_blacklist
) {
2512 auto epoch
= objecter
->with_osdmap([](const OSDMap
&o
){
2513 return o
.get_epoch();
2515 lderr(cct
) << "I was blacklisted at osd epoch " << epoch
<< dendl
;
2518 _abort_mds_sessions(-EBLACKLISTED
);
2520 // Since we know all our OSD ops will fail, cancel them all preemtively,
2521 // so that on an unhealthy cluster we can umount promptly even if e.g.
2522 // some PGs were inaccessible.
2523 objecter
->op_cancel_writes(-EBLACKLISTED
);
2525 } else if (blacklisted
) {
2526 // Handle case where we were blacklisted but no longer are
2527 blacklisted
= objecter
->with_osdmap([myaddrs
](const OSDMap
&o
){
2528 return o
.is_blacklisted(myaddrs
);});
2531 // Always subscribe to next osdmap for blacklisted client
2532 // until this client is not blacklisted.
2534 objecter
->maybe_request_map();
2537 if (objecter
->osdmap_full_flag()) {
2538 _handle_full_flag(-1);
2540 // Accumulate local list of full pools so that I can drop
2541 // the objecter lock before re-entering objecter in
2543 std::vector
<int64_t> full_pools
;
2545 objecter
->with_osdmap([&full_pools
](const OSDMap
&o
) {
2546 for (const auto& kv
: o
.get_pools()) {
2547 if (kv
.second
.has_flag(pg_pool_t::FLAG_FULL
)) {
2548 full_pools
.push_back(kv
.first
);
2553 for (auto p
: full_pools
)
2554 _handle_full_flag(p
);
2556 // Subscribe to subsequent maps to watch for the full flag going
2557 // away. For the global full flag objecter does this for us, but
2558 // it pays no attention to the per-pool full flag so in this branch
2559 // we do it ourselves.
2560 if (!full_pools
.empty()) {
2561 objecter
->maybe_request_map();
2567 // ------------------------
2568 // incoming messages
2571 bool Client::ms_dispatch2(const MessageRef
&m
)
2573 std::lock_guard
l(client_lock
);
2575 ldout(cct
, 10) << "inactive, discarding " << *m
<< dendl
;
2579 switch (m
->get_type()) {
2580 // mounting and mds sessions
2581 case CEPH_MSG_MDS_MAP
:
2582 handle_mds_map(ref_cast
<MMDSMap
>(m
));
2584 case CEPH_MSG_FS_MAP
:
2585 handle_fs_map(ref_cast
<MFSMap
>(m
));
2587 case CEPH_MSG_FS_MAP_USER
:
2588 handle_fs_map_user(ref_cast
<MFSMapUser
>(m
));
2590 case CEPH_MSG_CLIENT_SESSION
:
2591 handle_client_session(ref_cast
<MClientSession
>(m
));
2594 case CEPH_MSG_OSD_MAP
:
2595 handle_osd_map(ref_cast
<MOSDMap
>(m
));
2599 case CEPH_MSG_CLIENT_REQUEST_FORWARD
:
2600 handle_client_request_forward(ref_cast
<MClientRequestForward
>(m
));
2602 case CEPH_MSG_CLIENT_REPLY
:
2603 handle_client_reply(ref_cast
<MClientReply
>(m
));
2607 case CEPH_MSG_CLIENT_RECLAIM_REPLY
:
2608 handle_client_reclaim_reply(ref_cast
<MClientReclaimReply
>(m
));
2611 case CEPH_MSG_CLIENT_SNAP
:
2612 handle_snap(ref_cast
<MClientSnap
>(m
));
2614 case CEPH_MSG_CLIENT_CAPS
:
2615 handle_caps(ref_cast
<MClientCaps
>(m
));
2617 case CEPH_MSG_CLIENT_LEASE
:
2618 handle_lease(ref_cast
<MClientLease
>(m
));
2620 case MSG_COMMAND_REPLY
:
2621 if (m
->get_source().type() == CEPH_ENTITY_TYPE_MDS
) {
2622 handle_command_reply(ref_cast
<MCommandReply
>(m
));
2627 case CEPH_MSG_CLIENT_QUOTA
:
2628 handle_quota(ref_cast
<MClientQuota
>(m
));
2637 ldout(cct
, 10) << "unmounting: trim pass, size was " << lru
.lru_get_size()
2638 << "+" << inode_map
.size() << dendl
;
2639 long unsigned size
= lru
.lru_get_size() + inode_map
.size();
2641 if (size
< lru
.lru_get_size() + inode_map
.size()) {
2642 ldout(cct
, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl
;
2643 mount_cond
.notify_all();
2645 ldout(cct
, 10) << "unmounting: trim pass, size still " << lru
.lru_get_size()
2646 << "+" << inode_map
.size() << dendl
;
2653 void Client::handle_fs_map(const MConstRef
<MFSMap
>& m
)
2655 fsmap
.reset(new FSMap(m
->get_fsmap()));
2657 signal_cond_list(waiting_for_fsmap
);
2659 monclient
->sub_got("fsmap", fsmap
->get_epoch());
2662 void Client::handle_fs_map_user(const MConstRef
<MFSMapUser
>& m
)
2664 fsmap_user
.reset(new FSMapUser
);
2665 *fsmap_user
= m
->get_fsmap();
2667 monclient
->sub_got("fsmap.user", fsmap_user
->get_epoch());
2668 signal_cond_list(waiting_for_fsmap
);
2671 void Client::handle_mds_map(const MConstRef
<MMDSMap
>& m
)
2673 mds_gid_t old_inc
, new_inc
;
2674 if (m
->get_epoch() <= mdsmap
->get_epoch()) {
2675 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch()
2676 << " is identical to or older than our "
2677 << mdsmap
->get_epoch() << dendl
;
2681 ldout(cct
, 1) << __func__
<< " epoch " << m
->get_epoch() << dendl
;
2683 std::unique_ptr
<MDSMap
> oldmap(new MDSMap
);
2684 oldmap
.swap(mdsmap
);
2686 mdsmap
->decode(m
->get_encoded());
2688 // Cancel any commands for missing or laggy GIDs
2689 std::list
<ceph_tid_t
> cancel_ops
;
2690 auto &commands
= command_table
.get_commands();
2691 for (const auto &i
: commands
) {
2692 auto &op
= i
.second
;
2693 const mds_gid_t op_mds_gid
= op
.mds_gid
;
2694 if (mdsmap
->is_dne_gid(op_mds_gid
) || mdsmap
->is_laggy_gid(op_mds_gid
)) {
2695 ldout(cct
, 1) << __func__
<< ": cancelling command op " << i
.first
<< dendl
;
2696 cancel_ops
.push_back(i
.first
);
2698 std::ostringstream ss
;
2699 ss
<< "MDS " << op_mds_gid
<< " went away";
2700 *(op
.outs
) = ss
.str();
2702 op
.con
->mark_down();
2704 op
.on_finish
->complete(-ETIMEDOUT
);
2709 for (std::list
<ceph_tid_t
>::iterator i
= cancel_ops
.begin();
2710 i
!= cancel_ops
.end(); ++i
) {
2711 command_table
.erase(*i
);
2715 for (auto p
= mds_sessions
.begin(); p
!= mds_sessions
.end(); ) {
2716 mds_rank_t mds
= p
->first
;
2717 MetaSession
*session
= &p
->second
;
2720 int oldstate
= oldmap
->get_state(mds
);
2721 int newstate
= mdsmap
->get_state(mds
);
2722 if (!mdsmap
->is_up(mds
)) {
2723 session
->con
->mark_down();
2724 } else if (mdsmap
->get_addrs(mds
) != session
->addrs
) {
2725 old_inc
= oldmap
->get_incarnation(mds
);
2726 new_inc
= mdsmap
->get_incarnation(mds
);
2727 if (old_inc
!= new_inc
) {
2728 ldout(cct
, 1) << "mds incarnation changed from "
2729 << old_inc
<< " to " << new_inc
<< dendl
;
2730 oldstate
= MDSMap::STATE_NULL
;
2732 session
->con
->mark_down();
2733 session
->addrs
= mdsmap
->get_addrs(mds
);
2734 // When new MDS starts to take over, notify kernel to trim unused entries
2735 // in its dcache/icache. Hopefully, the kernel will release some unused
2736 // inodes before the new MDS enters reconnect state.
2737 trim_cache_for_reconnect(session
);
2738 } else if (oldstate
== newstate
)
2739 continue; // no change
2741 session
->mds_state
= newstate
;
2742 if (newstate
== MDSMap::STATE_RECONNECT
) {
2743 session
->con
= messenger
->connect_to_mds(session
->addrs
);
2744 send_reconnect(session
);
2745 } else if (newstate
> MDSMap::STATE_RECONNECT
) {
2746 if (oldstate
< MDSMap::STATE_RECONNECT
) {
2747 ldout(cct
, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl
;
2748 _closed_mds_session(session
);
2751 if (newstate
>= MDSMap::STATE_ACTIVE
) {
2752 if (oldstate
< MDSMap::STATE_ACTIVE
) {
2753 // kick new requests
2754 kick_requests(session
);
2755 kick_flushing_caps(session
);
2756 signal_context_list(session
->waiting_for_open
);
2757 wake_up_session_caps(session
, true);
2759 connect_mds_targets(mds
);
2761 } else if (newstate
== MDSMap::STATE_NULL
&&
2762 mds
>= mdsmap
->get_max_mds()) {
2763 _closed_mds_session(session
);
2767 // kick any waiting threads
2768 signal_cond_list(waiting_for_mdsmap
);
2770 monclient
->sub_got("mdsmap", mdsmap
->get_epoch());
2773 void Client::send_reconnect(MetaSession
*session
)
2775 mds_rank_t mds
= session
->mds_num
;
2776 ldout(cct
, 10) << __func__
<< " to mds." << mds
<< dendl
;
2778 // trim unused caps to reduce MDS's cache rejoin time
2779 trim_cache_for_reconnect(session
);
2781 session
->readonly
= false;
2783 session
->release
.reset();
2785 // reset my cap seq number
2787 //connect to the mds' offload targets
2788 connect_mds_targets(mds
);
2789 //make sure unsafe requests get saved
2790 resend_unsafe_requests(session
);
2792 early_kick_flushing_caps(session
);
2794 auto m
= make_message
<MClientReconnect
>();
2795 bool allow_multi
= session
->mds_features
.test(CEPHFS_FEATURE_MULTI_RECONNECT
);
2797 // i have an open session.
2798 ceph::unordered_set
<inodeno_t
> did_snaprealm
;
2799 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
= inode_map
.begin();
2800 p
!= inode_map
.end();
2802 Inode
*in
= p
->second
;
2803 auto it
= in
->caps
.find(mds
);
2804 if (it
!= in
->caps
.end()) {
2806 m
->get_approx_size() >=
2807 static_cast<size_t>((std::numeric_limits
<int>::max() >> 1))) {
2809 session
->con
->send_message2(std::move(m
));
2811 m
= make_message
<MClientReconnect
>();
2814 Cap
&cap
= it
->second
;
2815 ldout(cct
, 10) << " caps on " << p
->first
2816 << " " << ccap_string(cap
.issued
)
2817 << " wants " << ccap_string(in
->caps_wanted())
2820 in
->make_long_path(path
);
2821 ldout(cct
, 10) << " path " << path
<< dendl
;
2824 _encode_filelocks(in
, flockbl
);
2826 cap
.seq
= 0; // reset seq.
2827 cap
.issue_seq
= 0; // reset seq.
2828 cap
.mseq
= 0; // reset seq.
2829 // cap gen should catch up with session cap_gen
2830 if (cap
.gen
< session
->cap_gen
) {
2831 cap
.gen
= session
->cap_gen
;
2832 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
2834 cap
.issued
= cap
.implemented
;
2836 snapid_t snap_follows
= 0;
2837 if (!in
->cap_snaps
.empty())
2838 snap_follows
= in
->cap_snaps
.begin()->first
;
2840 m
->add_cap(p
->first
.ino
,
2842 path
.get_ino(), path
.get_path(), // ino
2843 in
->caps_wanted(), // wanted
2844 cap
.issued
, // issued
2849 if (did_snaprealm
.count(in
->snaprealm
->ino
) == 0) {
2850 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
2851 m
->add_snaprealm(in
->snaprealm
->ino
, in
->snaprealm
->seq
, in
->snaprealm
->parent
);
2852 did_snaprealm
.insert(in
->snaprealm
->ino
);
2858 m
->set_encoding_version(0); // use connection features to choose encoding
2859 session
->con
->send_message2(std::move(m
));
2861 mount_cond
.notify_all();
2863 if (session
->reclaim_state
== MetaSession::RECLAIMING
)
2864 signal_cond_list(waiting_for_reclaim
);
2868 void Client::kick_requests(MetaSession
*session
)
2870 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
2871 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2872 p
!= mds_requests
.end();
2874 MetaRequest
*req
= p
->second
;
2875 if (req
->got_unsafe
)
2877 if (req
->aborted()) {
2878 if (req
->caller_cond
) {
2880 req
->caller_cond
->notify_all();
2884 if (req
->retry_attempt
> 0)
2885 continue; // new requests only
2886 if (req
->mds
== session
->mds_num
) {
2887 send_request(p
->second
, session
);
2892 void Client::resend_unsafe_requests(MetaSession
*session
)
2894 for (xlist
<MetaRequest
*>::iterator iter
= session
->unsafe_requests
.begin();
2897 send_request(*iter
, session
);
2899 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2900 // process completed requests in clientreplay stage.
2901 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2902 p
!= mds_requests
.end();
2904 MetaRequest
*req
= p
->second
;
2905 if (req
->got_unsafe
)
2909 if (req
->retry_attempt
== 0)
2910 continue; // old requests only
2911 if (req
->mds
== session
->mds_num
)
2912 send_request(req
, session
, true);
2916 void Client::wait_unsafe_requests()
2918 list
<MetaRequest
*> last_unsafe_reqs
;
2919 for (const auto &p
: mds_sessions
) {
2920 const MetaSession
&s
= p
.second
;
2921 if (!s
.unsafe_requests
.empty()) {
2922 MetaRequest
*req
= s
.unsafe_requests
.back();
2924 last_unsafe_reqs
.push_back(req
);
2928 for (list
<MetaRequest
*>::iterator p
= last_unsafe_reqs
.begin();
2929 p
!= last_unsafe_reqs
.end();
2931 MetaRequest
*req
= *p
;
2932 if (req
->unsafe_item
.is_on_list())
2933 wait_on_list(req
->waitfor_safe
);
2938 void Client::kick_requests_closed(MetaSession
*session
)
2940 ldout(cct
, 10) << __func__
<< " for mds." << session
->mds_num
<< dendl
;
2941 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2942 p
!= mds_requests
.end(); ) {
2943 MetaRequest
*req
= p
->second
;
2945 if (req
->mds
== session
->mds_num
) {
2946 if (req
->caller_cond
) {
2948 req
->caller_cond
->notify_all();
2950 req
->item
.remove_myself();
2951 if (req
->got_unsafe
) {
2952 lderr(cct
) << __func__
<< " removing unsafe request " << req
->get_tid() << dendl
;
2953 req
->unsafe_item
.remove_myself();
2954 if (is_dir_operation(req
)) {
2955 Inode
*dir
= req
->inode();
2957 dir
->set_async_err(-EIO
);
2958 lderr(cct
) << "kick_requests_closed drop req of inode(dir) : "
2959 << dir
->ino
<< " " << req
->get_tid() << dendl
;
2960 req
->unsafe_dir_item
.remove_myself();
2963 InodeRef
&in
= req
->target
;
2964 in
->set_async_err(-EIO
);
2965 lderr(cct
) << "kick_requests_closed drop req of inode : "
2966 << in
->ino
<< " " << req
->get_tid() << dendl
;
2967 req
->unsafe_target_item
.remove_myself();
2969 signal_cond_list(req
->waitfor_safe
);
2970 unregister_request(req
);
2974 ceph_assert(session
->requests
.empty());
2975 ceph_assert(session
->unsafe_requests
.empty());
2985 void Client::got_mds_push(MetaSession
*s
)
2988 ldout(cct
, 10) << " mds." << s
->mds_num
<< " seq now " << s
->seq
<< dendl
;
2989 if (s
->state
== MetaSession::STATE_CLOSING
) {
2990 s
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
2994 void Client::handle_lease(const MConstRef
<MClientLease
>& m
)
2996 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
2998 ceph_assert(m
->get_action() == CEPH_MDS_LEASE_REVOKE
);
3000 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
3001 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
3006 got_mds_push(session
);
3008 ceph_seq_t seq
= m
->get_seq();
3011 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
3012 if (inode_map
.count(vino
) == 0) {
3013 ldout(cct
, 10) << " don't have vino " << vino
<< dendl
;
3016 in
= inode_map
[vino
];
3018 if (m
->get_mask() & CEPH_LEASE_VALID
) {
3019 if (!in
->dir
|| in
->dir
->dentries
.count(m
->dname
) == 0) {
3020 ldout(cct
, 10) << " don't have dir|dentry " << m
->get_ino() << "/" << m
->dname
<<dendl
;
3023 Dentry
*dn
= in
->dir
->dentries
[m
->dname
];
3024 ldout(cct
, 10) << " revoked DN lease on " << dn
<< dendl
;
3030 auto reply
= make_message
<MClientLease
>(CEPH_MDS_LEASE_RELEASE
, seq
,
3031 m
->get_mask(), m
->get_ino(),
3032 m
->get_first(), m
->get_last(), m
->dname
);
3033 m
->get_connection()->send_message2(std::move(reply
));
3037 void Client::put_inode(Inode
*in
, int n
)
3039 ldout(cct
, 10) << __func__
<< " on " << *in
<< dendl
;
3040 int left
= in
->_put(n
);
3043 remove_all_caps(in
);
3045 ldout(cct
, 10) << __func__
<< " deleting " << *in
<< dendl
;
3046 bool unclean
= objectcacher
->release_set(&in
->oset
);
3047 ceph_assert(!unclean
);
3048 inode_map
.erase(in
->vino());
3049 if (use_faked_inos())
3050 _release_faked_ino(in
);
3055 while (!root_parents
.empty())
3056 root_parents
.erase(root_parents
.begin());
3063 void Client::close_dir(Dir
*dir
)
3065 Inode
*in
= dir
->parent_inode
;
3066 ldout(cct
, 15) << __func__
<< " dir " << dir
<< " on " << in
<< dendl
;
3067 ceph_assert(dir
->is_empty());
3068 ceph_assert(in
->dir
== dir
);
3069 ceph_assert(in
->dentries
.size() < 2); // dirs can't be hard-linked
3070 if (!in
->dentries
.empty())
3071 in
->get_first_parent()->put(); // unpin dentry
3075 put_inode(in
); // unpin inode
3079 * Don't call this with in==NULL, use get_or_create for that
3080 * leave dn set to default NULL unless you're trying to add
3081 * a new inode to a pre-created Dentry
3083 Dentry
* Client::link(Dir
*dir
, const string
& name
, Inode
*in
, Dentry
*dn
)
3086 // create a new Dentry
3087 dn
= new Dentry(dir
, name
);
3089 lru
.lru_insert_mid(dn
); // mid or top?
3091 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
3092 << " dn " << dn
<< " (new dn)" << dendl
;
3094 ceph_assert(!dn
->inode
);
3095 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
3096 << " dn " << dn
<< " (old dn)" << dendl
;
3099 if (in
) { // link to inode
3101 // only one parent for directories!
3102 if (in
->is_dir() && !in
->dentries
.empty()) {
3103 tmp_ref
= in
; // prevent unlink below from freeing the inode.
3104 Dentry
*olddn
= in
->get_first_parent();
3105 ceph_assert(olddn
->dir
!= dir
|| olddn
->name
!= name
);
3106 Inode
*old_diri
= olddn
->dir
->parent_inode
;
3107 old_diri
->dir_release_count
++;
3108 clear_dir_complete_and_ordered(old_diri
, true);
3109 unlink(olddn
, true, true); // keep dir, dentry
3113 ldout(cct
, 20) << "link inode " << in
<< " parents now " << in
->dentries
<< dendl
;
3119 void Client::unlink(Dentry
*dn
, bool keepdir
, bool keepdentry
)
3121 InodeRef
in(dn
->inode
);
3122 ldout(cct
, 15) << "unlink dir " << dn
->dir
->parent_inode
<< " '" << dn
->name
<< "' dn " << dn
3123 << " inode " << dn
->inode
<< dendl
;
3125 // unlink from inode
3128 ldout(cct
, 20) << "unlink inode " << in
<< " parents now " << in
->dentries
<< dendl
;
3134 ldout(cct
, 15) << "unlink removing '" << dn
->name
<< "' dn " << dn
<< dendl
;
3144 if (dir
->is_empty() && !keepdir
)
3150 * For asynchronous flushes, check for errors from the IO and
3151 * update the inode if necessary
3153 class C_Client_FlushComplete
: public Context
{
3158 C_Client_FlushComplete(Client
*c
, Inode
*in
) : client(c
), inode(in
) { }
3159 void finish(int r
) override
{
3160 ceph_assert(ceph_mutex_is_locked_by_me(client
->client_lock
));
3162 client_t
const whoami
= client
->whoami
; // For the benefit of ldout prefix
3163 ldout(client
->cct
, 1) << "I/O error from flush on inode " << inode
3164 << " 0x" << std::hex
<< inode
->ino
<< std::dec
3165 << ": " << r
<< "(" << cpp_strerror(r
) << ")" << dendl
;
3166 inode
->set_async_err(r
);
3176 void Client::get_cap_ref(Inode
*in
, int cap
)
3178 if ((cap
& CEPH_CAP_FILE_BUFFER
) &&
3179 in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] == 0) {
3180 ldout(cct
, 5) << __func__
<< " got first FILE_BUFFER ref on " << *in
<< dendl
;
3183 if ((cap
& CEPH_CAP_FILE_CACHE
) &&
3184 in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
3185 ldout(cct
, 5) << __func__
<< " got first FILE_CACHE ref on " << *in
<< dendl
;
3188 in
->get_cap_ref(cap
);
3191 void Client::put_cap_ref(Inode
*in
, int cap
)
3193 int last
= in
->put_cap_ref(cap
);
3196 int drop
= last
& ~in
->caps_issued();
3197 if (in
->snapid
== CEPH_NOSNAP
) {
3198 if ((last
& CEPH_CAP_FILE_WR
) &&
3199 !in
->cap_snaps
.empty() &&
3200 in
->cap_snaps
.rbegin()->second
.writing
) {
3201 ldout(cct
, 10) << __func__
<< " finishing pending cap_snap on " << *in
<< dendl
;
3202 in
->cap_snaps
.rbegin()->second
.writing
= 0;
3203 finish_cap_snap(in
, in
->cap_snaps
.rbegin()->second
, get_caps_used(in
));
3204 signal_cond_list(in
->waitfor_caps
); // wake up blocked sync writers
3206 if (last
& CEPH_CAP_FILE_BUFFER
) {
3207 for (auto &p
: in
->cap_snaps
)
3208 p
.second
.dirty_data
= 0;
3209 signal_cond_list(in
->waitfor_commit
);
3210 ldout(cct
, 5) << __func__
<< " dropped last FILE_BUFFER ref on " << *in
<< dendl
;
3214 if (last
& CEPH_CAP_FILE_CACHE
) {
3215 ldout(cct
, 5) << __func__
<< " dropped last FILE_CACHE ref on " << *in
<< dendl
;
3221 put_inode(in
, put_nref
);
3225 int Client::get_caps(Fh
*fh
, int need
, int want
, int *phave
, loff_t endoff
)
3227 Inode
*in
= fh
->inode
.get();
3229 int r
= check_pool_perm(in
, need
);
3234 int file_wanted
= in
->caps_file_wanted();
3235 if ((file_wanted
& need
) != need
) {
3236 ldout(cct
, 10) << "get_caps " << *in
<< " need " << ccap_string(need
)
3237 << " file_wanted " << ccap_string(file_wanted
) << ", EBADF "
3242 if ((fh
->mode
& CEPH_FILE_MODE_WR
) && fh
->gen
!= fd_gen
)
3245 if ((in
->flags
& I_ERROR_FILELOCK
) && fh
->has_any_filelocks())
3249 int have
= in
->caps_issued(&implemented
);
3251 bool waitfor_caps
= false;
3252 bool waitfor_commit
= false;
3254 if (have
& need
& CEPH_CAP_FILE_WR
) {
3256 if ((endoff
>= (loff_t
)in
->max_size
||
3257 endoff
> (loff_t
)(in
->size
<< 1)) &&
3258 endoff
> (loff_t
)in
->wanted_max_size
) {
3259 ldout(cct
, 10) << "wanted_max_size " << in
->wanted_max_size
<< " -> " << endoff
<< dendl
;
3260 in
->wanted_max_size
= endoff
;
3262 if (in
->wanted_max_size
> in
->max_size
&&
3263 in
->wanted_max_size
> in
->requested_max_size
)
3267 if (endoff
>= 0 && endoff
> (loff_t
)in
->max_size
) {
3268 ldout(cct
, 10) << "waiting on max_size, endoff " << endoff
<< " max_size " << in
->max_size
<< " on " << *in
<< dendl
;
3269 waitfor_caps
= true;
3271 if (!in
->cap_snaps
.empty()) {
3272 if (in
->cap_snaps
.rbegin()->second
.writing
) {
3273 ldout(cct
, 10) << "waiting on cap_snap write to complete" << dendl
;
3274 waitfor_caps
= true;
3276 for (auto &p
: in
->cap_snaps
) {
3277 if (p
.second
.dirty_data
) {
3278 waitfor_commit
= true;
3282 if (waitfor_commit
) {
3283 _flush(in
, new C_Client_FlushComplete(this, in
));
3284 ldout(cct
, 10) << "waiting for WRBUFFER to get dropped" << dendl
;
3289 if (!waitfor_caps
&& !waitfor_commit
) {
3290 if ((have
& need
) == need
) {
3291 int revoking
= implemented
& ~have
;
3292 ldout(cct
, 10) << "get_caps " << *in
<< " have " << ccap_string(have
)
3293 << " need " << ccap_string(need
) << " want " << ccap_string(want
)
3294 << " revoking " << ccap_string(revoking
)
3296 if ((revoking
& want
) == 0) {
3297 *phave
= need
| (have
& want
);
3298 in
->get_cap_ref(need
);
3302 ldout(cct
, 10) << "waiting for caps " << *in
<< " need " << ccap_string(need
) << " want " << ccap_string(want
) << dendl
;
3303 waitfor_caps
= true;
3306 if ((need
& CEPH_CAP_FILE_WR
) && in
->auth_cap
&&
3307 in
->auth_cap
->session
->readonly
)
3310 if (in
->flags
& I_CAP_DROPPED
) {
3311 int mds_wanted
= in
->caps_mds_wanted();
3312 if ((mds_wanted
& need
) != need
) {
3313 int ret
= _renew_caps(in
);
3318 if (!(file_wanted
& ~mds_wanted
))
3319 in
->flags
&= ~I_CAP_DROPPED
;
3323 wait_on_list(in
->waitfor_caps
);
3324 else if (waitfor_commit
)
3325 wait_on_list(in
->waitfor_commit
);
3329 int Client::get_caps_used(Inode
*in
)
3331 unsigned used
= in
->caps_used();
3332 if (!(used
& CEPH_CAP_FILE_CACHE
) &&
3333 !objectcacher
->set_is_empty(&in
->oset
))
3334 used
|= CEPH_CAP_FILE_CACHE
;
3338 void Client::cap_delay_requeue(Inode
*in
)
3340 ldout(cct
, 10) << __func__
<< " on " << *in
<< dendl
;
3341 in
->hold_caps_until
= ceph_clock_now();
3342 in
->hold_caps_until
+= cct
->_conf
->client_caps_release_delay
;
3343 delayed_list
.push_back(&in
->delay_cap_item
);
3346 void Client::send_cap(Inode
*in
, MetaSession
*session
, Cap
*cap
,
3347 int flags
, int used
, int want
, int retain
,
3348 int flush
, ceph_tid_t flush_tid
)
3350 int held
= cap
->issued
| cap
->implemented
;
3351 int revoking
= cap
->implemented
& ~cap
->issued
;
3352 retain
&= ~revoking
;
3353 int dropping
= cap
->issued
& ~retain
;
3354 int op
= CEPH_CAP_OP_UPDATE
;
3356 ldout(cct
, 10) << __func__
<< " " << *in
3357 << " mds." << session
->mds_num
<< " seq " << cap
->seq
3358 << " used " << ccap_string(used
)
3359 << " want " << ccap_string(want
)
3360 << " flush " << ccap_string(flush
)
3361 << " retain " << ccap_string(retain
)
3362 << " held "<< ccap_string(held
)
3363 << " revoking " << ccap_string(revoking
)
3364 << " dropping " << ccap_string(dropping
)
3367 if (cct
->_conf
->client_inject_release_failure
&& revoking
) {
3368 const int would_have_issued
= cap
->issued
& retain
;
3369 const int would_have_implemented
= cap
->implemented
& (cap
->issued
| used
);
3371 // - tell the server we think issued is whatever they issued plus whatever we implemented
3372 // - leave what we have implemented in place
3373 ldout(cct
, 20) << __func__
<< " injecting failure to release caps" << dendl
;
3374 cap
->issued
= cap
->issued
| cap
->implemented
;
3376 // Make an exception for revoking xattr caps: we are injecting
3377 // failure to release other caps, but allow xattr because client
3378 // will block on xattr ops if it can't release these to MDS (#9800)
3379 const int xattr_mask
= CEPH_CAP_XATTR_SHARED
| CEPH_CAP_XATTR_EXCL
;
3380 cap
->issued
^= xattr_mask
& revoking
;
3381 cap
->implemented
^= xattr_mask
& revoking
;
3383 ldout(cct
, 20) << __func__
<< " issued " << ccap_string(cap
->issued
) << " vs " << ccap_string(would_have_issued
) << dendl
;
3384 ldout(cct
, 20) << __func__
<< " implemented " << ccap_string(cap
->implemented
) << " vs " << ccap_string(would_have_implemented
) << dendl
;
3387 cap
->issued
&= retain
;
3388 cap
->implemented
&= cap
->issued
| used
;
3391 snapid_t follows
= 0;
3394 follows
= in
->snaprealm
->get_snap_context().seq
;
3396 auto m
= make_message
<MClientCaps
>(op
,
3399 cap
->cap_id
, cap
->seq
,
3405 m
->caller_uid
= in
->cap_dirtier_uid
;
3406 m
->caller_gid
= in
->cap_dirtier_gid
;
3408 m
->head
.issue_seq
= cap
->issue_seq
;
3409 m
->set_tid(flush_tid
);
3411 m
->head
.uid
= in
->uid
;
3412 m
->head
.gid
= in
->gid
;
3413 m
->head
.mode
= in
->mode
;
3415 m
->head
.nlink
= in
->nlink
;
3417 if (flush
& CEPH_CAP_XATTR_EXCL
) {
3418 encode(in
->xattrs
, m
->xattrbl
);
3419 m
->head
.xattr_version
= in
->xattr_version
;
3423 m
->max_size
= in
->max_size
;
3424 m
->truncate_seq
= in
->truncate_seq
;
3425 m
->truncate_size
= in
->truncate_size
;
3426 m
->mtime
= in
->mtime
;
3427 m
->atime
= in
->atime
;
3428 m
->ctime
= in
->ctime
;
3429 m
->btime
= in
->btime
;
3430 m
->time_warp_seq
= in
->time_warp_seq
;
3431 m
->change_attr
= in
->change_attr
;
3433 if (!(flags
& MClientCaps::FLAG_PENDING_CAPSNAP
) &&
3434 !in
->cap_snaps
.empty() &&
3435 in
->cap_snaps
.rbegin()->second
.flush_tid
== 0)
3436 flags
|= MClientCaps::FLAG_PENDING_CAPSNAP
;
3439 if (flush
& CEPH_CAP_FILE_WR
) {
3440 m
->inline_version
= in
->inline_version
;
3441 m
->inline_data
= in
->inline_data
;
3444 in
->reported_size
= in
->size
;
3445 m
->set_snap_follows(follows
);
3447 if (cap
== in
->auth_cap
) {
3448 if (want
& CEPH_CAP_ANY_FILE_WR
) {
3449 m
->set_max_size(in
->wanted_max_size
);
3450 in
->requested_max_size
= in
->wanted_max_size
;
3451 ldout(cct
, 15) << "auth cap, requesting max_size " << in
->requested_max_size
<< dendl
;
3453 in
->requested_max_size
= 0;
3454 ldout(cct
, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl
;
3458 if (!session
->flushing_caps_tids
.empty())
3459 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
3461 session
->con
->send_message2(std::move(m
));
3464 static bool is_max_size_approaching(Inode
*in
)
3466 /* mds will adjust max size according to the reported size */
3467 if (in
->flushing_caps
& CEPH_CAP_FILE_WR
)
3469 if (in
->size
>= in
->max_size
)
3471 /* half of previous max_size increment has been used */
3472 if (in
->max_size
> in
->reported_size
&&
3473 (in
->size
<< 1) >= in
->max_size
+ in
->reported_size
)
3478 static int adjust_caps_used_for_lazyio(int used
, int issued
, int implemented
)
3480 if (!(used
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
)))
3482 if (!(implemented
& CEPH_CAP_FILE_LAZYIO
))
3485 if (issued
& CEPH_CAP_FILE_LAZYIO
) {
3486 if (!(issued
& CEPH_CAP_FILE_CACHE
)) {
3487 used
&= ~CEPH_CAP_FILE_CACHE
;
3488 used
|= CEPH_CAP_FILE_LAZYIO
;
3490 if (!(issued
& CEPH_CAP_FILE_BUFFER
)) {
3491 used
&= ~CEPH_CAP_FILE_BUFFER
;
3492 used
|= CEPH_CAP_FILE_LAZYIO
;
3495 if (!(implemented
& CEPH_CAP_FILE_CACHE
)) {
3496 used
&= ~CEPH_CAP_FILE_CACHE
;
3497 used
|= CEPH_CAP_FILE_LAZYIO
;
3499 if (!(implemented
& CEPH_CAP_FILE_BUFFER
)) {
3500 used
&= ~CEPH_CAP_FILE_BUFFER
;
3501 used
|= CEPH_CAP_FILE_LAZYIO
;
3510 * Examine currently used and wanted versus held caps. Release, flush or ack
3511 * revoked caps to the MDS as appropriate.
3513 * @param in the inode to check
3514 * @param flags flags to apply to cap check
3516 void Client::check_caps(Inode
*in
, unsigned flags
)
3518 unsigned wanted
= in
->caps_wanted();
3519 unsigned used
= get_caps_used(in
);
3523 int issued
= in
->caps_issued(&implemented
);
3524 int revoking
= implemented
& ~issued
;
3526 int orig_used
= used
;
3527 used
= adjust_caps_used_for_lazyio(used
, issued
, implemented
);
3529 int retain
= wanted
| used
| CEPH_CAP_PIN
;
3530 if (!unmounting
&& in
->nlink
> 0) {
3532 retain
|= CEPH_CAP_ANY
;
3533 } else if (in
->is_dir() &&
3534 (issued
& CEPH_CAP_FILE_SHARED
) &&
3535 (in
->flags
& I_COMPLETE
)) {
3536 // we do this here because we don't want to drop to Fs (and then
3537 // drop the Fs if we do a create!) if that alone makes us send lookups
3538 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3539 wanted
= CEPH_CAP_ANY_SHARED
| CEPH_CAP_FILE_EXCL
;
3542 retain
|= CEPH_CAP_ANY_SHARED
;
3543 // keep RD only if we didn't have the file open RW,
3544 // because then the mds would revoke it anyway to
3545 // journal max_size=0.
3546 if (in
->max_size
== 0)
3547 retain
|= CEPH_CAP_ANY_RD
;
3551 ldout(cct
, 10) << __func__
<< " on " << *in
3552 << " wanted " << ccap_string(wanted
)
3553 << " used " << ccap_string(used
)
3554 << " issued " << ccap_string(issued
)
3555 << " revoking " << ccap_string(revoking
)
3556 << " flags=" << flags
3559 if (in
->snapid
!= CEPH_NOSNAP
)
3560 return; //snap caps last forever, can't write
3562 if (in
->caps
.empty())
3563 return; // guard if at end of func
3565 if (!(orig_used
& CEPH_CAP_FILE_BUFFER
) &&
3566 (revoking
& used
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
))) {
3568 used
&= ~(CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
);
3572 for (auto &p
: in
->caps
) {
3573 mds_rank_t mds
= p
.first
;
3574 Cap
&cap
= p
.second
;
3576 MetaSession
*session
= &mds_sessions
.at(mds
);
3579 if (in
->auth_cap
&& &cap
!= in
->auth_cap
)
3580 cap_used
&= ~in
->auth_cap
->issued
;
3582 revoking
= cap
.implemented
& ~cap
.issued
;
3584 ldout(cct
, 10) << " cap mds." << mds
3585 << " issued " << ccap_string(cap
.issued
)
3586 << " implemented " << ccap_string(cap
.implemented
)
3587 << " revoking " << ccap_string(revoking
) << dendl
;
3589 if (in
->wanted_max_size
> in
->max_size
&&
3590 in
->wanted_max_size
> in
->requested_max_size
&&
3591 &cap
== in
->auth_cap
)
3594 /* approaching file_max? */
3595 if ((cap
.issued
& CEPH_CAP_FILE_WR
) &&
3596 &cap
== in
->auth_cap
&&
3597 is_max_size_approaching(in
)) {
3598 ldout(cct
, 10) << "size " << in
->size
<< " approaching max_size " << in
->max_size
3599 << ", reported " << in
->reported_size
<< dendl
;
3603 /* completed revocation? */
3604 if (revoking
&& (revoking
& cap_used
) == 0) {
3605 ldout(cct
, 10) << "completed revocation of " << ccap_string(cap
.implemented
& ~cap
.issued
) << dendl
;
3609 /* want more caps from mds? */
3610 if (wanted
& ~(cap
.wanted
| cap
.issued
))
3613 if (!revoking
&& unmounting
&& (cap_used
== 0))
3616 if ((cap
.issued
& ~retain
) == 0 && // and we don't have anything we wouldn't like
3617 !in
->dirty_caps
) // and we have no dirty caps
3620 if (!(flags
& CHECK_CAPS_NODELAY
)) {
3621 ldout(cct
, 10) << "delaying cap release" << dendl
;
3622 cap_delay_requeue(in
);
3627 if (&cap
== in
->auth_cap
) {
3628 if (in
->flags
& I_KICK_FLUSH
) {
3629 ldout(cct
, 20) << " reflushing caps (check_caps) on " << *in
3630 << " to mds." << mds
<< dendl
;
3631 kick_flushing_caps(in
, session
);
3633 if (!in
->cap_snaps
.empty() &&
3634 in
->cap_snaps
.rbegin()->second
.flush_tid
== 0)
3640 ceph_tid_t flush_tid
;
3641 if (in
->auth_cap
== &cap
&& in
->dirty_caps
) {
3642 flushing
= mark_caps_flushing(in
, &flush_tid
);
3643 if (flags
& CHECK_CAPS_SYNCHRONOUS
)
3644 msg_flags
|= MClientCaps::FLAG_SYNC
;
3650 send_cap(in
, session
, &cap
, msg_flags
, cap_used
, wanted
, retain
,
3651 flushing
, flush_tid
);
3656 void Client::queue_cap_snap(Inode
*in
, SnapContext
& old_snapc
)
3658 int used
= get_caps_used(in
);
3659 int dirty
= in
->caps_dirty();
3660 ldout(cct
, 10) << __func__
<< " " << *in
<< " snapc " << old_snapc
<< " used " << ccap_string(used
) << dendl
;
3662 if (in
->cap_snaps
.size() &&
3663 in
->cap_snaps
.rbegin()->second
.writing
) {
3664 ldout(cct
, 10) << __func__
<< " already have pending cap_snap on " << *in
<< dendl
;
3666 } else if (in
->caps_dirty() ||
3667 (used
& CEPH_CAP_FILE_WR
) ||
3668 (dirty
& CEPH_CAP_ANY_WR
)) {
3669 const auto &capsnapem
= in
->cap_snaps
.emplace(std::piecewise_construct
, std::make_tuple(old_snapc
.seq
), std::make_tuple(in
));
3670 ceph_assert(capsnapem
.second
); /* element inserted */
3671 CapSnap
&capsnap
= capsnapem
.first
->second
;
3672 capsnap
.context
= old_snapc
;
3673 capsnap
.issued
= in
->caps_issued();
3674 capsnap
.dirty
= in
->caps_dirty();
3676 capsnap
.dirty_data
= (used
& CEPH_CAP_FILE_BUFFER
);
3678 capsnap
.uid
= in
->uid
;
3679 capsnap
.gid
= in
->gid
;
3680 capsnap
.mode
= in
->mode
;
3681 capsnap
.btime
= in
->btime
;
3682 capsnap
.xattrs
= in
->xattrs
;
3683 capsnap
.xattr_version
= in
->xattr_version
;
3684 capsnap
.cap_dirtier_uid
= in
->cap_dirtier_uid
;
3685 capsnap
.cap_dirtier_gid
= in
->cap_dirtier_gid
;
3687 if (used
& CEPH_CAP_FILE_WR
) {
3688 ldout(cct
, 10) << __func__
<< " WR used on " << *in
<< dendl
;
3689 capsnap
.writing
= 1;
3691 finish_cap_snap(in
, capsnap
, used
);
3694 ldout(cct
, 10) << __func__
<< " not dirty|writing on " << *in
<< dendl
;
3698 void Client::finish_cap_snap(Inode
*in
, CapSnap
&capsnap
, int used
)
3700 ldout(cct
, 10) << __func__
<< " " << *in
<< " capsnap " << (void *)&capsnap
<< " used " << ccap_string(used
) << dendl
;
3701 capsnap
.size
= in
->size
;
3702 capsnap
.mtime
= in
->mtime
;
3703 capsnap
.atime
= in
->atime
;
3704 capsnap
.ctime
= in
->ctime
;
3705 capsnap
.time_warp_seq
= in
->time_warp_seq
;
3706 capsnap
.change_attr
= in
->change_attr
;
3707 capsnap
.dirty
|= in
->caps_dirty();
3709 /* Only reset it if it wasn't set before */
3710 if (capsnap
.cap_dirtier_uid
== -1) {
3711 capsnap
.cap_dirtier_uid
= in
->cap_dirtier_uid
;
3712 capsnap
.cap_dirtier_gid
= in
->cap_dirtier_gid
;
3715 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
3716 capsnap
.inline_data
= in
->inline_data
;
3717 capsnap
.inline_version
= in
->inline_version
;
3720 if (used
& CEPH_CAP_FILE_BUFFER
) {
3721 ldout(cct
, 10) << __func__
<< " " << *in
<< " cap_snap " << &capsnap
<< " used " << used
3722 << " WRBUFFER, delaying" << dendl
;
3724 capsnap
.dirty_data
= 0;
3729 void Client::_flushed_cap_snap(Inode
*in
, snapid_t seq
)
3731 ldout(cct
, 10) << __func__
<< " seq " << seq
<< " on " << *in
<< dendl
;
3732 in
->cap_snaps
.at(seq
).dirty_data
= 0;
3736 void Client::send_flush_snap(Inode
*in
, MetaSession
*session
,
3737 snapid_t follows
, CapSnap
& capsnap
)
3739 auto m
= make_message
<MClientCaps
>(CEPH_CAP_OP_FLUSHSNAP
,
3740 in
->ino
, in
->snaprealm
->ino
, 0,
3741 in
->auth_cap
->mseq
, cap_epoch_barrier
);
3742 m
->caller_uid
= capsnap
.cap_dirtier_uid
;
3743 m
->caller_gid
= capsnap
.cap_dirtier_gid
;
3745 m
->set_client_tid(capsnap
.flush_tid
);
3746 m
->head
.snap_follows
= follows
;
3748 m
->head
.caps
= capsnap
.issued
;
3749 m
->head
.dirty
= capsnap
.dirty
;
3751 m
->head
.uid
= capsnap
.uid
;
3752 m
->head
.gid
= capsnap
.gid
;
3753 m
->head
.mode
= capsnap
.mode
;
3754 m
->btime
= capsnap
.btime
;
3756 m
->size
= capsnap
.size
;
3758 m
->head
.xattr_version
= capsnap
.xattr_version
;
3759 encode(capsnap
.xattrs
, m
->xattrbl
);
3761 m
->ctime
= capsnap
.ctime
;
3762 m
->btime
= capsnap
.btime
;
3763 m
->mtime
= capsnap
.mtime
;
3764 m
->atime
= capsnap
.atime
;
3765 m
->time_warp_seq
= capsnap
.time_warp_seq
;
3766 m
->change_attr
= capsnap
.change_attr
;
3768 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
3769 m
->inline_version
= in
->inline_version
;
3770 m
->inline_data
= in
->inline_data
;
3773 ceph_assert(!session
->flushing_caps_tids
.empty());
3774 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
3776 session
->con
->send_message2(std::move(m
));
3779 void Client::flush_snaps(Inode
*in
)
3781 ldout(cct
, 10) << "flush_snaps on " << *in
<< dendl
;
3782 ceph_assert(in
->cap_snaps
.size());
3785 ceph_assert(in
->auth_cap
);
3786 MetaSession
*session
= in
->auth_cap
->session
;
3788 for (auto &p
: in
->cap_snaps
) {
3789 CapSnap
&capsnap
= p
.second
;
3790 // only do new flush
3791 if (capsnap
.flush_tid
> 0)
3794 ldout(cct
, 10) << "flush_snaps mds." << session
->mds_num
3795 << " follows " << p
.first
3796 << " size " << capsnap
.size
3797 << " mtime " << capsnap
.mtime
3798 << " dirty_data=" << capsnap
.dirty_data
3799 << " writing=" << capsnap
.writing
3800 << " on " << *in
<< dendl
;
3801 if (capsnap
.dirty_data
|| capsnap
.writing
)
3804 capsnap
.flush_tid
= ++last_flush_tid
;
3805 session
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
3806 in
->flushing_cap_tids
[capsnap
.flush_tid
] = 0;
3807 if (!in
->flushing_cap_item
.is_on_list())
3808 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
3810 send_flush_snap(in
, session
, p
.first
, capsnap
);
3814 void Client::wait_on_list(list
<ceph::condition_variable
*>& ls
)
3816 ceph::condition_variable cond
;
3817 ls
.push_back(&cond
);
3818 std::unique_lock l
{client_lock
, std::adopt_lock
};
3824 void Client::signal_cond_list(list
<ceph::condition_variable
*>& ls
)
3826 for (auto cond
: ls
) {
3831 void Client::wait_on_context_list(list
<Context
*>& ls
)
3833 ceph::condition_variable cond
;
3836 ls
.push_back(new C_Cond(cond
, &done
, &r
));
3837 std::unique_lock l
{client_lock
, std::adopt_lock
};
3838 cond
.wait(l
, [&done
] { return done
;});
3842 void Client::signal_context_list(list
<Context
*>& ls
)
3844 while (!ls
.empty()) {
3845 ls
.front()->complete(0);
3850 void Client::wake_up_session_caps(MetaSession
*s
, bool reconnect
)
3852 for (const auto &cap
: s
->caps
) {
3853 auto &in
= cap
->inode
;
3855 in
.requested_max_size
= 0;
3856 in
.wanted_max_size
= 0;
3858 if (cap
->gen
< s
->cap_gen
) {
3859 // mds did not re-issue stale cap.
3860 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
3861 // make sure mds knows what we want.
3862 if (in
.caps_file_wanted() & ~cap
->wanted
)
3863 in
.flags
|= I_CAP_DROPPED
;
3866 signal_cond_list(in
.waitfor_caps
);
3871 // flush dirty data (from objectcache)
3873 class C_Client_CacheInvalidate
: public Context
{
3877 int64_t offset
, length
;
3879 C_Client_CacheInvalidate(Client
*c
, Inode
*in
, int64_t off
, int64_t len
) :
3880 client(c
), offset(off
), length(len
) {
3881 if (client
->use_faked_inos())
3882 ino
= vinodeno_t(in
->faked_ino
, CEPH_NOSNAP
);
3886 void finish(int r
) override
{
3887 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
3888 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
3889 client
->_async_invalidate(ino
, offset
, length
);
3893 void Client::_async_invalidate(vinodeno_t ino
, int64_t off
, int64_t len
)
3897 ldout(cct
, 10) << __func__
<< " " << ino
<< " " << off
<< "~" << len
<< dendl
;
3898 ino_invalidate_cb(callback_handle
, ino
, off
, len
);
3901 void Client::_schedule_invalidate_callback(Inode
*in
, int64_t off
, int64_t len
) {
3903 if (ino_invalidate_cb
)
3904 // we queue the invalidate, which calls the callback and decrements the ref
3905 async_ino_invalidator
.queue(new C_Client_CacheInvalidate(this, in
, off
, len
));
3908 void Client::_invalidate_inode_cache(Inode
*in
)
3910 ldout(cct
, 10) << __func__
<< " " << *in
<< dendl
;
3912 // invalidate our userspace inode cache
3913 if (cct
->_conf
->client_oc
) {
3914 objectcacher
->release_set(&in
->oset
);
3915 if (!objectcacher
->set_is_empty(&in
->oset
))
3916 lderr(cct
) << "failed to invalidate cache for " << *in
<< dendl
;
3919 _schedule_invalidate_callback(in
, 0, 0);
3922 void Client::_invalidate_inode_cache(Inode
*in
, int64_t off
, int64_t len
)
3924 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
3926 // invalidate our userspace inode cache
3927 if (cct
->_conf
->client_oc
) {
3928 vector
<ObjectExtent
> ls
;
3929 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, len
, in
->truncate_size
, ls
);
3930 objectcacher
->discard_writeback(&in
->oset
, ls
, nullptr);
3933 _schedule_invalidate_callback(in
, off
, len
);
3936 bool Client::_release(Inode
*in
)
3938 ldout(cct
, 20) << "_release " << *in
<< dendl
;
3939 if (in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
3940 _invalidate_inode_cache(in
);
3946 bool Client::_flush(Inode
*in
, Context
*onfinish
)
3948 ldout(cct
, 10) << "_flush " << *in
<< dendl
;
3950 if (!in
->oset
.dirty_or_tx
) {
3951 ldout(cct
, 10) << " nothing to flush" << dendl
;
3952 onfinish
->complete(0);
3956 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
3957 ldout(cct
, 8) << __func__
<< ": FULL, purging for ENOSPC" << dendl
;
3958 objectcacher
->purge_set(&in
->oset
);
3960 onfinish
->complete(-ENOSPC
);
3965 return objectcacher
->flush_set(&in
->oset
, onfinish
);
3968 void Client::_flush_range(Inode
*in
, int64_t offset
, uint64_t size
)
3970 ceph_assert(ceph_mutex_is_locked(client_lock
));
3971 if (!in
->oset
.dirty_or_tx
) {
3972 ldout(cct
, 10) << " nothing to flush" << dendl
;
3976 C_SaferCond
onflush("Client::_flush_range flock");
3977 bool ret
= objectcacher
->file_flush(&in
->oset
, &in
->layout
, in
->snaprealm
->get_snap_context(),
3978 offset
, size
, &onflush
);
3981 client_lock
.unlock();
3987 void Client::flush_set_callback(ObjectCacher::ObjectSet
*oset
)
3989 // std::lock_guard l(client_lock);
3990 ceph_assert(ceph_mutex_is_locked(client_lock
)); // will be called via dispatch() -> objecter -> ...
3991 Inode
*in
= static_cast<Inode
*>(oset
->parent
);
3996 void Client::_flushed(Inode
*in
)
3998 ldout(cct
, 10) << "_flushed " << *in
<< dendl
;
4000 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
4005 // checks common to add_update_cap, handle_cap_grant
4006 void Client::check_cap_issue(Inode
*in
, unsigned issued
)
4008 unsigned had
= in
->caps_issued();
4010 if ((issued
& CEPH_CAP_FILE_CACHE
) &&
4011 !(had
& CEPH_CAP_FILE_CACHE
))
4014 if ((issued
& CEPH_CAP_FILE_SHARED
) &&
4015 !(had
& CEPH_CAP_FILE_SHARED
)) {
4019 clear_dir_complete_and_ordered(in
, true);
4023 void Client::add_update_cap(Inode
*in
, MetaSession
*mds_session
, uint64_t cap_id
,
4024 unsigned issued
, unsigned wanted
, unsigned seq
, unsigned mseq
,
4025 inodeno_t realm
, int flags
, const UserPerm
& cap_perms
)
4027 if (!in
->is_any_caps()) {
4028 ceph_assert(in
->snaprealm
== 0);
4029 in
->snaprealm
= get_snap_realm(realm
);
4030 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4031 ldout(cct
, 15) << __func__
<< " first one, opened snaprealm " << in
->snaprealm
<< dendl
;
4033 ceph_assert(in
->snaprealm
);
4034 if ((flags
& CEPH_CAP_FLAG_AUTH
) &&
4035 realm
!= inodeno_t(-1) && in
->snaprealm
->ino
!= realm
) {
4036 in
->snaprealm_item
.remove_myself();
4037 auto oldrealm
= in
->snaprealm
;
4038 in
->snaprealm
= get_snap_realm(realm
);
4039 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4040 put_snap_realm(oldrealm
);
4044 mds_rank_t mds
= mds_session
->mds_num
;
4045 const auto &capem
= in
->caps
.emplace(std::piecewise_construct
, std::forward_as_tuple(mds
), std::forward_as_tuple(*in
, mds_session
));
4046 Cap
&cap
= capem
.first
->second
;
4047 if (!capem
.second
) {
4048 if (cap
.gen
< mds_session
->cap_gen
)
4049 cap
.issued
= cap
.implemented
= CEPH_CAP_PIN
;
4052 * auth mds of the inode changed. we received the cap export
4053 * message, but still haven't received the cap import message.
4054 * handle_cap_export() updated the new auth MDS' cap.
4056 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4057 * a message that was send before the cap import message. So
4058 * don't remove caps.
4060 if (ceph_seq_cmp(seq
, cap
.seq
) <= 0) {
4061 if (&cap
!= in
->auth_cap
)
4062 ldout(cct
, 0) << "WARNING: " << "inode " << *in
<< " caps on mds." << mds
<< " != auth_cap." << dendl
;
4064 ceph_assert(cap
.cap_id
== cap_id
);
4067 issued
|= cap
.issued
;
4068 flags
|= CEPH_CAP_FLAG_AUTH
;
4072 check_cap_issue(in
, issued
);
4074 if (flags
& CEPH_CAP_FLAG_AUTH
) {
4075 if (in
->auth_cap
!= &cap
&&
4076 (!in
->auth_cap
|| ceph_seq_cmp(in
->auth_cap
->mseq
, mseq
) < 0)) {
4077 if (in
->auth_cap
&& in
->flushing_cap_item
.is_on_list()) {
4078 ldout(cct
, 10) << __func__
<< " changing auth cap: "
4079 << "add myself to new auth MDS' flushing caps list" << dendl
;
4080 adjust_session_flushing_caps(in
, in
->auth_cap
->session
, mds_session
);
4082 in
->auth_cap
= &cap
;
4086 unsigned old_caps
= cap
.issued
;
4087 cap
.cap_id
= cap_id
;
4088 cap
.issued
= issued
;
4089 cap
.implemented
|= issued
;
4090 if (ceph_seq_cmp(mseq
, cap
.mseq
) > 0)
4091 cap
.wanted
= wanted
;
4093 cap
.wanted
|= wanted
;
4095 cap
.issue_seq
= seq
;
4097 cap
.gen
= mds_session
->cap_gen
;
4098 cap
.latest_perms
= cap_perms
;
4099 ldout(cct
, 10) << __func__
<< " issued " << ccap_string(old_caps
) << " -> " << ccap_string(cap
.issued
)
4100 << " from mds." << mds
4104 if ((issued
& ~old_caps
) && in
->auth_cap
== &cap
) {
4105 // non-auth MDS is revoking the newly grant caps ?
4106 for (auto &p
: in
->caps
) {
4107 if (&p
.second
== &cap
)
4109 if (p
.second
.implemented
& ~p
.second
.issued
& issued
) {
4110 check_caps(in
, CHECK_CAPS_NODELAY
);
4116 if (issued
& ~old_caps
)
4117 signal_cond_list(in
->waitfor_caps
);
4120 void Client::remove_cap(Cap
*cap
, bool queue_release
)
4122 auto &in
= cap
->inode
;
4123 MetaSession
*session
= cap
->session
;
4124 mds_rank_t mds
= cap
->session
->mds_num
;
4126 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " on " << in
<< dendl
;
4128 if (queue_release
) {
4129 session
->enqueue_cap_release(
4137 if (in
.auth_cap
== cap
) {
4138 if (in
.flushing_cap_item
.is_on_list()) {
4139 ldout(cct
, 10) << " removing myself from flushing_cap list" << dendl
;
4140 in
.flushing_cap_item
.remove_myself();
4144 size_t n
= in
.caps
.erase(mds
);
4145 ceph_assert(n
== 1);
4148 if (!in
.is_any_caps()) {
4149 ldout(cct
, 15) << __func__
<< " last one, closing snaprealm " << in
.snaprealm
<< dendl
;
4150 in
.snaprealm_item
.remove_myself();
4151 put_snap_realm(in
.snaprealm
);
4156 void Client::remove_all_caps(Inode
*in
)
4158 while (!in
->caps
.empty())
4159 remove_cap(&in
->caps
.begin()->second
, true);
4162 void Client::remove_session_caps(MetaSession
*s
, int err
)
4164 ldout(cct
, 10) << __func__
<< " mds." << s
->mds_num
<< dendl
;
4166 while (s
->caps
.size()) {
4167 Cap
*cap
= *s
->caps
.begin();
4168 InodeRef
in(&cap
->inode
);
4169 bool dirty_caps
= false;
4170 if (in
->auth_cap
== cap
) {
4171 dirty_caps
= in
->dirty_caps
| in
->flushing_caps
;
4172 in
->wanted_max_size
= 0;
4173 in
->requested_max_size
= 0;
4174 if (in
->has_any_filelocks())
4175 in
->flags
|= I_ERROR_FILELOCK
;
4177 auto caps
= cap
->implemented
;
4178 if (cap
->wanted
| cap
->issued
)
4179 in
->flags
|= I_CAP_DROPPED
;
4180 remove_cap(cap
, false);
4181 in
->cap_snaps
.clear();
4183 lderr(cct
) << __func__
<< " still has dirty|flushing caps on " << *in
<< dendl
;
4184 if (in
->flushing_caps
) {
4185 num_flushing_caps
--;
4186 in
->flushing_cap_tids
.clear();
4188 in
->flushing_caps
= 0;
4189 in
->mark_caps_clean();
4190 put_inode(in
.get());
4192 caps
&= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
;
4193 if (caps
&& !in
->caps_issued_mask(caps
, true)) {
4194 if (err
== -EBLACKLISTED
) {
4195 if (in
->oset
.dirty_or_tx
) {
4196 lderr(cct
) << __func__
<< " still has dirty data on " << *in
<< dendl
;
4197 in
->set_async_err(err
);
4199 objectcacher
->purge_set(&in
->oset
);
4201 objectcacher
->release_set(&in
->oset
);
4203 _schedule_invalidate_callback(in
.get(), 0, 0);
4206 signal_cond_list(in
->waitfor_caps
);
4208 s
->flushing_caps_tids
.clear();
4209 sync_cond
.notify_all();
4212 int Client::_do_remount(bool retry_on_error
)
4214 uint64_t max_retries
= g_conf().get_val
<uint64_t>("mds_max_retries_on_remount_failure");
4217 int r
= remount_cb(callback_handle
);
4219 retries_on_invalidate
= 0;
4222 client_t whoami
= get_nodeid();
4225 "failed to remount (to trim kernel dentries): "
4226 "errno = " << e
<< " (" << strerror(e
) << ")" << dendl
;
4229 "failed to remount (to trim kernel dentries): "
4230 "return code = " << r
<< dendl
;
4233 (cct
->_conf
.get_val
<bool>("client_die_on_failed_remount") ||
4234 cct
->_conf
.get_val
<bool>("client_die_on_failed_dentry_invalidate")) &&
4235 !(retry_on_error
&& (++retries_on_invalidate
< max_retries
));
4236 if (should_abort
&& !unmounting
) {
4237 lderr(cct
) << "failed to remount for kernel dentry trimming; quitting!" << dendl
;
4244 class C_Client_Remount
: public Context
{
4248 explicit C_Client_Remount(Client
*c
) : client(c
) {}
4249 void finish(int r
) override
{
4250 ceph_assert(r
== 0);
4251 client
->_do_remount(true);
4255 void Client::_invalidate_kernel_dcache()
4259 if (can_invalidate_dentries
) {
4260 if (dentry_invalidate_cb
&& root
->dir
) {
4261 for (ceph::unordered_map
<string
, Dentry
*>::iterator p
= root
->dir
->dentries
.begin();
4262 p
!= root
->dir
->dentries
.end();
4264 if (p
->second
->inode
)
4265 _schedule_invalidate_dentry_callback(p
->second
, false);
4268 } else if (remount_cb
) {
4270 // when remounting a file system, linux kernel trims all unused dentries in the fs
4271 remount_finisher
.queue(new C_Client_Remount(this));
4275 void Client::_trim_negative_child_dentries(InodeRef
& in
)
4281 if (dir
&& dir
->dentries
.size() == dir
->num_null_dentries
) {
4282 for (auto p
= dir
->dentries
.begin(); p
!= dir
->dentries
.end(); ) {
4283 Dentry
*dn
= p
->second
;
4285 ceph_assert(!dn
->inode
);
4286 if (dn
->lru_is_expireable())
4287 unlink(dn
, true, false); // keep dir, drop dentry
4289 if (dir
->dentries
.empty()) {
4294 if (in
->flags
& I_SNAPDIR_OPEN
) {
4295 InodeRef snapdir
= open_snapdir(in
.get());
4296 _trim_negative_child_dentries(snapdir
);
4300 class C_Client_CacheRelease
: public Context
{
4305 C_Client_CacheRelease(Client
*c
, Inode
*in
) :
4307 if (client
->use_faked_inos())
4308 ino
= vinodeno_t(in
->faked_ino
, CEPH_NOSNAP
);
4312 void finish(int r
) override
{
4313 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
4314 client
->_async_inode_release(ino
);
4318 void Client::_async_inode_release(vinodeno_t ino
)
4322 ldout(cct
, 10) << __func__
<< " " << ino
<< dendl
;
4323 ino_release_cb(callback_handle
, ino
);
4326 void Client::_schedule_ino_release_callback(Inode
*in
) {
4329 // we queue the invalidate, which calls the callback and decrements the ref
4330 async_ino_releasor
.queue(new C_Client_CacheRelease(this, in
));
4333 void Client::trim_caps(MetaSession
*s
, uint64_t max
)
4335 mds_rank_t mds
= s
->mds_num
;
4336 size_t caps_size
= s
->caps
.size();
4337 ldout(cct
, 10) << __func__
<< " mds." << mds
<< " max " << max
4338 << " caps " << caps_size
<< dendl
;
4340 uint64_t trimmed
= 0;
4341 auto p
= s
->caps
.begin();
4342 std::set
<Dentry
*> to_trim
; /* this avoids caps other than the one we're
4343 * looking at from getting deleted during traversal. */
4344 while ((caps_size
- trimmed
) > max
&& !p
.end()) {
4346 InodeRef
in(&cap
->inode
);
4348 // Increment p early because it will be invalidated if cap
4349 // is deleted inside remove_cap
4352 if (in
->caps
.size() > 1 && cap
!= in
->auth_cap
) {
4353 int mine
= cap
->issued
| cap
->implemented
;
4354 int oissued
= in
->auth_cap
? in
->auth_cap
->issued
: 0;
4355 // disposable non-auth cap
4356 if (!(get_caps_used(in
.get()) & ~oissued
& mine
)) {
4357 ldout(cct
, 20) << " removing unused, unneeded non-auth cap on " << *in
<< dendl
;
4358 cap
= (remove_cap(cap
, true), nullptr);
4362 ldout(cct
, 20) << " trying to trim dentries for " << *in
<< dendl
;
4363 _trim_negative_child_dentries(in
);
4365 auto q
= in
->dentries
.begin();
4366 while (q
!= in
->dentries
.end()) {
4369 if (dn
->lru_is_expireable()) {
4370 if (can_invalidate_dentries
&&
4371 dn
->dir
->parent_inode
->ino
== MDS_INO_ROOT
) {
4372 // Only issue one of these per DN for inodes in root: handle
4373 // others more efficiently by calling for root-child DNs at
4374 // the end of this function.
4375 _schedule_invalidate_dentry_callback(dn
, true);
4377 ldout(cct
, 20) << " queueing dentry for trimming: " << dn
->name
<< dendl
;
4380 ldout(cct
, 20) << " not expirable: " << dn
->name
<< dendl
;
4384 if (all
&& in
->ino
!= MDS_INO_ROOT
) {
4385 ldout(cct
, 20) << __func__
<< " counting as trimmed: " << *in
<< dendl
;
4387 _schedule_ino_release_callback(in
.get());
4391 ldout(cct
, 20) << " trimming queued dentries: " << dendl
;
4392 for (const auto &dn
: to_trim
) {
4397 caps_size
= s
->caps
.size();
4398 if (caps_size
> (size_t)max
)
4399 _invalidate_kernel_dcache();
4402 void Client::force_session_readonly(MetaSession
*s
)
4405 for (xlist
<Cap
*>::iterator p
= s
->caps
.begin(); !p
.end(); ++p
) {
4406 auto &in
= (*p
)->inode
;
4407 if (in
.caps_wanted() & CEPH_CAP_FILE_WR
)
4408 signal_cond_list(in
.waitfor_caps
);
4412 int Client::mark_caps_flushing(Inode
*in
, ceph_tid_t
* ptid
)
4414 MetaSession
*session
= in
->auth_cap
->session
;
4416 int flushing
= in
->dirty_caps
;
4417 ceph_assert(flushing
);
4419 ceph_tid_t flush_tid
= ++last_flush_tid
;
4420 in
->flushing_cap_tids
[flush_tid
] = flushing
;
4422 if (!in
->flushing_caps
) {
4423 ldout(cct
, 10) << __func__
<< " " << ccap_string(flushing
) << " " << *in
<< dendl
;
4424 num_flushing_caps
++;
4426 ldout(cct
, 10) << __func__
<< " (more) " << ccap_string(flushing
) << " " << *in
<< dendl
;
4429 in
->flushing_caps
|= flushing
;
4430 in
->mark_caps_clean();
4432 if (!in
->flushing_cap_item
.is_on_list())
4433 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4434 session
->flushing_caps_tids
.insert(flush_tid
);
4440 void Client::adjust_session_flushing_caps(Inode
*in
, MetaSession
*old_s
, MetaSession
*new_s
)
4442 for (auto &p
: in
->cap_snaps
) {
4443 CapSnap
&capsnap
= p
.second
;
4444 if (capsnap
.flush_tid
> 0) {
4445 old_s
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
4446 new_s
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
4449 for (map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4450 it
!= in
->flushing_cap_tids
.end();
4452 old_s
->flushing_caps_tids
.erase(it
->first
);
4453 new_s
->flushing_caps_tids
.insert(it
->first
);
4455 new_s
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4459 * Flush all caps back to the MDS. Because the callers generally wait on the
4460 * result of this function (syncfs and umount cases), we set
4461 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4463 void Client::flush_caps_sync()
4465 ldout(cct
, 10) << __func__
<< dendl
;
4466 xlist
<Inode
*>::iterator p
= delayed_list
.begin();
4468 unsigned flags
= CHECK_CAPS_NODELAY
;
4472 delayed_list
.pop_front();
4473 if (p
.end() && dirty_list
.empty())
4474 flags
|= CHECK_CAPS_SYNCHRONOUS
;
4475 check_caps(in
, flags
);
4479 p
= dirty_list
.begin();
4481 unsigned flags
= CHECK_CAPS_NODELAY
;
4486 flags
|= CHECK_CAPS_SYNCHRONOUS
;
4487 check_caps(in
, flags
);
4491 void Client::wait_sync_caps(Inode
*in
, ceph_tid_t want
)
4493 while (in
->flushing_caps
) {
4494 map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4495 ceph_assert(it
!= in
->flushing_cap_tids
.end());
4496 if (it
->first
> want
)
4498 ldout(cct
, 10) << __func__
<< " on " << *in
<< " flushing "
4499 << ccap_string(it
->second
) << " want " << want
4500 << " last " << it
->first
<< dendl
;
4501 wait_on_list(in
->waitfor_caps
);
4505 void Client::wait_sync_caps(ceph_tid_t want
)
4508 ldout(cct
, 10) << __func__
<< " want " << want
<< " (last is " << last_flush_tid
<< ", "
4509 << num_flushing_caps
<< " total flushing)" << dendl
;
4510 for (auto &p
: mds_sessions
) {
4511 MetaSession
*s
= &p
.second
;
4512 if (s
->flushing_caps_tids
.empty())
4514 ceph_tid_t oldest_tid
= *s
->flushing_caps_tids
.begin();
4515 if (oldest_tid
<= want
) {
4516 ldout(cct
, 10) << " waiting on mds." << p
.first
<< " tid " << oldest_tid
4517 << " (want " << want
<< ")" << dendl
;
4518 std::unique_lock l
{client_lock
, std::adopt_lock
};
4526 void Client::kick_flushing_caps(Inode
*in
, MetaSession
*session
)
4528 in
->flags
&= ~I_KICK_FLUSH
;
4530 Cap
*cap
= in
->auth_cap
;
4531 ceph_assert(cap
->session
== session
);
4533 ceph_tid_t last_snap_flush
= 0;
4534 for (auto p
= in
->flushing_cap_tids
.rbegin();
4535 p
!= in
->flushing_cap_tids
.rend();
4538 last_snap_flush
= p
->first
;
4543 int wanted
= in
->caps_wanted();
4544 int used
= get_caps_used(in
) | in
->caps_dirty();
4545 auto it
= in
->cap_snaps
.begin();
4546 for (auto& p
: in
->flushing_cap_tids
) {
4548 int msg_flags
= p
.first
< last_snap_flush
? MClientCaps::FLAG_PENDING_CAPSNAP
: 0;
4549 send_cap(in
, session
, cap
, msg_flags
, used
, wanted
, (cap
->issued
| cap
->implemented
),
4552 ceph_assert(it
!= in
->cap_snaps
.end());
4553 ceph_assert(it
->second
.flush_tid
== p
.first
);
4554 send_flush_snap(in
, session
, it
->first
, it
->second
);
4560 void Client::kick_flushing_caps(MetaSession
*session
)
4562 mds_rank_t mds
= session
->mds_num
;
4563 ldout(cct
, 10) << __func__
<< " mds." << mds
<< dendl
;
4565 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4567 if (in
->flags
& I_KICK_FLUSH
) {
4568 ldout(cct
, 20) << " reflushing caps on " << *in
<< " to mds." << mds
<< dendl
;
4569 kick_flushing_caps(in
, session
);
4574 void Client::early_kick_flushing_caps(MetaSession
*session
)
4576 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4578 Cap
*cap
= in
->auth_cap
;
4581 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4582 // stage. This guarantees that MDS processes the cap flush message before issuing
4583 // the flushing caps to other client.
4584 if ((in
->flushing_caps
& in
->auth_cap
->issued
) == in
->flushing_caps
) {
4585 in
->flags
|= I_KICK_FLUSH
;
4589 ldout(cct
, 20) << " reflushing caps (early_kick) on " << *in
4590 << " to mds." << session
->mds_num
<< dendl
;
4591 // send_reconnect() also will reset these sequence numbers. make sure
4592 // sequence numbers in cap flush message match later reconnect message.
4596 cap
->issued
= cap
->implemented
;
4598 kick_flushing_caps(in
, session
);
4602 void SnapRealm::build_snap_context()
4604 set
<snapid_t
> snaps
;
4605 snapid_t max_seq
= seq
;
4607 // start with prior_parents?
4608 for (unsigned i
=0; i
<prior_parent_snaps
.size(); i
++)
4609 snaps
.insert(prior_parent_snaps
[i
]);
4611 // current parent's snaps
4613 const SnapContext
& psnapc
= pparent
->get_snap_context();
4614 for (unsigned i
=0; i
<psnapc
.snaps
.size(); i
++)
4615 if (psnapc
.snaps
[i
] >= parent_since
)
4616 snaps
.insert(psnapc
.snaps
[i
]);
4617 if (psnapc
.seq
> max_seq
)
4618 max_seq
= psnapc
.seq
;
4622 for (unsigned i
=0; i
<my_snaps
.size(); i
++)
4623 snaps
.insert(my_snaps
[i
]);
4626 cached_snap_context
.seq
= max_seq
;
4627 cached_snap_context
.snaps
.resize(0);
4628 cached_snap_context
.snaps
.reserve(snaps
.size());
4629 for (set
<snapid_t
>::reverse_iterator p
= snaps
.rbegin(); p
!= snaps
.rend(); ++p
)
4630 cached_snap_context
.snaps
.push_back(*p
);
4633 void Client::invalidate_snaprealm_and_children(SnapRealm
*realm
)
4638 while (!q
.empty()) {
4642 ldout(cct
, 10) << __func__
<< " " << *realm
<< dendl
;
4643 realm
->invalidate_cache();
4645 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
4646 p
!= realm
->pchildren
.end();
4652 SnapRealm
*Client::get_snap_realm(inodeno_t r
)
4654 SnapRealm
*realm
= snap_realms
[r
];
4656 snap_realms
[r
] = realm
= new SnapRealm(r
);
4657 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
4662 SnapRealm
*Client::get_snap_realm_maybe(inodeno_t r
)
4664 if (snap_realms
.count(r
) == 0) {
4665 ldout(cct
, 20) << __func__
<< " " << r
<< " fail" << dendl
;
4668 SnapRealm
*realm
= snap_realms
[r
];
4669 ldout(cct
, 20) << __func__
<< " " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
4674 void Client::put_snap_realm(SnapRealm
*realm
)
4676 ldout(cct
, 20) << __func__
<< " " << realm
->ino
<< " " << realm
4677 << " " << realm
->nref
<< " -> " << (realm
->nref
- 1) << dendl
;
4678 if (--realm
->nref
== 0) {
4679 snap_realms
.erase(realm
->ino
);
4680 if (realm
->pparent
) {
4681 realm
->pparent
->pchildren
.erase(realm
);
4682 put_snap_realm(realm
->pparent
);
4688 bool Client::adjust_realm_parent(SnapRealm
*realm
, inodeno_t parent
)
4690 if (realm
->parent
!= parent
) {
4691 ldout(cct
, 10) << __func__
<< " " << *realm
4692 << " " << realm
->parent
<< " -> " << parent
<< dendl
;
4693 realm
->parent
= parent
;
4694 if (realm
->pparent
) {
4695 realm
->pparent
->pchildren
.erase(realm
);
4696 put_snap_realm(realm
->pparent
);
4698 realm
->pparent
= get_snap_realm(parent
);
4699 realm
->pparent
->pchildren
.insert(realm
);
4705 static bool has_new_snaps(const SnapContext
& old_snapc
,
4706 const SnapContext
& new_snapc
)
4708 return !new_snapc
.snaps
.empty() && new_snapc
.snaps
[0] > old_snapc
.seq
;
4712 void Client::update_snap_trace(const bufferlist
& bl
, SnapRealm
**realm_ret
, bool flush
)
4714 SnapRealm
*first_realm
= NULL
;
4715 ldout(cct
, 10) << __func__
<< " len " << bl
.length() << dendl
;
4717 map
<SnapRealm
*, SnapContext
> dirty_realms
;
4719 auto p
= bl
.cbegin();
4723 SnapRealm
*realm
= get_snap_realm(info
.ino());
4725 bool invalidate
= false;
4727 if (info
.seq() > realm
->seq
) {
4728 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq() << " > " << realm
->seq
4732 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4733 // flush me + children
4736 while (!q
.empty()) {
4737 SnapRealm
*realm
= q
.front();
4740 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
4741 p
!= realm
->pchildren
.end();
4745 if (dirty_realms
.count(realm
) == 0) {
4747 dirty_realms
[realm
] = realm
->get_snap_context();
4753 realm
->seq
= info
.seq();
4754 realm
->created
= info
.created();
4755 realm
->parent_since
= info
.parent_since();
4756 realm
->prior_parent_snaps
= info
.prior_parent_snaps
;
4757 realm
->my_snaps
= info
.my_snaps
;
4761 // _always_ verify parent
4762 if (adjust_realm_parent(realm
, info
.parent()))
4766 invalidate_snaprealm_and_children(realm
);
4767 ldout(cct
, 15) << __func__
<< " " << *realm
<< " self|parent updated" << dendl
;
4768 ldout(cct
, 15) << " snapc " << realm
->get_snap_context() << dendl
;
4770 ldout(cct
, 10) << __func__
<< " " << *realm
<< " seq " << info
.seq()
4771 << " <= " << realm
->seq
<< " and same parent, SKIPPING" << dendl
;
4775 first_realm
= realm
;
4777 put_snap_realm(realm
);
4780 for (map
<SnapRealm
*, SnapContext
>::iterator q
= dirty_realms
.begin();
4781 q
!= dirty_realms
.end();
4783 SnapRealm
*realm
= q
->first
;
4784 // if there are new snaps ?
4785 if (has_new_snaps(q
->second
, realm
->get_snap_context())) {
4786 ldout(cct
, 10) << " flushing caps on " << *realm
<< dendl
;
4787 xlist
<Inode
*>::iterator r
= realm
->inodes_with_caps
.begin();
4791 queue_cap_snap(in
, q
->second
);
4794 ldout(cct
, 10) << " no new snap on " << *realm
<< dendl
;
4796 put_snap_realm(realm
);
4800 *realm_ret
= first_realm
;
4802 put_snap_realm(first_realm
);
4805 void Client::handle_snap(const MConstRef
<MClientSnap
>& m
)
4807 ldout(cct
, 10) << __func__
<< " " << *m
<< dendl
;
4808 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4809 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4814 got_mds_push(session
);
4816 map
<Inode
*, SnapContext
> to_move
;
4817 SnapRealm
*realm
= 0;
4819 if (m
->head
.op
== CEPH_SNAP_OP_SPLIT
) {
4820 ceph_assert(m
->head
.split
);
4822 auto p
= m
->bl
.cbegin();
4824 ceph_assert(info
.ino() == m
->head
.split
);
4826 // flush, then move, ino's.
4827 realm
= get_snap_realm(info
.ino());
4828 ldout(cct
, 10) << " splitting off " << *realm
<< dendl
;
4829 for (auto& ino
: m
->split_inos
) {
4830 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
4831 if (inode_map
.count(vino
)) {
4832 Inode
*in
= inode_map
[vino
];
4833 if (!in
->snaprealm
|| in
->snaprealm
== realm
)
4835 if (in
->snaprealm
->created
> info
.created()) {
4836 ldout(cct
, 10) << " NOT moving " << *in
<< " from _newer_ realm "
4837 << *in
->snaprealm
<< dendl
;
4840 ldout(cct
, 10) << " moving " << *in
<< " from " << *in
->snaprealm
<< dendl
;
4843 in
->snaprealm_item
.remove_myself();
4844 to_move
[in
] = in
->snaprealm
->get_snap_context();
4845 put_snap_realm(in
->snaprealm
);
4849 // move child snaprealms, too
4850 for (auto& child_realm
: m
->split_realms
) {
4851 ldout(cct
, 10) << "adjusting snaprealm " << child_realm
<< " parent" << dendl
;
4852 SnapRealm
*child
= get_snap_realm_maybe(child_realm
);
4855 adjust_realm_parent(child
, realm
->ino
);
4856 put_snap_realm(child
);
4860 update_snap_trace(m
->bl
, NULL
, m
->head
.op
!= CEPH_SNAP_OP_DESTROY
);
4863 for (auto p
= to_move
.begin(); p
!= to_move
.end(); ++p
) {
4864 Inode
*in
= p
->first
;
4865 in
->snaprealm
= realm
;
4866 realm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4868 // queue for snap writeback
4869 if (has_new_snaps(p
->second
, realm
->get_snap_context()))
4870 queue_cap_snap(in
, p
->second
);
4872 put_snap_realm(realm
);
4876 void Client::handle_quota(const MConstRef
<MClientQuota
>& m
)
4878 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4879 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4884 got_mds_push(session
);
4886 ldout(cct
, 10) << __func__
<< " " << *m
<< " from mds." << mds
<< dendl
;
4888 vinodeno_t
vino(m
->ino
, CEPH_NOSNAP
);
4889 if (inode_map
.count(vino
)) {
4891 in
= inode_map
[vino
];
4894 in
->quota
= m
->quota
;
4895 in
->rstat
= m
->rstat
;
4900 void Client::handle_caps(const MConstRef
<MClientCaps
>& m
)
4902 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4903 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4908 if (m
->osd_epoch_barrier
&& !objecter
->have_map(m
->osd_epoch_barrier
)) {
4909 // Pause RADOS operations until we see the required epoch
4910 objecter
->set_epoch_barrier(m
->osd_epoch_barrier
);
4913 if (m
->osd_epoch_barrier
> cap_epoch_barrier
) {
4914 // Record the barrier so that we will transmit it to MDS when releasing
4915 set_cap_epoch_barrier(m
->osd_epoch_barrier
);
4918 got_mds_push(session
);
4921 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
4922 if (auto it
= inode_map
.find(vino
); it
!= inode_map
.end()) {
4925 if (m
->get_op() == CEPH_CAP_OP_IMPORT
) {
4926 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< " on IMPORT, immediately releasing" << dendl
;
4927 session
->enqueue_cap_release(
4934 ldout(cct
, 5) << __func__
<< " don't have vino " << vino
<< ", dropping" << dendl
;
4937 // in case the mds is waiting on e.g. a revocation
4938 flush_cap_releases();
4942 switch (m
->get_op()) {
4943 case CEPH_CAP_OP_EXPORT
: return handle_cap_export(session
, in
, m
);
4944 case CEPH_CAP_OP_FLUSHSNAP_ACK
: return handle_cap_flushsnap_ack(session
, in
, m
);
4945 case CEPH_CAP_OP_IMPORT
: /* no return */ handle_cap_import(session
, in
, m
);
4948 if (auto it
= in
->caps
.find(mds
); it
!= in
->caps
.end()) {
4949 Cap
&cap
= in
->caps
.at(mds
);
4951 switch (m
->get_op()) {
4952 case CEPH_CAP_OP_TRUNC
: return handle_cap_trunc(session
, in
, m
);
4953 case CEPH_CAP_OP_IMPORT
:
4954 case CEPH_CAP_OP_REVOKE
:
4955 case CEPH_CAP_OP_GRANT
: return handle_cap_grant(session
, in
, &cap
, m
);
4956 case CEPH_CAP_OP_FLUSH_ACK
: return handle_cap_flush_ack(session
, in
, &cap
, m
);
4959 ldout(cct
, 5) << __func__
<< " don't have " << *in
<< " cap on mds." << mds
<< dendl
;
4964 void Client::handle_cap_import(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
4966 mds_rank_t mds
= session
->mds_num
;
4968 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
4969 << " IMPORT from mds." << mds
<< dendl
;
4971 const mds_rank_t peer_mds
= mds_rank_t(m
->peer
.mds
);
4974 if (auto it
= in
->caps
.find(peer_mds
); m
->peer
.cap_id
&& it
!= in
->caps
.end()) {
4976 cap_perms
= cap
->latest_perms
;
4980 SnapRealm
*realm
= NULL
;
4981 update_snap_trace(m
->snapbl
, &realm
);
4983 int issued
= m
->get_caps();
4984 int wanted
= m
->get_wanted();
4985 add_update_cap(in
, session
, m
->get_cap_id(),
4986 issued
, wanted
, m
->get_seq(), m
->get_mseq(),
4987 m
->get_realm(), CEPH_CAP_FLAG_AUTH
, cap_perms
);
4989 if (cap
&& cap
->cap_id
== m
->peer
.cap_id
) {
4990 remove_cap(cap
, (m
->peer
.flags
& CEPH_CAP_FLAG_RELEASE
));
4994 put_snap_realm(realm
);
4996 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
4997 if (!(wanted
& CEPH_CAP_ANY_FILE_WR
) ||
4998 in
->requested_max_size
> m
->get_max_size()) {
4999 in
->requested_max_size
= 0;
5000 ldout(cct
, 15) << "reset requested_max_size after cap import" << dendl
;
5002 // reflush any/all caps (if we are now the auth_cap)
5003 kick_flushing_caps(in
, session
);
5007 void Client::handle_cap_export(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5009 mds_rank_t mds
= session
->mds_num
;
5011 ldout(cct
, 5) << __func__
<< " ino " << m
->get_ino() << " mseq " << m
->get_mseq()
5012 << " EXPORT from mds." << mds
<< dendl
;
5014 auto it
= in
->caps
.find(mds
);
5015 if (it
!= in
->caps
.end()) {
5016 Cap
&cap
= it
->second
;
5017 if (cap
.cap_id
== m
->get_cap_id()) {
5018 if (m
->peer
.cap_id
) {
5019 const auto peer_mds
= mds_rank_t(m
->peer
.mds
);
5020 MetaSession
*tsession
= _get_or_open_mds_session(peer_mds
);
5021 auto it
= in
->caps
.find(peer_mds
);
5022 if (it
!= in
->caps
.end()) {
5023 Cap
&tcap
= it
->second
;
5024 if (tcap
.cap_id
== m
->peer
.cap_id
&&
5025 ceph_seq_cmp(tcap
.seq
, m
->peer
.seq
) < 0) {
5026 tcap
.cap_id
= m
->peer
.cap_id
;
5027 tcap
.seq
= m
->peer
.seq
- 1;
5028 tcap
.issue_seq
= tcap
.seq
;
5029 tcap
.issued
|= cap
.issued
;
5030 tcap
.implemented
|= cap
.issued
;
5031 if (&cap
== in
->auth_cap
)
5032 in
->auth_cap
= &tcap
;
5033 if (in
->auth_cap
== &tcap
&& in
->flushing_cap_item
.is_on_list())
5034 adjust_session_flushing_caps(in
, session
, tsession
);
5037 add_update_cap(in
, tsession
, m
->peer
.cap_id
, cap
.issued
, 0,
5038 m
->peer
.seq
- 1, m
->peer
.mseq
, (uint64_t)-1,
5039 &cap
== in
->auth_cap
? CEPH_CAP_FLAG_AUTH
: 0,
5043 if (cap
.wanted
| cap
.issued
)
5044 in
->flags
|= I_CAP_DROPPED
;
5047 remove_cap(&cap
, false);
5052 void Client::handle_cap_trunc(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5054 mds_rank_t mds
= session
->mds_num
;
5055 ceph_assert(in
->caps
.count(mds
));
5057 ldout(cct
, 10) << __func__
<< " on ino " << *in
5058 << " size " << in
->size
<< " -> " << m
->get_size()
5062 in
->caps_issued(&issued
);
5063 issued
|= in
->caps_dirty();
5064 update_inode_file_size(in
, issued
, m
->get_size(),
5065 m
->get_truncate_seq(), m
->get_truncate_size());
5068 void Client::handle_cap_flush_ack(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
5070 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
5071 int dirty
= m
->get_dirty();
5075 auto it
= in
->flushing_cap_tids
.begin();
5076 if (it
->first
< flush_ack_tid
) {
5077 ldout(cct
, 0) << __func__
<< " mds." << session
->mds_num
5078 << " got unexpected flush ack tid " << flush_ack_tid
5079 << " expected is " << it
->first
<< dendl
;
5081 for (; it
!= in
->flushing_cap_tids
.end(); ) {
5087 if (it
->first
== flush_ack_tid
)
5088 cleaned
= it
->second
;
5089 if (it
->first
<= flush_ack_tid
) {
5090 session
->flushing_caps_tids
.erase(it
->first
);
5091 in
->flushing_cap_tids
.erase(it
++);
5095 cleaned
&= ~it
->second
;
5101 ldout(cct
, 5) << __func__
<< " mds." << session
->mds_num
5102 << " cleaned " << ccap_string(cleaned
) << " on " << *in
5103 << " with " << ccap_string(dirty
) << dendl
;
5106 signal_cond_list(in
->waitfor_caps
);
5107 if (session
->flushing_caps_tids
.empty() ||
5108 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5109 sync_cond
.notify_all();
5113 in
->cap_dirtier_uid
= -1;
5114 in
->cap_dirtier_gid
= -1;
5118 ldout(cct
, 10) << " tid " << m
->get_client_tid() << " != any cap bit tids" << dendl
;
5120 if (in
->flushing_caps
) {
5121 ldout(cct
, 5) << " flushing_caps " << ccap_string(in
->flushing_caps
)
5122 << " -> " << ccap_string(in
->flushing_caps
& ~cleaned
) << dendl
;
5123 in
->flushing_caps
&= ~cleaned
;
5124 if (in
->flushing_caps
== 0) {
5125 ldout(cct
, 10) << " " << *in
<< " !flushing" << dendl
;
5126 num_flushing_caps
--;
5127 if (in
->flushing_cap_tids
.empty())
5128 in
->flushing_cap_item
.remove_myself();
5130 if (!in
->caps_dirty())
5137 void Client::handle_cap_flushsnap_ack(MetaSession
*session
, Inode
*in
, const MConstRef
<MClientCaps
>& m
)
5139 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
5140 mds_rank_t mds
= session
->mds_num
;
5141 ceph_assert(in
->caps
.count(mds
));
5142 snapid_t follows
= m
->get_snap_follows();
5144 if (auto it
= in
->cap_snaps
.find(follows
); it
!= in
->cap_snaps
.end()) {
5145 auto& capsnap
= it
->second
;
5146 if (flush_ack_tid
!= capsnap
.flush_tid
) {
5147 ldout(cct
, 10) << " tid " << flush_ack_tid
<< " != " << capsnap
.flush_tid
<< dendl
;
5149 InodeRef
tmp_ref(in
);
5150 ldout(cct
, 5) << __func__
<< " mds." << mds
<< " flushed snap follows " << follows
5151 << " on " << *in
<< dendl
;
5152 session
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
5153 in
->flushing_cap_tids
.erase(capsnap
.flush_tid
);
5154 if (in
->flushing_caps
== 0 && in
->flushing_cap_tids
.empty())
5155 in
->flushing_cap_item
.remove_myself();
5156 in
->cap_snaps
.erase(it
);
5158 signal_cond_list(in
->waitfor_caps
);
5159 if (session
->flushing_caps_tids
.empty() ||
5160 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
5161 sync_cond
.notify_all();
5164 ldout(cct
, 5) << __func__
<< " DUP(?) mds." << mds
<< " flushed snap follows " << follows
5165 << " on " << *in
<< dendl
;
5166 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
5170 class C_Client_DentryInvalidate
: public Context
{
5177 C_Client_DentryInvalidate(Client
*c
, Dentry
*dn
, bool del
) :
5178 client(c
), name(dn
->name
) {
5179 if (client
->use_faked_inos()) {
5180 dirino
.ino
= dn
->dir
->parent_inode
->faked_ino
;
5182 ino
.ino
= dn
->inode
->faked_ino
;
5184 dirino
= dn
->dir
->parent_inode
->vino();
5186 ino
= dn
->inode
->vino();
5189 ino
.ino
= inodeno_t();
5191 void finish(int r
) override
{
5192 // _async_dentry_invalidate is responsible for its own locking
5193 ceph_assert(ceph_mutex_is_not_locked_by_me(client
->client_lock
));
5194 client
->_async_dentry_invalidate(dirino
, ino
, name
);
5198 void Client::_async_dentry_invalidate(vinodeno_t dirino
, vinodeno_t ino
, string
& name
)
5202 ldout(cct
, 10) << __func__
<< " '" << name
<< "' ino " << ino
5203 << " in dir " << dirino
<< dendl
;
5204 dentry_invalidate_cb(callback_handle
, dirino
, ino
, name
.c_str(), name
.length());
5207 void Client::_schedule_invalidate_dentry_callback(Dentry
*dn
, bool del
)
5209 if (dentry_invalidate_cb
&& dn
->inode
->ll_ref
> 0)
5210 async_dentry_invalidator
.queue(new C_Client_DentryInvalidate(this, dn
, del
));
5213 void Client::_try_to_trim_inode(Inode
*in
, bool sched_inval
)
5215 int ref
= in
->get_num_ref();
5216 ldout(cct
, 5) << __func__
<< " in " << *in
<<dendl
;
5218 if (in
->dir
&& !in
->dir
->dentries
.empty()) {
5219 for (auto p
= in
->dir
->dentries
.begin();
5220 p
!= in
->dir
->dentries
.end(); ) {
5221 Dentry
*dn
= p
->second
;
5223 /* rmsnap removes whole subtree, need trim inodes recursively.
5224 * we don't need to invalidate dentries recursively. because
5225 * invalidating a directory dentry effectively invalidate
5227 if (in
->snapid
!= CEPH_NOSNAP
&& dn
->inode
&& dn
->inode
->is_dir())
5228 _try_to_trim_inode(dn
->inode
.get(), false);
5230 if (dn
->lru_is_expireable())
5231 unlink(dn
, true, false); // keep dir, drop dentry
5233 if (in
->dir
->dentries
.empty()) {
5239 if (ref
> 0 && (in
->flags
& I_SNAPDIR_OPEN
)) {
5240 InodeRef snapdir
= open_snapdir(in
);
5241 _try_to_trim_inode(snapdir
.get(), false);
5246 auto q
= in
->dentries
.begin();
5247 while (q
!= in
->dentries
.end()) {
5250 if( in
->ll_ref
> 0 && sched_inval
) {
5251 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5252 // so in->dentries doesn't always reflect the state of kernel's dcache.
5253 _schedule_invalidate_dentry_callback(dn
, true);
5255 unlink(dn
, true, true);
5260 void Client::handle_cap_grant(MetaSession
*session
, Inode
*in
, Cap
*cap
, const MConstRef
<MClientCaps
>& m
)
5262 mds_rank_t mds
= session
->mds_num
;
5263 int used
= get_caps_used(in
);
5264 int wanted
= in
->caps_wanted();
5266 const unsigned new_caps
= m
->get_caps();
5267 const bool was_stale
= session
->cap_gen
> cap
->gen
;
5268 ldout(cct
, 5) << __func__
<< " on in " << m
->get_ino()
5269 << " mds." << mds
<< " seq " << m
->get_seq()
5270 << " caps now " << ccap_string(new_caps
)
5271 << " was " << ccap_string(cap
->issued
)
5272 << (was_stale
? " (stale)" : "") << dendl
;
5275 cap
->issued
= cap
->implemented
= CEPH_CAP_PIN
;
5276 cap
->seq
= m
->get_seq();
5277 cap
->gen
= session
->cap_gen
;
5279 check_cap_issue(in
, new_caps
);
5283 in
->caps_issued(&issued
);
5284 issued
|= in
->caps_dirty();
5286 if ((new_caps
& CEPH_CAP_AUTH_SHARED
) &&
5287 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
5288 in
->mode
= m
->head
.mode
;
5289 in
->uid
= m
->head
.uid
;
5290 in
->gid
= m
->head
.gid
;
5291 in
->btime
= m
->btime
;
5293 bool deleted_inode
= false;
5294 if ((new_caps
& CEPH_CAP_LINK_SHARED
) &&
5295 !(issued
& CEPH_CAP_LINK_EXCL
)) {
5296 in
->nlink
= m
->head
.nlink
;
5297 if (in
->nlink
== 0 &&
5298 (new_caps
& (CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
)))
5299 deleted_inode
= true;
5301 if (!(issued
& CEPH_CAP_XATTR_EXCL
) &&
5302 m
->xattrbl
.length() &&
5303 m
->head
.xattr_version
> in
->xattr_version
) {
5304 auto p
= m
->xattrbl
.cbegin();
5305 decode(in
->xattrs
, p
);
5306 in
->xattr_version
= m
->head
.xattr_version
;
5309 if ((new_caps
& CEPH_CAP_FILE_SHARED
) && m
->dirstat_is_valid()) {
5310 in
->dirstat
.nfiles
= m
->get_nfiles();
5311 in
->dirstat
.nsubdirs
= m
->get_nsubdirs();
5314 if (new_caps
& CEPH_CAP_ANY_RD
) {
5315 update_inode_file_time(in
, issued
, m
->get_time_warp_seq(),
5316 m
->get_ctime(), m
->get_mtime(), m
->get_atime());
5319 if (new_caps
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
)) {
5320 in
->layout
= m
->get_layout();
5321 update_inode_file_size(in
, issued
, m
->get_size(),
5322 m
->get_truncate_seq(), m
->get_truncate_size());
5325 if (m
->inline_version
> in
->inline_version
) {
5326 in
->inline_data
= m
->inline_data
;
5327 in
->inline_version
= m
->inline_version
;
5330 /* always take a newer change attr */
5331 if (m
->get_change_attr() > in
->change_attr
)
5332 in
->change_attr
= m
->get_change_attr();
5335 if (cap
== in
->auth_cap
&&
5336 (new_caps
& CEPH_CAP_ANY_FILE_WR
) &&
5337 (m
->get_max_size() != in
->max_size
)) {
5338 ldout(cct
, 10) << "max_size " << in
->max_size
<< " -> " << m
->get_max_size() << dendl
;
5339 in
->max_size
= m
->get_max_size();
5340 if (in
->max_size
> in
->wanted_max_size
) {
5341 in
->wanted_max_size
= 0;
5342 in
->requested_max_size
= 0;
5347 if ((was_stale
|| m
->get_op() == CEPH_CAP_OP_IMPORT
) &&
5348 (wanted
& ~(cap
->wanted
| new_caps
))) {
5349 // If mds is importing cap, prior cap messages that update 'wanted'
5350 // may get dropped by mds (migrate seq mismatch).
5352 // We don't send cap message to update 'wanted' if what we want are
5353 // already issued. If mds revokes caps, cap message that releases caps
5354 // also tells mds what we want. But if caps got revoked by mds forcedly
5355 // (session stale). We may haven't told mds what we want.
5361 auto revoked
= cap
->issued
& ~new_caps
;
5363 ldout(cct
, 10) << " revocation of " << ccap_string(revoked
) << dendl
;
5364 cap
->issued
= new_caps
;
5365 cap
->implemented
|= new_caps
;
5367 // recall delegations if we're losing caps necessary for them
5368 if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_RD
))
5369 in
->recall_deleg(false);
5370 else if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_WR
))
5371 in
->recall_deleg(true);
5373 used
= adjust_caps_used_for_lazyio(used
, cap
->issued
, cap
->implemented
);
5374 if ((used
& revoked
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
)) &&
5375 !_flush(in
, new C_Client_FlushComplete(this, in
))) {
5376 // waitin' for flush
5377 } else if (used
& revoked
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
)) {
5381 cap
->wanted
= 0; // don't let check_caps skip sending a response to MDS
5384 } else if (cap
->issued
== new_caps
) {
5385 ldout(cct
, 10) << " caps unchanged at " << ccap_string(cap
->issued
) << dendl
;
5387 ldout(cct
, 10) << " grant, new caps are " << ccap_string(new_caps
& ~cap
->issued
) << dendl
;
5388 cap
->issued
= new_caps
;
5389 cap
->implemented
|= new_caps
;
5391 if (cap
== in
->auth_cap
) {
5392 // non-auth MDS is revoking the newly grant caps ?
5393 for (const auto &p
: in
->caps
) {
5394 if (&p
.second
== cap
)
5396 if (p
.second
.implemented
& ~p
.second
.issued
& new_caps
) {
5409 signal_cond_list(in
->waitfor_caps
);
5411 // may drop inode's last ref
5413 _try_to_trim_inode(in
, true);
5416 int Client::inode_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
5418 if (perms
.uid() == 0)
5421 if (perms
.uid() != in
->uid
&& (in
->mode
& S_IRWXG
)) {
5422 int ret
= _posix_acl_permission(in
, perms
, want
);
5427 // check permissions before doing anything else
5428 if (!in
->check_mode(perms
, want
))
5433 int Client::xattr_permission(Inode
*in
, const char *name
, unsigned want
,
5434 const UserPerm
& perms
)
5436 int r
= _getattr_for_perm(in
, perms
);
5441 if (strncmp(name
, "system.", 7) == 0) {
5442 if ((want
& MAY_WRITE
) && (perms
.uid() != 0 && perms
.uid() != in
->uid
))
5445 r
= inode_permission(in
, perms
, want
);
5448 ldout(cct
, 5) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5452 ostream
& operator<<(ostream
&out
, const UserPerm
& perm
) {
5453 out
<< "UserPerm(uid: " << perm
.uid() << ", gid: " << perm
.gid() << ")";
5457 int Client::may_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
5458 const UserPerm
& perms
)
5460 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5461 int r
= _getattr_for_perm(in
, perms
);
5465 if (mask
& CEPH_SETATTR_SIZE
) {
5466 r
= inode_permission(in
, perms
, MAY_WRITE
);
5472 if (mask
& CEPH_SETATTR_UID
) {
5473 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
|| stx
->stx_uid
!= in
->uid
))
5476 if (mask
& CEPH_SETATTR_GID
) {
5477 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
||
5478 (!perms
.gid_in_groups(stx
->stx_gid
) && stx
->stx_gid
!= in
->gid
)))
5482 if (mask
& CEPH_SETATTR_MODE
) {
5483 if (perms
.uid() != 0 && perms
.uid() != in
->uid
)
5486 gid_t i_gid
= (mask
& CEPH_SETATTR_GID
) ? stx
->stx_gid
: in
->gid
;
5487 if (perms
.uid() != 0 && !perms
.gid_in_groups(i_gid
))
5488 stx
->stx_mode
&= ~S_ISGID
;
5491 if (mask
& (CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
|
5492 CEPH_SETATTR_MTIME
| CEPH_SETATTR_ATIME
)) {
5493 if (perms
.uid() != 0 && perms
.uid() != in
->uid
) {
5494 int check_mask
= CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
;
5495 if (!(mask
& CEPH_SETATTR_MTIME_NOW
))
5496 check_mask
|= CEPH_SETATTR_MTIME
;
5497 if (!(mask
& CEPH_SETATTR_ATIME_NOW
))
5498 check_mask
|= CEPH_SETATTR_ATIME
;
5499 if (check_mask
& mask
) {
5502 r
= inode_permission(in
, perms
, MAY_WRITE
);
5510 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5514 int Client::may_open(Inode
*in
, int flags
, const UserPerm
& perms
)
5516 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5519 if ((flags
& O_ACCMODE
) == O_WRONLY
)
5521 else if ((flags
& O_ACCMODE
) == O_RDWR
)
5522 want
= MAY_READ
| MAY_WRITE
;
5523 else if ((flags
& O_ACCMODE
) == O_RDONLY
)
5525 if (flags
& O_TRUNC
)
5529 switch (in
->mode
& S_IFMT
) {
5534 if (want
& MAY_WRITE
) {
5541 r
= _getattr_for_perm(in
, perms
);
5545 r
= inode_permission(in
, perms
, want
);
5547 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5551 int Client::may_lookup(Inode
*dir
, const UserPerm
& perms
)
5553 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5554 int r
= _getattr_for_perm(dir
, perms
);
5558 r
= inode_permission(dir
, perms
, MAY_EXEC
);
5560 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5564 int Client::may_create(Inode
*dir
, const UserPerm
& perms
)
5566 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5567 int r
= _getattr_for_perm(dir
, perms
);
5571 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5573 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5577 int Client::may_delete(Inode
*dir
, const char *name
, const UserPerm
& perms
)
5579 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << "; name " << name
<< "; " << perms
<< dendl
;
5580 int r
= _getattr_for_perm(dir
, perms
);
5584 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5588 /* 'name == NULL' means rmsnap */
5589 if (perms
.uid() != 0 && name
&& (dir
->mode
& S_ISVTX
)) {
5591 r
= _lookup(dir
, name
, CEPH_CAP_AUTH_SHARED
, &otherin
, perms
);
5594 if (dir
->uid
!= perms
.uid() && otherin
->uid
!= perms
.uid())
5598 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5602 int Client::may_hardlink(Inode
*in
, const UserPerm
& perms
)
5604 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5605 int r
= _getattr_for_perm(in
, perms
);
5609 if (perms
.uid() == 0 || perms
.uid() == in
->uid
) {
5615 if (!S_ISREG(in
->mode
))
5618 if (in
->mode
& S_ISUID
)
5621 if ((in
->mode
& (S_ISGID
| S_IXGRP
)) == (S_ISGID
| S_IXGRP
))
5624 r
= inode_permission(in
, perms
, MAY_READ
| MAY_WRITE
);
5626 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5630 int Client::_getattr_for_perm(Inode
*in
, const UserPerm
& perms
)
5632 int mask
= CEPH_STAT_CAP_MODE
;
5634 if (acl_type
!= NO_ACL
) {
5635 mask
|= CEPH_STAT_CAP_XATTR
;
5636 force
= in
->xattr_version
== 0;
5638 return _getattr(in
, mask
, perms
, force
);
5641 vinodeno_t
Client::_get_vino(Inode
*in
)
5643 /* The caller must hold the client lock */
5644 return vinodeno_t(in
->ino
, in
->snapid
);
5648 * Resolve an MDS spec to a list of MDS daemon GIDs.
5650 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5651 * It may be '*' in which case it matches all GIDs.
5653 * If no error is returned, the `targets` vector will be populated with at least
5656 int Client::resolve_mds(
5657 const std::string
&mds_spec
,
5658 std::vector
<mds_gid_t
> *targets
)
5661 ceph_assert(targets
!= nullptr);
5664 std::stringstream ss
;
5665 int role_r
= fsmap
->parse_role(mds_spec
, &role
, ss
);
5667 // We got a role, resolve it to a GID
5668 ldout(cct
, 10) << __func__
<< ": resolved '" << mds_spec
<< "' to role '"
5669 << role
<< "'" << dendl
;
5671 fsmap
->get_filesystem(role
.fscid
)->mds_map
.get_info(role
.rank
).global_id
);
5675 std::string strtol_err
;
5676 long long rank_or_gid
= strict_strtoll(mds_spec
.c_str(), 10, &strtol_err
);
5677 if (strtol_err
.empty()) {
5678 // It is a possible GID
5679 const mds_gid_t mds_gid
= mds_gid_t(rank_or_gid
);
5680 if (fsmap
->gid_exists(mds_gid
)) {
5681 ldout(cct
, 10) << __func__
<< ": validated GID " << mds_gid
<< dendl
;
5682 targets
->push_back(mds_gid
);
5684 lderr(cct
) << __func__
<< ": GID " << mds_gid
<< " not in MDS map"
5688 } else if (mds_spec
== "*") {
5689 // It is a wildcard: use all MDSs
5690 const auto mds_info
= fsmap
->get_mds_info();
5692 if (mds_info
.empty()) {
5693 lderr(cct
) << __func__
<< ": * passed but no MDS daemons found" << dendl
;
5697 for (const auto i
: mds_info
) {
5698 targets
->push_back(i
.first
);
5701 // It did not parse as an integer, it is not a wildcard, it must be a name
5702 const mds_gid_t mds_gid
= fsmap
->find_mds_gid_by_name(mds_spec
);
5704 lderr(cct
) << "MDS ID '" << mds_spec
<< "' not found" << dendl
;
5706 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
5710 ldout(cct
, 10) << __func__
<< ": resolved ID '" << mds_spec
5711 << "' to GID " << mds_gid
<< dendl
;
5712 targets
->push_back(mds_gid
);
5721 * Authenticate with mon and establish global ID
5723 int Client::authenticate()
5725 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
5727 if (monclient
->is_authenticated()) {
5731 client_lock
.unlock();
5732 int r
= monclient
->authenticate(cct
->_conf
->client_mount_timeout
);
5738 whoami
= monclient
->get_global_id();
5739 messenger
->set_myname(entity_name_t::CLIENT(whoami
.v
));
5744 int Client::fetch_fsmap(bool user
)
5747 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5748 // rather than MDSMap because no one MDSMap contains all the daemons, and
5749 // a `tell` can address any daemon.
5750 version_t fsmap_latest
;
5753 monclient
->get_version("fsmap", &fsmap_latest
, NULL
, &cond
);
5754 client_lock
.unlock();
5757 } while (r
== -EAGAIN
);
5760 lderr(cct
) << "Failed to learn FSMap version: " << cpp_strerror(r
) << dendl
;
5764 ldout(cct
, 10) << __func__
<< " learned FSMap version " << fsmap_latest
<< dendl
;
5767 if (!fsmap_user
|| fsmap_user
->get_epoch() < fsmap_latest
) {
5768 monclient
->sub_want("fsmap.user", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
5769 monclient
->renew_subs();
5770 wait_on_list(waiting_for_fsmap
);
5772 ceph_assert(fsmap_user
);
5773 ceph_assert(fsmap_user
->get_epoch() >= fsmap_latest
);
5775 if (!fsmap
|| fsmap
->get_epoch() < fsmap_latest
) {
5776 monclient
->sub_want("fsmap", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
5777 monclient
->renew_subs();
5778 wait_on_list(waiting_for_fsmap
);
5781 ceph_assert(fsmap
->get_epoch() >= fsmap_latest
);
5783 ldout(cct
, 10) << __func__
<< " finished waiting for FSMap version "
5784 << fsmap_latest
<< dendl
;
5790 * @mds_spec one of ID, rank, GID, "*"
5793 int Client::mds_command(
5794 const std::string
&mds_spec
,
5795 const vector
<string
>& cmd
,
5796 const bufferlist
& inbl
,
5801 std::lock_guard
lock(client_lock
);
5812 r
= fetch_fsmap(false);
5817 // Look up MDS target(s) of the command
5818 std::vector
<mds_gid_t
> targets
;
5819 r
= resolve_mds(mds_spec
, &targets
);
5824 // If daemons are laggy, we won't send them commands. If all
5825 // are laggy then we fail.
5826 std::vector
<mds_gid_t
> non_laggy
;
5827 for (const auto gid
: targets
) {
5828 const auto info
= fsmap
->get_info_gid(gid
);
5829 if (!info
.laggy()) {
5830 non_laggy
.push_back(gid
);
5833 if (non_laggy
.size() == 0) {
5834 *outs
= "All targeted MDS daemons are laggy";
5838 if (metadata
.empty()) {
5839 // We are called on an unmounted client, so metadata
5840 // won't be initialized yet.
5841 populate_metadata("");
5844 // Send commands to targets
5845 C_GatherBuilder
gather(cct
, onfinish
);
5846 for (const auto target_gid
: non_laggy
) {
5847 const auto info
= fsmap
->get_info_gid(target_gid
);
5849 // Open a connection to the target MDS
5850 ConnectionRef conn
= messenger
->connect_to_mds(info
.get_addrs());
5852 // Generate MDSCommandOp state
5853 auto &op
= command_table
.start_command();
5855 op
.on_finish
= gather
.new_sub();
5860 op
.mds_gid
= target_gid
;
5863 ldout(cct
, 4) << __func__
<< ": new command op to " << target_gid
5864 << " tid=" << op
.tid
<< cmd
<< dendl
;
5866 // Construct and send MCommand
5867 auto m
= op
.get_message(monclient
->get_fsid());
5868 conn
->send_message2(std::move(m
));
5875 void Client::handle_command_reply(const MConstRef
<MCommandReply
>& m
)
5877 ceph_tid_t
const tid
= m
->get_tid();
5879 ldout(cct
, 10) << __func__
<< ": tid=" << m
->get_tid() << dendl
;
5881 if (!command_table
.exists(tid
)) {
5882 ldout(cct
, 1) << __func__
<< ": unknown tid " << tid
<< ", dropping" << dendl
;
5886 auto &op
= command_table
.get_command(tid
);
5888 *op
.outbl
= m
->get_data();
5895 op
.on_finish
->complete(m
->r
);
5898 command_table
.erase(tid
);
5901 // -------------------
5904 int Client::subscribe_mdsmap(const std::string
&fs_name
)
5906 int r
= authenticate();
5908 lderr(cct
) << "authentication failed: " << cpp_strerror(r
) << dendl
;
5912 std::string resolved_fs_name
;
5913 if (fs_name
.empty()) {
5914 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_fs");
5915 if (resolved_fs_name
.empty())
5916 // Try the backwards compatibility fs name option
5917 resolved_fs_name
= cct
->_conf
.get_val
<std::string
>("client_mds_namespace");
5919 resolved_fs_name
= fs_name
;
5922 std::string want
= "mdsmap";
5923 if (!resolved_fs_name
.empty()) {
5924 r
= fetch_fsmap(true);
5927 fscid
= fsmap_user
->get_fs_cid(resolved_fs_name
);
5928 if (fscid
== FS_CLUSTER_ID_NONE
) {
5932 std::ostringstream oss
;
5933 oss
<< want
<< "." << fscid
;
5936 ldout(cct
, 10) << "Subscribing to map '" << want
<< "'" << dendl
;
5938 monclient
->sub_want(want
, 0, 0);
5939 monclient
->renew_subs();
5944 int Client::mount(const std::string
&mount_root
, const UserPerm
& perms
,
5945 bool require_mds
, const std::string
&fs_name
)
5947 std::lock_guard
lock(client_lock
);
5950 ldout(cct
, 5) << "already mounted" << dendl
;
5956 int r
= subscribe_mdsmap(fs_name
);
5958 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
5962 tick(); // start tick
5966 auto availability
= mdsmap
->is_cluster_available();
5967 if (availability
== MDSMap::STUCK_UNAVAILABLE
) {
5969 ldout(cct
, 10) << "mds cluster unavailable: epoch=" << mdsmap
->get_epoch() << dendl
;
5970 return CEPH_FUSE_NO_MDS_UP
;
5971 } else if (availability
== MDSMap::AVAILABLE
) {
5972 // Continue to mount
5974 } else if (availability
== MDSMap::TRANSIENT_UNAVAILABLE
) {
5975 // Else, wait. MDSMonitor will update the map to bring
5976 // us to a conclusion eventually.
5977 wait_on_list(waiting_for_mdsmap
);
5979 // Unexpected value!
5985 populate_metadata(mount_root
.empty() ? "/" : mount_root
);
5987 filepath
fp(CEPH_INO_ROOT
);
5988 if (!mount_root
.empty()) {
5989 fp
= filepath(mount_root
.c_str());
5992 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
5993 req
->set_filepath(fp
);
5994 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
5995 int res
= make_request(req
, perms
);
5997 if (res
== -EACCES
&& root
) {
5998 ldout(cct
, 1) << __func__
<< " EACCES on parent of mount point; quotas may not work" << dendl
;
6016 if (!cct
->_conf
->client_trace
.empty()) {
6017 traceout
.open(cct
->_conf
->client_trace
.c_str());
6018 if (traceout
.is_open()) {
6019 ldout(cct
, 1) << "opened trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6021 ldout(cct
, 1) << "FAILED to open trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6026 ldout(cct, 3) << "op: // client trace data structs" << dendl;
6027 ldout(cct, 3) << "op: struct stat st;" << dendl;
6028 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
6029 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
6030 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
6031 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
6032 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
6033 ldout(cct, 3) << "op: int fd;" << dendl;
6040 void Client::_close_sessions()
6042 for (auto it
= mds_sessions
.begin(); it
!= mds_sessions
.end(); ) {
6043 if (it
->second
.state
== MetaSession::STATE_REJECTED
)
6044 mds_sessions
.erase(it
++);
6049 while (!mds_sessions
.empty()) {
6050 // send session closes!
6051 for (auto &p
: mds_sessions
) {
6052 if (p
.second
.state
!= MetaSession::STATE_CLOSING
) {
6053 _close_mds_session(&p
.second
);
6054 mds_ranks_closing
.insert(p
.first
);
6058 // wait for sessions to close
6059 double timo
= cct
->_conf
.get_val
<std::chrono::seconds
>("client_shutdown_timeout").count();
6060 ldout(cct
, 2) << "waiting for " << mds_ranks_closing
.size() << " mds session(s) to close (timeout: "
6061 << timo
<< "s)" << dendl
;
6062 std::unique_lock l
{client_lock
, std::adopt_lock
};
6065 } else if (!mount_cond
.wait_for(l
, ceph::make_timespan(timo
), [this] { return mds_ranks_closing
.empty(); })) {
6066 ldout(cct
, 1) << mds_ranks_closing
.size() << " mds(s) did not respond to session close -- timing out." << dendl
;
6067 while (!mds_ranks_closing
.empty()) {
6068 auto session
= mds_sessions
.at(*mds_ranks_closing
.begin());
6069 // this prunes entry from mds_sessions and mds_ranks_closing
6070 _closed_mds_session(&session
, -ETIMEDOUT
);
6074 mds_ranks_closing
.clear();
6079 void Client::flush_mdlog_sync()
6081 if (mds_requests
.empty())
6083 for (auto &p
: mds_sessions
) {
6084 flush_mdlog(&p
.second
);
6088 void Client::flush_mdlog(MetaSession
*session
)
6090 // Only send this to Luminous or newer MDS daemons, older daemons
6091 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6092 const uint64_t features
= session
->con
->get_features();
6093 if (HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
6094 auto m
= make_message
<MClientSession
>(CEPH_SESSION_REQUEST_FLUSH_MDLOG
);
6095 session
->con
->send_message2(std::move(m
));
6100 void Client::_abort_mds_sessions(int err
)
6102 for (auto p
= mds_requests
.begin(); p
!= mds_requests
.end(); ) {
6103 auto req
= p
->second
;
6105 // unsafe requests will be removed during close session below.
6106 if (req
->got_unsafe
)
6110 if (req
->caller_cond
) {
6112 req
->caller_cond
->notify_all();
6116 // Process aborts on any requests that were on this waitlist.
6117 // Any requests that were on a waiting_for_open session waitlist
6118 // will get kicked during close session below.
6119 signal_cond_list(waiting_for_mdsmap
);
6121 // Force-close all sessions
6122 while(!mds_sessions
.empty()) {
6123 auto& session
= mds_sessions
.begin()->second
;
6124 _closed_mds_session(&session
, err
);
6128 void Client::_unmount(bool abort
)
6130 std::unique_lock lock
{client_lock
, std::adopt_lock
};
6134 if (abort
|| blacklisted
) {
6135 ldout(cct
, 2) << "unmounting (" << (abort
? "abort)" : "blacklisted)") << dendl
;
6137 ldout(cct
, 2) << "unmounting" << dendl
;
6144 // Abort all mds sessions
6145 _abort_mds_sessions(-ENOTCONN
);
6147 objecter
->op_cancel_writes(-ENOTCONN
);
6149 // flush the mdlog for pending requests, if any
6153 mount_cond
.wait(lock
, [this] {
6154 if (!mds_requests
.empty()) {
6155 ldout(cct
, 10) << "waiting on " << mds_requests
.size() << " requests"
6158 return mds_requests
.empty();
6161 timer
.cancel_event(tick_event
);
6166 // clean up any unclosed files
6167 while (!fd_map
.empty()) {
6168 Fh
*fh
= fd_map
.begin()->second
;
6169 fd_map
.erase(fd_map
.begin());
6170 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *fh
->inode
<< dendl
;
6174 while (!ll_unclosed_fh_set
.empty()) {
6175 set
<Fh
*>::iterator it
= ll_unclosed_fh_set
.begin();
6177 ll_unclosed_fh_set
.erase(fh
);
6178 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *(fh
->inode
) << dendl
;
6182 while (!opened_dirs
.empty()) {
6183 dir_result_t
*dirp
= *opened_dirs
.begin();
6184 ldout(cct
, 0) << " destroyed lost open dir " << dirp
<< " on " << *dirp
->inode
<< dendl
;
6190 mount_cond
.wait(lock
, [this] {
6191 if (unsafe_sync_write
> 0) {
6192 ldout(cct
, 0) << unsafe_sync_write
<< " unsafe_sync_writes, waiting"
6195 return unsafe_sync_write
<= 0;
6198 if (cct
->_conf
->client_oc
) {
6199 // flush/release all buffered data
6200 std::list
<InodeRef
> anchor
;
6201 for (auto& p
: inode_map
) {
6202 Inode
*in
= p
.second
;
6204 ldout(cct
, 0) << "null inode_map entry ino " << p
.first
<< dendl
;
6208 // prevent inode from getting freed
6209 anchor
.emplace_back(in
);
6211 if (abort
|| blacklisted
) {
6212 objectcacher
->purge_set(&in
->oset
);
6213 } else if (!in
->caps
.empty()) {
6215 _flush(in
, new C_Client_FlushComplete(this, in
));
6220 if (abort
|| blacklisted
) {
6221 for (auto p
= dirty_list
.begin(); !p
.end(); ) {
6224 if (in
->dirty_caps
) {
6225 ldout(cct
, 0) << " drop dirty caps on " << *in
<< dendl
;
6226 in
->mark_caps_clean();
6232 wait_sync_caps(last_flush_tid
);
6238 while (lru
.lru_get_size() > 0 ||
6239 !inode_map
.empty()) {
6240 ldout(cct
, 2) << "cache still has " << lru
.lru_get_size()
6241 << "+" << inode_map
.size() << " items"
6242 << ", waiting (for caps to release?)"
6244 if (auto r
= mount_cond
.wait_for(lock
, ceph::make_timespan(5));
6245 r
== std::cv_status::timeout
) {
6249 ceph_assert(lru
.lru_get_size() == 0);
6250 ceph_assert(inode_map
.empty());
6253 if (!cct
->_conf
->client_trace
.empty()) {
6254 ldout(cct
, 1) << "closing trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
6263 ldout(cct
, 2) << "unmounted." << dendl
;
6266 void Client::unmount()
6268 std::lock_guard
lock(client_lock
);
6272 void Client::abort_conn()
6274 std::lock_guard
lock(client_lock
);
6278 void Client::flush_cap_releases()
6280 // send any cap releases
6281 for (auto &p
: mds_sessions
) {
6282 auto &session
= p
.second
;
6283 if (session
.release
&& mdsmap
->is_clientreplay_or_active_or_stopping(
6285 if (cct
->_conf
->client_inject_release_failure
) {
6286 ldout(cct
, 20) << __func__
<< " injecting failure to send cap release message" << dendl
;
6288 session
.con
->send_message2(std::move(session
.release
));
6290 session
.release
.reset();
6297 if (cct
->_conf
->client_debug_inject_tick_delay
> 0) {
6298 sleep(cct
->_conf
->client_debug_inject_tick_delay
);
6299 ceph_assert(0 == cct
->_conf
.set_val("client_debug_inject_tick_delay", "0"));
6300 cct
->_conf
.apply_changes(nullptr);
6303 ldout(cct
, 21) << "tick" << dendl
;
6304 tick_event
= timer
.add_event_after(
6305 cct
->_conf
->client_tick_interval
,
6306 new LambdaContext([this](int) {
6307 // Called back via Timer, which takes client_lock for us
6308 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
6311 utime_t now
= ceph_clock_now();
6313 if (!mounted
&& !mds_requests
.empty()) {
6314 MetaRequest
*req
= mds_requests
.begin()->second
;
6315 if (req
->op_stamp
+ cct
->_conf
->client_mount_timeout
< now
) {
6316 req
->abort(-ETIMEDOUT
);
6317 if (req
->caller_cond
) {
6319 req
->caller_cond
->notify_all();
6321 signal_cond_list(waiting_for_mdsmap
);
6322 for (auto &p
: mds_sessions
) {
6323 signal_context_list(p
.second
.waiting_for_open
);
6328 if (mdsmap
->get_epoch()) {
6330 utime_t el
= now
- last_cap_renew
;
6331 if (el
> mdsmap
->get_session_timeout() / 3.0)
6334 flush_cap_releases();
6338 xlist
<Inode
*>::iterator p
= delayed_list
.begin();
6342 if (in
->hold_caps_until
> now
)
6344 delayed_list
.pop_front();
6345 check_caps(in
, CHECK_CAPS_NODELAY
);
6350 if (blacklisted
&& mounted
&&
6351 last_auto_reconnect
+ 30 * 60 < now
&&
6352 cct
->_conf
.get_val
<bool>("client_reconnect_stale")) {
6353 messenger
->client_reset();
6354 fd_gen
++; // invalidate open files
6355 blacklisted
= false;
6356 _kick_stale_sessions();
6357 last_auto_reconnect
= now
;
6361 void Client::renew_caps()
6363 ldout(cct
, 10) << "renew_caps()" << dendl
;
6364 last_cap_renew
= ceph_clock_now();
6366 for (auto &p
: mds_sessions
) {
6367 ldout(cct
, 15) << "renew_caps requesting from mds." << p
.first
<< dendl
;
6368 if (mdsmap
->get_state(p
.first
) >= MDSMap::STATE_REJOIN
)
6369 renew_caps(&p
.second
);
6373 void Client::renew_caps(MetaSession
*session
)
6375 ldout(cct
, 10) << "renew_caps mds." << session
->mds_num
<< dendl
;
6376 session
->last_cap_renew_request
= ceph_clock_now();
6377 uint64_t seq
= ++session
->cap_renew_seq
;
6378 session
->con
->send_message2(make_message
<MClientSession
>(CEPH_SESSION_REQUEST_RENEWCAPS
, seq
));
6382 // ===============================================================
6383 // high level (POSIXy) interface
6385 int Client::_do_lookup(Inode
*dir
, const string
& name
, int mask
,
6386 InodeRef
*target
, const UserPerm
& perms
)
6388 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_LOOKUPSNAP
: CEPH_MDS_OP_LOOKUP
;
6389 MetaRequest
*req
= new MetaRequest(op
);
6391 dir
->make_nosnap_relative_path(path
);
6392 path
.push_dentry(name
);
6393 req
->set_filepath(path
);
6394 req
->set_inode(dir
);
6395 if (cct
->_conf
->client_debug_getattr_caps
&& op
== CEPH_MDS_OP_LOOKUP
)
6396 mask
|= DEBUG_GETATTR_CAPS
;
6397 req
->head
.args
.getattr
.mask
= mask
;
6399 ldout(cct
, 10) << __func__
<< " on " << path
<< dendl
;
6401 int r
= make_request(req
, perms
, target
);
6402 ldout(cct
, 10) << __func__
<< " res is " << r
<< dendl
;
6406 int Client::_lookup(Inode
*dir
, const string
& dname
, int mask
, InodeRef
*target
,
6407 const UserPerm
& perms
)
6412 if (dname
== "..") {
6413 if (dir
->dentries
.empty()) {
6414 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
6415 filepath
path(dir
->ino
);
6416 req
->set_filepath(path
);
6419 int r
= make_request(req
, perms
, &tmptarget
, NULL
, rand() % mdsmap
->get_num_in_mds());
6422 Inode
*tempino
= tmptarget
.get();
6425 ldout(cct
, 8) << __func__
<< " found target " << (*target
)->ino
<< dendl
;
6431 *target
= dir
->get_first_parent()->dir
->parent_inode
; //dirs can't be hard-linked
6440 if (!dir
->is_dir()) {
6445 if (dname
.length() > NAME_MAX
) {
6450 if (dname
== cct
->_conf
->client_snapdir
&&
6451 dir
->snapid
== CEPH_NOSNAP
) {
6452 *target
= open_snapdir(dir
);
6457 dir
->dir
->dentries
.count(dname
)) {
6458 dn
= dir
->dir
->dentries
[dname
];
6460 ldout(cct
, 20) << __func__
<< " have dn " << dname
<< " mds." << dn
->lease_mds
<< " ttl " << dn
->lease_ttl
6461 << " seq " << dn
->lease_seq
6464 if (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)) {
6465 // is dn lease valid?
6466 utime_t now
= ceph_clock_now();
6467 if (dn
->lease_mds
>= 0 &&
6468 dn
->lease_ttl
> now
&&
6469 mds_sessions
.count(dn
->lease_mds
)) {
6470 MetaSession
&s
= mds_sessions
.at(dn
->lease_mds
);
6471 if (s
.cap_ttl
> now
&&
6472 s
.cap_gen
== dn
->lease_gen
) {
6473 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6474 // make trim_caps() behave.
6475 dir
->try_touch_cap(dn
->lease_mds
);
6478 ldout(cct
, 20) << " bad lease, cap_ttl " << s
.cap_ttl
<< ", cap_gen " << s
.cap_gen
6479 << " vs lease_gen " << dn
->lease_gen
<< dendl
;
6482 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
6483 if (dn
->cap_shared_gen
== dir
->shared_gen
&&
6484 (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)))
6486 if (!dn
->inode
&& (dir
->flags
& I_COMPLETE
)) {
6487 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for "
6488 << *dir
<< " dn '" << dname
<< "'" << dendl
;
6493 ldout(cct
, 20) << " no cap on " << dn
->inode
->vino() << dendl
;
6496 // can we conclude ENOENT locally?
6497 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true) &&
6498 (dir
->flags
& I_COMPLETE
)) {
6499 ldout(cct
, 10) << __func__
<< " concluded ENOENT locally for " << *dir
<< " dn '" << dname
<< "'" << dendl
;
6504 r
= _do_lookup(dir
, dname
, mask
, target
, perms
);
6509 *target
= dn
->inode
;
6517 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << r
<< dendl
;
6519 ldout(cct
, 10) << __func__
<< " " << *dir
<< " " << dname
<< " = " << **target
<< dendl
;
6523 int Client::get_or_create(Inode
*dir
, const char* name
,
6524 Dentry
**pdn
, bool expect_null
)
6527 ldout(cct
, 20) << __func__
<< " " << *dir
<< " name " << name
<< dendl
;
6529 if (dir
->dir
->dentries
.count(name
)) {
6530 Dentry
*dn
= dir
->dir
->dentries
[name
];
6532 // is dn lease valid?
6533 utime_t now
= ceph_clock_now();
6535 dn
->lease_mds
>= 0 &&
6536 dn
->lease_ttl
> now
&&
6537 mds_sessions
.count(dn
->lease_mds
)) {
6538 MetaSession
&s
= mds_sessions
.at(dn
->lease_mds
);
6539 if (s
.cap_ttl
> now
&&
6540 s
.cap_gen
== dn
->lease_gen
) {
6547 // otherwise link up a new one
6548 *pdn
= link(dir
->dir
, name
, NULL
, NULL
);
6555 int Client::path_walk(const filepath
& origpath
, InodeRef
*end
,
6556 const UserPerm
& perms
, bool followsym
, int mask
)
6558 filepath path
= origpath
;
6560 if (origpath
.absolute())
6566 ldout(cct
, 10) << __func__
<< " " << path
<< dendl
;
6571 while (i
< path
.depth() && cur
) {
6573 const string
&dname
= path
[i
];
6574 ldout(cct
, 10) << " " << i
<< " " << *cur
<< " " << dname
<< dendl
;
6575 ldout(cct
, 20) << " (path is " << path
<< ")" << dendl
;
6577 if (cct
->_conf
->client_permissions
) {
6578 int r
= may_lookup(cur
.get(), perms
);
6581 caps
= CEPH_CAP_AUTH_SHARED
;
6584 /* Get extra requested caps on the last component */
6585 if (i
== (path
.depth() - 1))
6587 int r
= _lookup(cur
.get(), dname
, caps
, &next
, perms
);
6590 // only follow trailing symlink if followsym. always follow
6591 // 'directory' symlinks.
6592 if (next
&& next
->is_symlink()) {
6594 ldout(cct
, 20) << " symlink count " << symlinks
<< ", value is '" << next
->symlink
<< "'" << dendl
;
6595 if (symlinks
> MAXSYMLINKS
) {
6599 if (i
< path
.depth() - 1) {
6601 // replace consumed components of path with symlink dir target
6602 filepath
resolved(next
->symlink
.c_str());
6603 resolved
.append(path
.postfixpath(i
+ 1));
6606 if (next
->symlink
[0] == '/') {
6610 } else if (followsym
) {
6611 if (next
->symlink
[0] == '/') {
6612 path
= next
->symlink
.c_str();
6617 filepath
more(next
->symlink
.c_str());
6618 // we need to remove the symlink component from off of the path
6619 // before adding the target that the symlink points to. remain
6620 // at the same position in the path.
6640 int Client::link(const char *relexisting
, const char *relpath
, const UserPerm
& perm
)
6642 std::lock_guard
lock(client_lock
);
6643 tout(cct
) << "link" << std::endl
;
6644 tout(cct
) << relexisting
<< std::endl
;
6645 tout(cct
) << relpath
<< std::endl
;
6650 filepath
existing(relexisting
);
6653 int r
= path_walk(existing
, &in
, perm
, true);
6656 if (std::string(relpath
) == "/") {
6660 filepath
path(relpath
);
6661 string name
= path
.last_dentry();
6664 r
= path_walk(path
, &dir
, perm
, true);
6667 if (cct
->_conf
->client_permissions
) {
6668 if (S_ISDIR(in
->mode
)) {
6672 r
= may_hardlink(in
.get(), perm
);
6675 r
= may_create(dir
.get(), perm
);
6679 r
= _link(in
.get(), dir
.get(), name
.c_str(), perm
);
6683 int Client::unlink(const char *relpath
, const UserPerm
& perm
)
6685 std::lock_guard
lock(client_lock
);
6686 tout(cct
) << __func__
<< std::endl
;
6687 tout(cct
) << relpath
<< std::endl
;
6692 if (std::string(relpath
) == "/")
6695 filepath
path(relpath
);
6696 string name
= path
.last_dentry();
6699 int r
= path_walk(path
, &dir
, perm
);
6702 if (cct
->_conf
->client_permissions
) {
6703 r
= may_delete(dir
.get(), name
.c_str(), perm
);
6707 return _unlink(dir
.get(), name
.c_str(), perm
);
6710 int Client::rename(const char *relfrom
, const char *relto
, const UserPerm
& perm
)
6712 std::lock_guard
lock(client_lock
);
6713 tout(cct
) << __func__
<< std::endl
;
6714 tout(cct
) << relfrom
<< std::endl
;
6715 tout(cct
) << relto
<< std::endl
;
6720 if (std::string(relfrom
) == "/" || std::string(relto
) == "/")
6723 filepath
from(relfrom
);
6725 string fromname
= from
.last_dentry();
6727 string toname
= to
.last_dentry();
6730 InodeRef fromdir
, todir
;
6731 int r
= path_walk(from
, &fromdir
, perm
);
6734 r
= path_walk(to
, &todir
, perm
);
6738 if (cct
->_conf
->client_permissions
) {
6739 int r
= may_delete(fromdir
.get(), fromname
.c_str(), perm
);
6742 r
= may_delete(todir
.get(), toname
.c_str(), perm
);
6743 if (r
< 0 && r
!= -ENOENT
)
6746 r
= _rename(fromdir
.get(), fromname
.c_str(), todir
.get(), toname
.c_str(), perm
);
6753 int Client::mkdir(const char *relpath
, mode_t mode
, const UserPerm
& perm
)
6755 std::lock_guard
lock(client_lock
);
6756 tout(cct
) << __func__
<< std::endl
;
6757 tout(cct
) << relpath
<< std::endl
;
6758 tout(cct
) << mode
<< std::endl
;
6759 ldout(cct
, 10) << __func__
<< ": " << relpath
<< dendl
;
6764 if (std::string(relpath
) == "/")
6767 filepath
path(relpath
);
6768 string name
= path
.last_dentry();
6771 int r
= path_walk(path
, &dir
, perm
);
6774 if (cct
->_conf
->client_permissions
) {
6775 r
= may_create(dir
.get(), perm
);
6779 return _mkdir(dir
.get(), name
.c_str(), mode
, perm
);
6782 int Client::mkdirs(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
6784 std::lock_guard
lock(client_lock
);
6785 ldout(cct
, 10) << "Client::mkdirs " << relpath
<< dendl
;
6786 tout(cct
) << __func__
<< std::endl
;
6787 tout(cct
) << relpath
<< std::endl
;
6788 tout(cct
) << mode
<< std::endl
;
6793 //get through existing parts of path
6794 filepath
path(relpath
);
6796 int r
= 0, caps
= 0;
6799 for (i
=0; i
<path
.depth(); ++i
) {
6800 if (cct
->_conf
->client_permissions
) {
6801 r
= may_lookup(cur
.get(), perms
);
6804 caps
= CEPH_CAP_AUTH_SHARED
;
6806 r
= _lookup(cur
.get(), path
[i
].c_str(), caps
, &next
, perms
);
6811 if (r
!=-ENOENT
) return r
;
6812 ldout(cct
, 20) << __func__
<< " got through " << i
<< " directories on path " << relpath
<< dendl
;
6813 //make new directory at each level
6814 for (; i
<path
.depth(); ++i
) {
6815 if (cct
->_conf
->client_permissions
) {
6816 r
= may_create(cur
.get(), perms
);
6821 r
= _mkdir(cur
.get(), path
[i
].c_str(), mode
, perms
, &next
);
6823 //check proper creation/existence
6824 if(-EEXIST
== r
&& i
< path
.depth() - 1) {
6825 r
= _lookup(cur
.get(), path
[i
].c_str(), CEPH_CAP_AUTH_SHARED
, &next
, perms
);
6829 //move to new dir and continue
6831 ldout(cct
, 20) << __func__
<< ": successfully created directory "
6832 << filepath(cur
->ino
).get_path() << dendl
;
6837 int Client::rmdir(const char *relpath
, const UserPerm
& perms
)
6839 std::lock_guard
lock(client_lock
);
6840 tout(cct
) << __func__
<< std::endl
;
6841 tout(cct
) << relpath
<< std::endl
;
6846 if (std::string(relpath
) == "/")
6849 filepath
path(relpath
);
6850 string name
= path
.last_dentry();
6853 int r
= path_walk(path
, &dir
, perms
);
6856 if (cct
->_conf
->client_permissions
) {
6857 int r
= may_delete(dir
.get(), name
.c_str(), perms
);
6861 return _rmdir(dir
.get(), name
.c_str(), perms
);
6864 int Client::mknod(const char *relpath
, mode_t mode
, const UserPerm
& perms
, dev_t rdev
)
6866 std::lock_guard
lock(client_lock
);
6867 tout(cct
) << __func__
<< std::endl
;
6868 tout(cct
) << relpath
<< std::endl
;
6869 tout(cct
) << mode
<< std::endl
;
6870 tout(cct
) << rdev
<< std::endl
;
6875 if (std::string(relpath
) == "/")
6878 filepath
path(relpath
);
6879 string name
= path
.last_dentry();
6882 int r
= path_walk(path
, &dir
, perms
);
6885 if (cct
->_conf
->client_permissions
) {
6886 int r
= may_create(dir
.get(), perms
);
6890 return _mknod(dir
.get(), name
.c_str(), mode
, rdev
, perms
);
6895 int Client::symlink(const char *target
, const char *relpath
, const UserPerm
& perms
)
6897 std::lock_guard
lock(client_lock
);
6898 tout(cct
) << __func__
<< std::endl
;
6899 tout(cct
) << target
<< std::endl
;
6900 tout(cct
) << relpath
<< std::endl
;
6905 if (std::string(relpath
) == "/")
6908 filepath
path(relpath
);
6909 string name
= path
.last_dentry();
6912 int r
= path_walk(path
, &dir
, perms
);
6915 if (cct
->_conf
->client_permissions
) {
6916 int r
= may_create(dir
.get(), perms
);
6920 return _symlink(dir
.get(), name
.c_str(), target
, perms
);
6923 int Client::readlink(const char *relpath
, char *buf
, loff_t size
, const UserPerm
& perms
)
6925 std::lock_guard
lock(client_lock
);
6926 tout(cct
) << __func__
<< std::endl
;
6927 tout(cct
) << relpath
<< std::endl
;
6932 filepath
path(relpath
);
6934 int r
= path_walk(path
, &in
, perms
, false);
6938 return _readlink(in
.get(), buf
, size
);
6941 int Client::_readlink(Inode
*in
, char *buf
, size_t size
)
6943 if (!in
->is_symlink())
6946 // copy into buf (at most size bytes)
6947 int r
= in
->symlink
.length();
6950 memcpy(buf
, in
->symlink
.c_str(), r
);
6957 int Client::_getattr(Inode
*in
, int mask
, const UserPerm
& perms
, bool force
)
6959 bool yes
= in
->caps_issued_mask(mask
, true);
6961 ldout(cct
, 10) << __func__
<< " mask " << ccap_string(mask
) << " issued=" << yes
<< dendl
;
6965 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
6967 in
->make_nosnap_relative_path(path
);
6968 req
->set_filepath(path
);
6970 req
->head
.args
.getattr
.mask
= mask
;
6972 int res
= make_request(req
, perms
);
6973 ldout(cct
, 10) << __func__
<< " result=" << res
<< dendl
;
6977 int Client::_do_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
6978 const UserPerm
& perms
, InodeRef
*inp
)
6980 int issued
= in
->caps_issued();
6982 ldout(cct
, 10) << __func__
<< " mask " << mask
<< " issued " <<
6983 ccap_string(issued
) << dendl
;
6985 if (in
->snapid
!= CEPH_NOSNAP
) {
6988 if ((mask
& CEPH_SETATTR_SIZE
) &&
6989 (unsigned long)stx
->stx_size
> in
->size
&&
6990 is_quota_bytes_exceeded(in
, (unsigned long)stx
->stx_size
- in
->size
,
6995 // make the change locally?
6996 if ((in
->cap_dirtier_uid
>= 0 && perms
.uid() != in
->cap_dirtier_uid
) ||
6997 (in
->cap_dirtier_gid
>= 0 && perms
.gid() != in
->cap_dirtier_gid
)) {
6998 ldout(cct
, 10) << __func__
<< " caller " << perms
.uid() << ":" << perms
.gid()
6999 << " != cap dirtier " << in
->cap_dirtier_uid
<< ":"
7000 << in
->cap_dirtier_gid
<< ", forcing sync setattr"
7003 * This works because we implicitly flush the caps as part of the
7004 * request, so the cap update check will happen with the writeback
7005 * cap context, and then the setattr check will happen with the
7008 * In reality this pattern is likely pretty rare (different users
7009 * setattr'ing the same file). If that turns out not to be the
7010 * case later, we can build a more complex pipelined cap writeback
7014 mask
|= CEPH_SETATTR_CTIME
;
7019 // caller just needs us to bump the ctime
7020 in
->ctime
= ceph_clock_now();
7021 in
->cap_dirtier_uid
= perms
.uid();
7022 in
->cap_dirtier_gid
= perms
.gid();
7023 if (issued
& CEPH_CAP_AUTH_EXCL
)
7024 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7025 else if (issued
& CEPH_CAP_FILE_EXCL
)
7026 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
7027 else if (issued
& CEPH_CAP_XATTR_EXCL
)
7028 in
->mark_caps_dirty(CEPH_CAP_XATTR_EXCL
);
7030 mask
|= CEPH_SETATTR_CTIME
;
7033 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
7034 bool kill_sguid
= mask
& (CEPH_SETATTR_SIZE
|CEPH_SETATTR_KILL_SGUID
);
7036 mask
&= ~CEPH_SETATTR_KILL_SGUID
;
7038 if (mask
& CEPH_SETATTR_UID
) {
7039 in
->ctime
= ceph_clock_now();
7040 in
->cap_dirtier_uid
= perms
.uid();
7041 in
->cap_dirtier_gid
= perms
.gid();
7042 in
->uid
= stx
->stx_uid
;
7043 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7044 mask
&= ~CEPH_SETATTR_UID
;
7046 ldout(cct
,10) << "changing uid to " << stx
->stx_uid
<< dendl
;
7048 if (mask
& CEPH_SETATTR_GID
) {
7049 in
->ctime
= ceph_clock_now();
7050 in
->cap_dirtier_uid
= perms
.uid();
7051 in
->cap_dirtier_gid
= perms
.gid();
7052 in
->gid
= stx
->stx_gid
;
7053 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7054 mask
&= ~CEPH_SETATTR_GID
;
7056 ldout(cct
,10) << "changing gid to " << stx
->stx_gid
<< dendl
;
7059 if (mask
& CEPH_SETATTR_MODE
) {
7060 in
->ctime
= ceph_clock_now();
7061 in
->cap_dirtier_uid
= perms
.uid();
7062 in
->cap_dirtier_gid
= perms
.gid();
7063 in
->mode
= (in
->mode
& ~07777) | (stx
->stx_mode
& 07777);
7064 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7065 mask
&= ~CEPH_SETATTR_MODE
;
7066 ldout(cct
,10) << "changing mode to " << stx
->stx_mode
<< dendl
;
7067 } else if (kill_sguid
&& S_ISREG(in
->mode
) && (in
->mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
7068 /* Must squash the any setuid/setgid bits with an ownership change */
7069 in
->mode
&= ~(S_ISUID
|S_ISGID
);
7070 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7073 if (mask
& CEPH_SETATTR_BTIME
) {
7074 in
->ctime
= ceph_clock_now();
7075 in
->cap_dirtier_uid
= perms
.uid();
7076 in
->cap_dirtier_gid
= perms
.gid();
7077 in
->btime
= utime_t(stx
->stx_btime
);
7078 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
7079 mask
&= ~CEPH_SETATTR_BTIME
;
7080 ldout(cct
,10) << "changing btime to " << in
->btime
<< dendl
;
7082 } else if (mask
& CEPH_SETATTR_SIZE
) {
7083 /* If we don't have Ax, then we must ask the server to clear them on truncate */
7084 mask
|= CEPH_SETATTR_KILL_SGUID
;
7087 if (in
->caps_issued_mask(CEPH_CAP_FILE_EXCL
)) {
7088 if (mask
& (CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
)) {
7089 if (mask
& CEPH_SETATTR_MTIME
)
7090 in
->mtime
= utime_t(stx
->stx_mtime
);
7091 if (mask
& CEPH_SETATTR_ATIME
)
7092 in
->atime
= utime_t(stx
->stx_atime
);
7093 in
->ctime
= ceph_clock_now();
7094 in
->cap_dirtier_uid
= perms
.uid();
7095 in
->cap_dirtier_gid
= perms
.gid();
7096 in
->time_warp_seq
++;
7097 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
7098 mask
&= ~(CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
);
7107 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETATTR
);
7111 in
->make_nosnap_relative_path(path
);
7112 req
->set_filepath(path
);
7115 if (mask
& CEPH_SETATTR_KILL_SGUID
) {
7116 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7118 if (mask
& CEPH_SETATTR_MODE
) {
7119 req
->head
.args
.setattr
.mode
= stx
->stx_mode
;
7120 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7121 ldout(cct
,10) << "changing mode to " << stx
->stx_mode
<< dendl
;
7123 if (mask
& CEPH_SETATTR_UID
) {
7124 req
->head
.args
.setattr
.uid
= stx
->stx_uid
;
7125 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7126 ldout(cct
,10) << "changing uid to " << stx
->stx_uid
<< dendl
;
7128 if (mask
& CEPH_SETATTR_GID
) {
7129 req
->head
.args
.setattr
.gid
= stx
->stx_gid
;
7130 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7131 ldout(cct
,10) << "changing gid to " << stx
->stx_gid
<< dendl
;
7133 if (mask
& CEPH_SETATTR_BTIME
) {
7134 req
->head
.args
.setattr
.btime
= utime_t(stx
->stx_btime
);
7135 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
7137 if (mask
& CEPH_SETATTR_MTIME
) {
7138 req
->head
.args
.setattr
.mtime
= utime_t(stx
->stx_mtime
);
7139 req
->inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
7142 if (mask
& CEPH_SETATTR_ATIME
) {
7143 req
->head
.args
.setattr
.atime
= utime_t(stx
->stx_atime
);
7144 req
->inode_drop
|= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_RD
|
7147 if (mask
& CEPH_SETATTR_SIZE
) {
7148 if ((unsigned long)stx
->stx_size
< mdsmap
->get_max_filesize()) {
7149 req
->head
.args
.setattr
.size
= stx
->stx_size
;
7150 ldout(cct
,10) << "changing size to " << stx
->stx_size
<< dendl
;
7153 ldout(cct
,10) << "unable to set size to " << stx
->stx_size
<< ". Too large!" << dendl
;
7156 req
->inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
7159 req
->head
.args
.setattr
.mask
= mask
;
7161 req
->regetattr_mask
= mask
;
7163 int res
= make_request(req
, perms
, inp
);
7164 ldout(cct
, 10) << "_setattr result=" << res
<< dendl
;
7168 /* Note that we only care about attrs that setattr cares about */
7169 void Client::stat_to_statx(struct stat
*st
, struct ceph_statx
*stx
)
7171 stx
->stx_size
= st
->st_size
;
7172 stx
->stx_mode
= st
->st_mode
;
7173 stx
->stx_uid
= st
->st_uid
;
7174 stx
->stx_gid
= st
->st_gid
;
7176 stx
->stx_mtime
= st
->st_mtimespec
;
7177 stx
->stx_atime
= st
->st_atimespec
;
7179 stx
->stx_mtime
= st
->st_mtim
;
7180 stx
->stx_atime
= st
->st_atim
;
7184 int Client::__setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
7185 const UserPerm
& perms
, InodeRef
*inp
)
7187 int ret
= _do_setattr(in
, stx
, mask
, perms
, inp
);
7190 if (mask
& CEPH_SETATTR_MODE
)
7191 ret
= _posix_acl_chmod(in
, stx
->stx_mode
, perms
);
7195 int Client::_setattrx(InodeRef
&in
, struct ceph_statx
*stx
, int mask
,
7196 const UserPerm
& perms
)
7198 mask
&= (CEPH_SETATTR_MODE
| CEPH_SETATTR_UID
|
7199 CEPH_SETATTR_GID
| CEPH_SETATTR_MTIME
|
7200 CEPH_SETATTR_ATIME
| CEPH_SETATTR_SIZE
|
7201 CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
);
7202 if (cct
->_conf
->client_permissions
) {
7203 int r
= may_setattr(in
.get(), stx
, mask
, perms
);
7207 return __setattrx(in
.get(), stx
, mask
, perms
);
7210 int Client::_setattr(InodeRef
&in
, struct stat
*attr
, int mask
,
7211 const UserPerm
& perms
)
7213 struct ceph_statx stx
;
7215 stat_to_statx(attr
, &stx
);
7216 mask
&= ~CEPH_SETATTR_BTIME
;
7218 if ((mask
& CEPH_SETATTR_UID
) && attr
->st_uid
== static_cast<uid_t
>(-1)) {
7219 mask
&= ~CEPH_SETATTR_UID
;
7221 if ((mask
& CEPH_SETATTR_GID
) && attr
->st_gid
== static_cast<uid_t
>(-1)) {
7222 mask
&= ~CEPH_SETATTR_GID
;
7225 return _setattrx(in
, &stx
, mask
, perms
);
7228 int Client::setattr(const char *relpath
, struct stat
*attr
, int mask
,
7229 const UserPerm
& perms
)
7231 std::lock_guard
lock(client_lock
);
7232 tout(cct
) << __func__
<< std::endl
;
7233 tout(cct
) << relpath
<< std::endl
;
7234 tout(cct
) << mask
<< std::endl
;
7239 filepath
path(relpath
);
7241 int r
= path_walk(path
, &in
, perms
);
7244 return _setattr(in
, attr
, mask
, perms
);
7247 int Client::setattrx(const char *relpath
, struct ceph_statx
*stx
, int mask
,
7248 const UserPerm
& perms
, int flags
)
7250 std::lock_guard
lock(client_lock
);
7251 tout(cct
) << __func__
<< std::endl
;
7252 tout(cct
) << relpath
<< std::endl
;
7253 tout(cct
) << mask
<< std::endl
;
7258 filepath
path(relpath
);
7260 int r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
));
7263 return _setattrx(in
, stx
, mask
, perms
);
7266 int Client::fsetattr(int fd
, struct stat
*attr
, int mask
, const UserPerm
& perms
)
7268 std::lock_guard
lock(client_lock
);
7269 tout(cct
) << __func__
<< std::endl
;
7270 tout(cct
) << fd
<< std::endl
;
7271 tout(cct
) << mask
<< std::endl
;
7276 Fh
*f
= get_filehandle(fd
);
7279 #if defined(__linux__) && defined(O_PATH)
7280 if (f
->flags
& O_PATH
)
7283 return _setattr(f
->inode
, attr
, mask
, perms
);
7286 int Client::fsetattrx(int fd
, struct ceph_statx
*stx
, int mask
, const UserPerm
& perms
)
7288 std::lock_guard
lock(client_lock
);
7289 tout(cct
) << __func__
<< std::endl
;
7290 tout(cct
) << fd
<< std::endl
;
7291 tout(cct
) << mask
<< std::endl
;
7296 Fh
*f
= get_filehandle(fd
);
7299 #if defined(__linux__) && defined(O_PATH)
7300 if (f
->flags
& O_PATH
)
7303 return _setattrx(f
->inode
, stx
, mask
, perms
);
7306 int Client::stat(const char *relpath
, struct stat
*stbuf
, const UserPerm
& perms
,
7307 frag_info_t
*dirstat
, int mask
)
7309 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7310 std::lock_guard
lock(client_lock
);
7311 tout(cct
) << "stat" << std::endl
;
7312 tout(cct
) << relpath
<< std::endl
;
7317 filepath
path(relpath
);
7319 int r
= path_walk(path
, &in
, perms
, true, mask
);
7322 r
= _getattr(in
, mask
, perms
);
7324 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7327 fill_stat(in
, stbuf
, dirstat
);
7328 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7332 unsigned Client::statx_to_mask(unsigned int flags
, unsigned int want
)
7336 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7337 if (flags
& AT_NO_ATTR_SYNC
)
7340 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7341 mask
|= CEPH_CAP_PIN
;
7342 if (want
& (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
7343 mask
|= CEPH_CAP_AUTH_SHARED
;
7344 if (want
& (CEPH_STATX_NLINK
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
7345 mask
|= CEPH_CAP_LINK_SHARED
;
7346 if (want
& (CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|CEPH_STATX_CTIME
|CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
|CEPH_STATX_VERSION
))
7347 mask
|= CEPH_CAP_FILE_SHARED
;
7348 if (want
& (CEPH_STATX_VERSION
|CEPH_STATX_CTIME
))
7349 mask
|= CEPH_CAP_XATTR_SHARED
;
7354 int Client::statx(const char *relpath
, struct ceph_statx
*stx
,
7355 const UserPerm
& perms
,
7356 unsigned int want
, unsigned int flags
)
7358 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " want " << want
<< ")" << dendl
;
7359 std::lock_guard
lock(client_lock
);
7360 tout(cct
) << "statx" << std::endl
;
7361 tout(cct
) << relpath
<< std::endl
;
7366 filepath
path(relpath
);
7369 unsigned mask
= statx_to_mask(flags
, want
);
7371 int r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
);
7375 r
= _getattr(in
, mask
, perms
);
7377 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7381 fill_statx(in
, mask
, stx
);
7382 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << stx
->stx_mask
<< ")" << dendl
;
7386 int Client::lstat(const char *relpath
, struct stat
*stbuf
,
7387 const UserPerm
& perms
, frag_info_t
*dirstat
, int mask
)
7389 ldout(cct
, 3) << __func__
<< " enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7390 std::lock_guard
lock(client_lock
);
7391 tout(cct
) << __func__
<< std::endl
;
7392 tout(cct
) << relpath
<< std::endl
;
7397 filepath
path(relpath
);
7399 // don't follow symlinks
7400 int r
= path_walk(path
, &in
, perms
, false, mask
);
7403 r
= _getattr(in
, mask
, perms
);
7405 ldout(cct
, 3) << __func__
<< " exit on error!" << dendl
;
7408 fill_stat(in
, stbuf
, dirstat
);
7409 ldout(cct
, 3) << __func__
<< " exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7413 int Client::fill_stat(Inode
*in
, struct stat
*st
, frag_info_t
*dirstat
, nest_info_t
*rstat
)
7415 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
7416 << " mode 0" << oct
<< in
->mode
<< dec
7417 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
7418 memset(st
, 0, sizeof(struct stat
));
7419 if (use_faked_inos())
7420 st
->st_ino
= in
->faked_ino
;
7422 st
->st_ino
= in
->ino
;
7423 st
->st_dev
= in
->snapid
;
7424 st
->st_mode
= in
->mode
;
7425 st
->st_rdev
= in
->rdev
;
7427 switch (in
->nlink
) {
7429 st
->st_nlink
= 0; /* dir is unlinked */
7432 st
->st_nlink
= 1 /* parent dentry */
7434 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
7440 st
->st_nlink
= in
->nlink
;
7442 st
->st_uid
= in
->uid
;
7443 st
->st_gid
= in
->gid
;
7444 if (in
->ctime
> in
->mtime
) {
7445 stat_set_ctime_sec(st
, in
->ctime
.sec());
7446 stat_set_ctime_nsec(st
, in
->ctime
.nsec());
7448 stat_set_ctime_sec(st
, in
->mtime
.sec());
7449 stat_set_ctime_nsec(st
, in
->mtime
.nsec());
7451 stat_set_atime_sec(st
, in
->atime
.sec());
7452 stat_set_atime_nsec(st
, in
->atime
.nsec());
7453 stat_set_mtime_sec(st
, in
->mtime
.sec());
7454 stat_set_mtime_nsec(st
, in
->mtime
.nsec());
7456 if (cct
->_conf
->client_dirsize_rbytes
)
7457 st
->st_size
= in
->rstat
.rbytes
;
7459 st
->st_size
= in
->dirstat
.size();
7462 st
->st_size
= in
->size
;
7463 st
->st_blocks
= (in
->size
+ 511) >> 9;
7465 st
->st_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
7468 *dirstat
= in
->dirstat
;
7472 return in
->caps_issued();
7475 void Client::fill_statx(Inode
*in
, unsigned int mask
, struct ceph_statx
*stx
)
7477 ldout(cct
, 10) << __func__
<< " on " << in
->ino
<< " snap/dev" << in
->snapid
7478 << " mode 0" << oct
<< in
->mode
<< dec
7479 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
7480 memset(stx
, 0, sizeof(struct ceph_statx
));
7483 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7484 * so that all bits are set.
7489 /* These are always considered to be available */
7490 stx
->stx_dev
= in
->snapid
;
7491 stx
->stx_blksize
= std::max
<uint32_t>(in
->layout
.stripe_unit
, 4096);
7493 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7494 stx
->stx_mode
= S_IFMT
& in
->mode
;
7495 stx
->stx_ino
= use_faked_inos() ? in
->faked_ino
: (ino_t
)in
->ino
;
7496 stx
->stx_rdev
= in
->rdev
;
7497 stx
->stx_mask
|= (CEPH_STATX_INO
|CEPH_STATX_RDEV
);
7499 if (mask
& CEPH_CAP_AUTH_SHARED
) {
7500 stx
->stx_uid
= in
->uid
;
7501 stx
->stx_gid
= in
->gid
;
7502 stx
->stx_mode
= in
->mode
;
7503 in
->btime
.to_timespec(&stx
->stx_btime
);
7504 stx
->stx_mask
|= (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
);
7507 if (mask
& CEPH_CAP_LINK_SHARED
) {
7509 switch (in
->nlink
) {
7511 stx
->stx_nlink
= 0; /* dir is unlinked */
7514 stx
->stx_nlink
= 1 /* parent dentry */
7516 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
7522 stx
->stx_nlink
= in
->nlink
;
7524 stx
->stx_mask
|= CEPH_STATX_NLINK
;
7527 if (mask
& CEPH_CAP_FILE_SHARED
) {
7529 in
->atime
.to_timespec(&stx
->stx_atime
);
7530 in
->mtime
.to_timespec(&stx
->stx_mtime
);
7533 if (cct
->_conf
->client_dirsize_rbytes
)
7534 stx
->stx_size
= in
->rstat
.rbytes
;
7536 stx
->stx_size
= in
->dirstat
.size();
7537 stx
->stx_blocks
= 1;
7539 stx
->stx_size
= in
->size
;
7540 stx
->stx_blocks
= (in
->size
+ 511) >> 9;
7542 stx
->stx_mask
|= (CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|
7543 CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
);
7546 /* Change time and change_attr both require all shared caps to view */
7547 if ((mask
& CEPH_STAT_CAP_INODE_ALL
) == CEPH_STAT_CAP_INODE_ALL
) {
7548 stx
->stx_version
= in
->change_attr
;
7549 if (in
->ctime
> in
->mtime
)
7550 in
->ctime
.to_timespec(&stx
->stx_ctime
);
7552 in
->mtime
.to_timespec(&stx
->stx_ctime
);
7553 stx
->stx_mask
|= (CEPH_STATX_CTIME
|CEPH_STATX_VERSION
);
7558 void Client::touch_dn(Dentry
*dn
)
7563 int Client::chmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
7565 std::lock_guard
lock(client_lock
);
7566 tout(cct
) << __func__
<< std::endl
;
7567 tout(cct
) << relpath
<< std::endl
;
7568 tout(cct
) << mode
<< std::endl
;
7573 filepath
path(relpath
);
7575 int r
= path_walk(path
, &in
, perms
);
7579 attr
.st_mode
= mode
;
7580 return _setattr(in
, &attr
, CEPH_SETATTR_MODE
, perms
);
7583 int Client::fchmod(int fd
, mode_t mode
, const UserPerm
& perms
)
7585 std::lock_guard
lock(client_lock
);
7586 tout(cct
) << __func__
<< std::endl
;
7587 tout(cct
) << fd
<< std::endl
;
7588 tout(cct
) << mode
<< std::endl
;
7593 Fh
*f
= get_filehandle(fd
);
7596 #if defined(__linux__) && defined(O_PATH)
7597 if (f
->flags
& O_PATH
)
7601 attr
.st_mode
= mode
;
7602 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MODE
, perms
);
7605 int Client::lchmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
7607 std::lock_guard
lock(client_lock
);
7608 tout(cct
) << __func__
<< std::endl
;
7609 tout(cct
) << relpath
<< std::endl
;
7610 tout(cct
) << mode
<< std::endl
;
7615 filepath
path(relpath
);
7617 // don't follow symlinks
7618 int r
= path_walk(path
, &in
, perms
, false);
7622 attr
.st_mode
= mode
;
7623 return _setattr(in
, &attr
, CEPH_SETATTR_MODE
, perms
);
7626 int Client::chown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
7627 const UserPerm
& perms
)
7629 std::lock_guard
lock(client_lock
);
7630 tout(cct
) << __func__
<< std::endl
;
7631 tout(cct
) << relpath
<< std::endl
;
7632 tout(cct
) << new_uid
<< std::endl
;
7633 tout(cct
) << new_gid
<< std::endl
;
7638 filepath
path(relpath
);
7640 int r
= path_walk(path
, &in
, perms
);
7644 attr
.st_uid
= new_uid
;
7645 attr
.st_gid
= new_gid
;
7646 return _setattr(in
, &attr
, CEPH_SETATTR_UID
|CEPH_SETATTR_GID
, perms
);
7649 int Client::fchown(int fd
, uid_t new_uid
, gid_t new_gid
, const UserPerm
& perms
)
7651 std::lock_guard
lock(client_lock
);
7652 tout(cct
) << __func__
<< std::endl
;
7653 tout(cct
) << fd
<< std::endl
;
7654 tout(cct
) << new_uid
<< std::endl
;
7655 tout(cct
) << new_gid
<< std::endl
;
7660 Fh
*f
= get_filehandle(fd
);
7663 #if defined(__linux__) && defined(O_PATH)
7664 if (f
->flags
& O_PATH
)
7668 attr
.st_uid
= new_uid
;
7669 attr
.st_gid
= new_gid
;
7671 if (new_uid
!= static_cast<uid_t
>(-1)) mask
|= CEPH_SETATTR_UID
;
7672 if (new_gid
!= static_cast<gid_t
>(-1)) mask
|= CEPH_SETATTR_GID
;
7673 return _setattr(f
->inode
, &attr
, mask
, perms
);
7676 int Client::lchown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
7677 const UserPerm
& perms
)
7679 std::lock_guard
lock(client_lock
);
7680 tout(cct
) << __func__
<< std::endl
;
7681 tout(cct
) << relpath
<< std::endl
;
7682 tout(cct
) << new_uid
<< std::endl
;
7683 tout(cct
) << new_gid
<< std::endl
;
7688 filepath
path(relpath
);
7690 // don't follow symlinks
7691 int r
= path_walk(path
, &in
, perms
, false);
7695 attr
.st_uid
= new_uid
;
7696 attr
.st_gid
= new_gid
;
7698 if (new_uid
!= static_cast<uid_t
>(-1)) mask
|= CEPH_SETATTR_UID
;
7699 if (new_gid
!= static_cast<gid_t
>(-1)) mask
|= CEPH_SETATTR_GID
;
7700 return _setattr(in
, &attr
, mask
, perms
);
7703 static void attr_set_atime_and_mtime(struct stat
*attr
,
7704 const utime_t
&atime
,
7705 const utime_t
&mtime
)
7707 stat_set_atime_sec(attr
, atime
.tv
.tv_sec
);
7708 stat_set_atime_nsec(attr
, atime
.tv
.tv_nsec
);
7709 stat_set_mtime_sec(attr
, mtime
.tv
.tv_sec
);
7710 stat_set_mtime_nsec(attr
, mtime
.tv
.tv_nsec
);
7713 // for [l]utime() invoke the timeval variant as the timespec
7714 // variant are not yet implemented. for futime[s](), invoke
7715 // the timespec variant.
7716 int Client::utime(const char *relpath
, struct utimbuf
*buf
,
7717 const UserPerm
& perms
)
7719 struct timeval tv
[2];
7720 tv
[0].tv_sec
= buf
->actime
;
7722 tv
[1].tv_sec
= buf
->modtime
;
7725 return utimes(relpath
, tv
, perms
);
7728 int Client::lutime(const char *relpath
, struct utimbuf
*buf
,
7729 const UserPerm
& perms
)
7731 struct timeval tv
[2];
7732 tv
[0].tv_sec
= buf
->actime
;
7734 tv
[1].tv_sec
= buf
->modtime
;
7737 return lutimes(relpath
, tv
, perms
);
7740 int Client::futime(int fd
, struct utimbuf
*buf
, const UserPerm
& perms
)
7742 struct timespec ts
[2];
7743 ts
[0].tv_sec
= buf
->actime
;
7745 ts
[1].tv_sec
= buf
->modtime
;
7748 return futimens(fd
, ts
, perms
);
7751 int Client::utimes(const char *relpath
, struct timeval times
[2],
7752 const UserPerm
& perms
)
7754 std::lock_guard
lock(client_lock
);
7755 tout(cct
) << __func__
<< std::endl
;
7756 tout(cct
) << relpath
<< std::endl
;
7757 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
7759 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
7765 filepath
path(relpath
);
7767 int r
= path_walk(path
, &in
, perms
);
7771 utime_t
atime(times
[0]);
7772 utime_t
mtime(times
[1]);
7774 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
7775 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
7778 int Client::lutimes(const char *relpath
, struct timeval times
[2],
7779 const UserPerm
& perms
)
7781 std::lock_guard
lock(client_lock
);
7782 tout(cct
) << __func__
<< std::endl
;
7783 tout(cct
) << relpath
<< std::endl
;
7784 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_usec
7786 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_usec
7792 filepath
path(relpath
);
7794 int r
= path_walk(path
, &in
, perms
, false);
7798 utime_t
atime(times
[0]);
7799 utime_t
mtime(times
[1]);
7801 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
7802 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
7805 int Client::futimes(int fd
, struct timeval times
[2], const UserPerm
& perms
)
7807 struct timespec ts
[2];
7808 ts
[0].tv_sec
= times
[0].tv_sec
;
7809 ts
[0].tv_nsec
= times
[0].tv_usec
* 1000;
7810 ts
[1].tv_sec
= times
[1].tv_sec
;
7811 ts
[1].tv_nsec
= times
[1].tv_usec
* 1000;
7813 return futimens(fd
, ts
, perms
);
7816 int Client::futimens(int fd
, struct timespec times
[2], const UserPerm
& perms
)
7818 std::lock_guard
lock(client_lock
);
7819 tout(cct
) << __func__
<< std::endl
;
7820 tout(cct
) << fd
<< std::endl
;
7821 tout(cct
) << "atime: " << times
[0].tv_sec
<< "." << times
[0].tv_nsec
7823 tout(cct
) << "mtime: " << times
[1].tv_sec
<< "." << times
[1].tv_nsec
7829 Fh
*f
= get_filehandle(fd
);
7832 #if defined(__linux__) && defined(O_PATH)
7833 if (f
->flags
& O_PATH
)
7837 utime_t
atime(times
[0]);
7838 utime_t
mtime(times
[1]);
7840 attr_set_atime_and_mtime(&attr
, atime
, mtime
);
7841 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
7844 int Client::flock(int fd
, int operation
, uint64_t owner
)
7846 std::lock_guard
lock(client_lock
);
7847 tout(cct
) << __func__
<< std::endl
;
7848 tout(cct
) << fd
<< std::endl
;
7849 tout(cct
) << operation
<< std::endl
;
7850 tout(cct
) << owner
<< std::endl
;
7855 Fh
*f
= get_filehandle(fd
);
7859 return _flock(f
, operation
, owner
);
7862 int Client::opendir(const char *relpath
, dir_result_t
**dirpp
, const UserPerm
& perms
)
7864 std::lock_guard
lock(client_lock
);
7865 tout(cct
) << __func__
<< std::endl
;
7866 tout(cct
) << relpath
<< std::endl
;
7871 filepath
path(relpath
);
7873 int r
= path_walk(path
, &in
, perms
, true);
7876 if (cct
->_conf
->client_permissions
) {
7877 int r
= may_open(in
.get(), O_RDONLY
, perms
);
7881 r
= _opendir(in
.get(), dirpp
, perms
);
7882 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7884 tout(cct
) << (unsigned long)*dirpp
<< std::endl
;
7888 int Client::_opendir(Inode
*in
, dir_result_t
**dirpp
, const UserPerm
& perms
)
7892 *dirpp
= new dir_result_t(in
, perms
);
7893 opened_dirs
.insert(*dirpp
);
7894 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ") = " << 0 << " (" << *dirpp
<< ")" << dendl
;
7899 int Client::closedir(dir_result_t
*dir
)
7901 std::lock_guard
lock(client_lock
);
7902 tout(cct
) << __func__
<< std::endl
;
7903 tout(cct
) << (unsigned long)dir
<< std::endl
;
7905 ldout(cct
, 3) << __func__
<< "(" << dir
<< ") = 0" << dendl
;
7910 void Client::_closedir(dir_result_t
*dirp
)
7912 ldout(cct
, 10) << __func__
<< "(" << dirp
<< ")" << dendl
;
7914 ldout(cct
, 10) << __func__
<< " detaching inode " << dirp
->inode
<< dendl
;
7915 dirp
->inode
.reset();
7917 _readdir_drop_dirp_buffer(dirp
);
7918 opened_dirs
.erase(dirp
);
7922 void Client::rewinddir(dir_result_t
*dirp
)
7924 std::lock_guard
lock(client_lock
);
7925 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ")" << dendl
;
7930 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
7931 _readdir_drop_dirp_buffer(d
);
7935 loff_t
Client::telldir(dir_result_t
*dirp
)
7937 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
7938 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ") = " << d
->offset
<< dendl
;
7942 void Client::seekdir(dir_result_t
*dirp
, loff_t offset
)
7944 std::lock_guard
lock(client_lock
);
7946 ldout(cct
, 3) << __func__
<< "(" << dirp
<< ", " << offset
<< ")" << dendl
;
7951 if (offset
== dirp
->offset
)
7954 if (offset
> dirp
->offset
)
7955 dirp
->release_count
= 0; // bump if we do a forward seek
7957 dirp
->ordered_count
= 0; // disable filling readdir cache
7959 if (dirp
->hash_order()) {
7960 if (dirp
->offset
> offset
) {
7961 _readdir_drop_dirp_buffer(dirp
);
7966 dirp
->buffer_frag
!= frag_t(dir_result_t::fpos_high(offset
)) ||
7967 dirp
->offset_low() > dir_result_t::fpos_low(offset
)) {
7968 _readdir_drop_dirp_buffer(dirp
);
7973 dirp
->offset
= offset
;
7978 // ino_t d_ino; /* inode number */
7979 // off_t d_off; /* offset to the next dirent */
7980 // unsigned short d_reclen; /* length of this record */
7981 // unsigned char d_type; /* type of file */
7982 // char d_name[256]; /* filename */
7984 void Client::fill_dirent(struct dirent
*de
, const char *name
, int type
, uint64_t ino
, loff_t next_off
)
7986 strncpy(de
->d_name
, name
, 255);
7987 de
->d_name
[255] = '\0';
7990 #if !defined(__APPLE__) && !defined(__FreeBSD__)
7991 de
->d_off
= next_off
;
7994 de
->d_type
= IFTODT(type
);
7995 ldout(cct
, 10) << __func__
<< " '" << de
->d_name
<< "' -> " << inodeno_t(de
->d_ino
)
7996 << " type " << (int)de
->d_type
<< " w/ next_off " << hex
<< next_off
<< dec
<< dendl
;
8000 void Client::_readdir_next_frag(dir_result_t
*dirp
)
8002 frag_t fg
= dirp
->buffer_frag
;
8004 if (fg
.is_rightmost()) {
8005 ldout(cct
, 10) << __func__
<< " advance from " << fg
<< " to END" << dendl
;
8012 ldout(cct
, 10) << __func__
<< " advance from " << dirp
->buffer_frag
<< " to " << fg
<< dendl
;
8014 if (dirp
->hash_order()) {
8016 int64_t new_offset
= dir_result_t::make_fpos(fg
.value(), 2, true);
8017 if (dirp
->offset
< new_offset
) // don't decrease offset
8018 dirp
->offset
= new_offset
;
8020 dirp
->last_name
.clear();
8021 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
8022 _readdir_rechoose_frag(dirp
);
8026 void Client::_readdir_rechoose_frag(dir_result_t
*dirp
)
8028 ceph_assert(dirp
->inode
);
8030 if (dirp
->hash_order())
8033 frag_t cur
= frag_t(dirp
->offset_high());
8034 frag_t fg
= dirp
->inode
->dirfragtree
[cur
.value()];
8036 ldout(cct
, 10) << __func__
<< " frag " << cur
<< " maps to " << fg
<< dendl
;
8037 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
8038 dirp
->last_name
.clear();
8039 dirp
->next_offset
= 2;
8043 void Client::_readdir_drop_dirp_buffer(dir_result_t
*dirp
)
8045 ldout(cct
, 10) << __func__
<< " " << dirp
<< dendl
;
8046 dirp
->buffer
.clear();
8049 int Client::_readdir_get_frag(dir_result_t
*dirp
)
8052 ceph_assert(dirp
->inode
);
8054 // get the current frag.
8056 if (dirp
->hash_order())
8057 fg
= dirp
->inode
->dirfragtree
[dirp
->offset_high()];
8059 fg
= frag_t(dirp
->offset_high());
8061 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " fg " << fg
8062 << " offset " << hex
<< dirp
->offset
<< dec
<< dendl
;
8064 int op
= CEPH_MDS_OP_READDIR
;
8065 if (dirp
->inode
&& dirp
->inode
->snapid
== CEPH_SNAPDIR
)
8066 op
= CEPH_MDS_OP_LSSNAP
;
8068 InodeRef
& diri
= dirp
->inode
;
8070 MetaRequest
*req
= new MetaRequest(op
);
8072 diri
->make_nosnap_relative_path(path
);
8073 req
->set_filepath(path
);
8074 req
->set_inode(diri
.get());
8075 req
->head
.args
.readdir
.frag
= fg
;
8076 req
->head
.args
.readdir
.flags
= CEPH_READDIR_REPLY_BITFLAGS
;
8077 if (dirp
->last_name
.length()) {
8078 req
->path2
.set_path(dirp
->last_name
);
8079 } else if (dirp
->hash_order()) {
8080 req
->head
.args
.readdir
.offset_hash
= dirp
->offset_high();
8085 int res
= make_request(req
, dirp
->perms
, NULL
, NULL
, -1, &dirbl
);
8087 if (res
== -EAGAIN
) {
8088 ldout(cct
, 10) << __func__
<< " got EAGAIN, retrying" << dendl
;
8089 _readdir_rechoose_frag(dirp
);
8090 return _readdir_get_frag(dirp
);
8094 ldout(cct
, 10) << __func__
<< " " << dirp
<< " got frag " << dirp
->buffer_frag
8095 << " size " << dirp
->buffer
.size() << dendl
;
8097 ldout(cct
, 10) << __func__
<< " got error " << res
<< ", setting end flag" << dendl
;
8104 struct dentry_off_lt
{
8105 bool operator()(const Dentry
* dn
, int64_t off
) const {
8106 return dir_result_t::fpos_cmp(dn
->offset
, off
) < 0;
8110 int Client::_readdir_cache_cb(dir_result_t
*dirp
, add_dirent_cb_t cb
, void *p
,
8111 int caps
, bool getref
)
8113 ceph_assert(ceph_mutex_is_locked(client_lock
));
8114 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
8115 << " last_name " << dirp
->last_name
<< " offset " << hex
<< dirp
->offset
<< dec
8117 Dir
*dir
= dirp
->inode
->dir
;
8120 ldout(cct
, 10) << " dir is empty" << dendl
;
8125 vector
<Dentry
*>::iterator pd
= std::lower_bound(dir
->readdir_cache
.begin(),
8126 dir
->readdir_cache
.end(),
8127 dirp
->offset
, dentry_off_lt());
8131 if (!dirp
->inode
->is_complete_and_ordered())
8133 if (pd
== dir
->readdir_cache
.end())
8136 if (dn
->inode
== NULL
) {
8137 ldout(cct
, 15) << " skipping null '" << dn
->name
<< "'" << dendl
;
8141 if (dn
->cap_shared_gen
!= dir
->parent_inode
->shared_gen
) {
8142 ldout(cct
, 15) << " skipping mismatch shared gen '" << dn
->name
<< "'" << dendl
;
8147 int idx
= pd
- dir
->readdir_cache
.begin();
8148 int r
= _getattr(dn
->inode
, caps
, dirp
->perms
);
8152 // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
8153 pd
= dir
->readdir_cache
.begin() + idx
;
8154 if (pd
>= dir
->readdir_cache
.end() || *pd
!= dn
)
8157 struct ceph_statx stx
;
8159 fill_statx(dn
->inode
, caps
, &stx
);
8161 uint64_t next_off
= dn
->offset
+ 1;
8162 fill_dirent(&de
, dn
->name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
8164 if (pd
== dir
->readdir_cache
.end())
8165 next_off
= dir_result_t::END
;
8169 in
= dn
->inode
.get();
8173 dn_name
= dn
->name
; // fill in name while we have lock
8175 client_lock
.unlock();
8176 r
= cb(p
, &de
, &stx
, next_off
, in
); // _next_ offset
8178 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< dn
->offset
<< dec
8179 << " = " << r
<< dendl
;
8184 dirp
->offset
= next_off
;
8186 dirp
->next_offset
= 2;
8188 dirp
->next_offset
= dirp
->offset_low();
8189 dirp
->last_name
= dn_name
; // we successfully returned this one; update!
8190 dirp
->release_count
= 0; // last_name no longer match cache index
8195 ldout(cct
, 10) << __func__
<< " " << dirp
<< " on " << dirp
->inode
->ino
<< " at end" << dendl
;
8200 int Client::readdir_r_cb(dir_result_t
*d
, add_dirent_cb_t cb
, void *p
,
8201 unsigned want
, unsigned flags
, bool getref
)
8203 int caps
= statx_to_mask(flags
, want
);
8205 std::lock_guard
lock(client_lock
);
8210 dir_result_t
*dirp
= static_cast<dir_result_t
*>(d
);
8212 ldout(cct
, 10) << __func__
<< " " << *dirp
->inode
<< " offset " << hex
<< dirp
->offset
8213 << dec
<< " at_end=" << dirp
->at_end()
8214 << " hash_order=" << dirp
->hash_order() << dendl
;
8217 struct ceph_statx stx
;
8218 memset(&de
, 0, sizeof(de
));
8219 memset(&stx
, 0, sizeof(stx
));
8221 InodeRef
& diri
= dirp
->inode
;
8226 if (dirp
->offset
== 0) {
8227 ldout(cct
, 15) << " including ." << dendl
;
8228 ceph_assert(diri
->dentries
.size() < 2); // can't have multiple hard-links to a dir
8229 uint64_t next_off
= 1;
8232 r
= _getattr(diri
, caps
, dirp
->perms
);
8236 fill_statx(diri
, caps
, &stx
);
8237 fill_dirent(&de
, ".", S_IFDIR
, stx
.stx_ino
, next_off
);
8239 Inode
*inode
= NULL
;
8245 client_lock
.unlock();
8246 r
= cb(p
, &de
, &stx
, next_off
, inode
);
8251 dirp
->offset
= next_off
;
8255 if (dirp
->offset
== 1) {
8256 ldout(cct
, 15) << " including .." << dendl
;
8257 uint64_t next_off
= 2;
8259 if (diri
->dentries
.empty())
8262 in
= diri
->get_first_parent()->dir
->parent_inode
;
8265 r
= _getattr(in
, caps
, dirp
->perms
);
8269 fill_statx(in
, caps
, &stx
);
8270 fill_dirent(&de
, "..", S_IFDIR
, stx
.stx_ino
, next_off
);
8272 Inode
*inode
= NULL
;
8278 client_lock
.unlock();
8279 r
= cb(p
, &de
, &stx
, next_off
, inode
);
8284 dirp
->offset
= next_off
;
8289 // can we read from our cache?
8290 ldout(cct
, 10) << "offset " << hex
<< dirp
->offset
<< dec
8291 << " snapid " << dirp
->inode
->snapid
<< " (complete && ordered) "
8292 << dirp
->inode
->is_complete_and_ordered()
8293 << " issued " << ccap_string(dirp
->inode
->caps_issued())
8295 if (dirp
->inode
->snapid
!= CEPH_SNAPDIR
&&
8296 dirp
->inode
->is_complete_and_ordered() &&
8297 dirp
->inode
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
8298 int err
= _readdir_cache_cb(dirp
, cb
, p
, caps
, getref
);
8307 bool check_caps
= true;
8308 if (!dirp
->is_cached()) {
8309 int r
= _readdir_get_frag(dirp
);
8312 // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
8313 // different than the requested one. (our dirfragtree was outdated)
8316 frag_t fg
= dirp
->buffer_frag
;
8318 ldout(cct
, 10) << "frag " << fg
<< " buffer size " << dirp
->buffer
.size()
8319 << " offset " << hex
<< dirp
->offset
<< dendl
;
8321 for (auto it
= std::lower_bound(dirp
->buffer
.begin(), dirp
->buffer
.end(),
8322 dirp
->offset
, dir_result_t::dentry_off_lt());
8323 it
!= dirp
->buffer
.end();
8325 dir_result_t::dentry
&entry
= *it
;
8327 uint64_t next_off
= entry
.offset
+ 1;
8331 r
= _getattr(entry
.inode
, caps
, dirp
->perms
);
8336 fill_statx(entry
.inode
, caps
, &stx
);
8337 fill_dirent(&de
, entry
.name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
8339 Inode
*inode
= NULL
;
8341 inode
= entry
.inode
.get();
8345 client_lock
.unlock();
8346 r
= cb(p
, &de
, &stx
, next_off
, inode
); // _next_ offset
8349 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< next_off
- 1 << dec
8350 << " = " << r
<< dendl
;
8354 dirp
->offset
= next_off
;
8359 if (dirp
->next_offset
> 2) {
8360 ldout(cct
, 10) << " fetching next chunk of this frag" << dendl
;
8361 _readdir_drop_dirp_buffer(dirp
);
8365 if (!fg
.is_rightmost()) {
8367 _readdir_next_frag(dirp
);
8371 if (diri
->shared_gen
== dirp
->start_shared_gen
&&
8372 diri
->dir_release_count
== dirp
->release_count
) {
8373 if (diri
->dir_ordered_count
== dirp
->ordered_count
) {
8374 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
8376 ceph_assert(diri
->dir
->readdir_cache
.size() >= dirp
->cache_index
);
8377 diri
->dir
->readdir_cache
.resize(dirp
->cache_index
);
8379 diri
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
8381 ldout(cct
, 10) << " marking I_COMPLETE on " << *diri
<< dendl
;
8382 diri
->flags
|= I_COMPLETE
;
8394 int Client::readdir_r(dir_result_t
*d
, struct dirent
*de
)
8396 return readdirplus_r(d
, de
, 0, 0, 0, NULL
);
8403 * 1 if we got a dirent
8404 * 0 for end of directory
8408 struct single_readdir
{
8410 struct ceph_statx
*stx
;
8415 static int _readdir_single_dirent_cb(void *p
, struct dirent
*de
,
8416 struct ceph_statx
*stx
, off_t off
,
8419 single_readdir
*c
= static_cast<single_readdir
*>(p
);
8422 return -1; // already filled this dirent
8432 struct dirent
*Client::readdir(dir_result_t
*d
)
8435 static struct dirent de
;
8442 // our callback fills the dirent and sets sr.full=true on first
8443 // call, and returns -1 the second time around.
8444 ret
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
);
8446 errno
= -ret
; // this sucks.
8447 return (dirent
*) NULL
;
8452 return (dirent
*) NULL
;
8455 int Client::readdirplus_r(dir_result_t
*d
, struct dirent
*de
,
8456 struct ceph_statx
*stx
, unsigned want
,
8457 unsigned flags
, Inode
**out
)
8465 // our callback fills the dirent and sets sr.full=true on first
8466 // call, and returns -1 the second time around.
8467 int r
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
, want
, flags
, out
);
8479 struct getdents_result
{
8486 static int _readdir_getdent_cb(void *p
, struct dirent
*de
,
8487 struct ceph_statx
*stx
, off_t off
, Inode
*in
)
8489 struct getdents_result
*c
= static_cast<getdents_result
*>(p
);
8495 dlen
= strlen(de
->d_name
) + 1;
8497 if (c
->pos
+ dlen
> c
->buflen
)
8498 return -1; // doesn't fit
8501 memcpy(c
->buf
+ c
->pos
, de
, sizeof(*de
));
8503 memcpy(c
->buf
+ c
->pos
, de
->d_name
, dlen
);
8509 int Client::_getdents(dir_result_t
*dir
, char *buf
, int buflen
, bool fullent
)
8514 gr
.fullent
= fullent
;
8517 int r
= readdir_r_cb(dir
, _readdir_getdent_cb
, (void *)&gr
);
8519 if (r
< 0) { // some error
8520 if (r
== -1) { // buffer ran out of space
8521 if (gr
.pos
) { // but we got some entries already!
8523 } // or we need a larger buffer
8525 } else { // actual error, return it
8534 struct getdir_result
{
8535 list
<string
> *contents
;
8539 static int _getdir_cb(void *p
, struct dirent
*de
, struct ceph_statx
*stx
, off_t off
, Inode
*in
)
8541 getdir_result
*r
= static_cast<getdir_result
*>(p
);
8543 r
->contents
->push_back(de
->d_name
);
8548 int Client::getdir(const char *relpath
, list
<string
>& contents
,
8549 const UserPerm
& perms
)
8551 ldout(cct
, 3) << "getdir(" << relpath
<< ")" << dendl
;
8553 std::lock_guard
lock(client_lock
);
8554 tout(cct
) << "getdir" << std::endl
;
8555 tout(cct
) << relpath
<< std::endl
;
8559 int r
= opendir(relpath
, &d
, perms
);
8564 gr
.contents
= &contents
;
8566 r
= readdir_r_cb(d
, _getdir_cb
, (void *)&gr
);
8576 /****** file i/o **********/
8577 int Client::open(const char *relpath
, int flags
, const UserPerm
& perms
,
8578 mode_t mode
, int stripe_unit
, int stripe_count
,
8579 int object_size
, const char *data_pool
)
8581 ldout(cct
, 3) << "open enter(" << relpath
<< ", " << ceph_flags_sys2wire(flags
) << "," << mode
<< ")" << dendl
;
8582 std::lock_guard
lock(client_lock
);
8583 tout(cct
) << "open" << std::endl
;
8584 tout(cct
) << relpath
<< std::endl
;
8585 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
8592 #if defined(__linux__) && defined(O_PATH)
8593 /* When the O_PATH is being specified, others flags than O_DIRECTORY
8594 * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
8595 * in kernel (fs/open.c). */
8597 flags
&= O_DIRECTORY
| O_NOFOLLOW
| O_PATH
;
8600 filepath
path(relpath
);
8602 bool created
= false;
8603 /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
8604 bool followsym
= !((flags
& O_NOFOLLOW
) || ((flags
& O_CREAT
) && (flags
& O_EXCL
)));
8605 int r
= path_walk(path
, &in
, perms
, followsym
, ceph_caps_for_mode(mode
));
8607 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
8610 #if defined(__linux__) && defined(O_PATH)
8611 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
) && !(flags
& O_PATH
))
8613 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
))
8617 if (r
== -ENOENT
&& (flags
& O_CREAT
)) {
8618 filepath dirpath
= path
;
8619 string dname
= dirpath
.last_dentry();
8620 dirpath
.pop_dentry();
8622 r
= path_walk(dirpath
, &dir
, perms
, true,
8623 cct
->_conf
->client_permissions
? CEPH_CAP_AUTH_SHARED
: 0);
8626 if (cct
->_conf
->client_permissions
) {
8627 r
= may_create(dir
.get(), perms
);
8631 r
= _create(dir
.get(), dname
.c_str(), flags
, mode
, &in
, &fh
, stripe_unit
,
8632 stripe_count
, object_size
, data_pool
, &created
, perms
);
8638 // posix says we can only check permissions of existing files
8639 if (cct
->_conf
->client_permissions
) {
8640 r
= may_open(in
.get(), flags
, perms
);
8647 r
= _open(in
.get(), flags
, mode
, &fh
, perms
);
8649 // allocate a integer file descriptor
8652 ceph_assert(fd_map
.count(r
) == 0);
8657 tout(cct
) << r
<< std::endl
;
8658 ldout(cct
, 3) << "open exit(" << path
<< ", " << ceph_flags_sys2wire(flags
) << ") = " << r
<< dendl
;
8662 int Client::open(const char *relpath
, int flags
, const UserPerm
& perms
, mode_t mode
)
8664 /* Use default file striping parameters */
8665 return open(relpath
, flags
, perms
, mode
, 0, 0, 0, NULL
);
8668 int Client::lookup_hash(inodeno_t ino
, inodeno_t dirino
, const char *name
,
8669 const UserPerm
& perms
)
8671 std::lock_guard
lock(client_lock
);
8672 ldout(cct
, 3) << __func__
<< " enter(" << ino
<< ", #" << dirino
<< "/" << name
<< ")" << dendl
;
8677 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPHASH
);
8679 req
->set_filepath(path
);
8681 uint32_t h
= ceph_str_hash(CEPH_STR_HASH_RJENKINS
, name
, strlen(name
));
8683 sprintf(f
, "%u", h
);
8684 filepath
path2(dirino
);
8685 path2
.push_dentry(string(f
));
8686 req
->set_filepath2(path2
);
8688 int r
= make_request(req
, perms
, NULL
, NULL
,
8689 rand() % mdsmap
->get_num_in_mds());
8690 ldout(cct
, 3) << __func__
<< " exit(" << ino
<< ", #" << dirino
<< "/" << name
<< ") = " << r
<< dendl
;
8696 * Load inode into local cache.
8698 * If inode pointer is non-NULL, and take a reference on
8699 * the resulting Inode object in one operation, so that caller
8700 * can safely assume inode will still be there after return.
8702 int Client::_lookup_ino(inodeno_t ino
, const UserPerm
& perms
, Inode
**inode
)
8704 ldout(cct
, 8) << __func__
<< " enter(" << ino
<< ")" << dendl
;
8709 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPINO
);
8711 req
->set_filepath(path
);
8713 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
8714 if (r
== 0 && inode
!= NULL
) {
8715 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
8716 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
8717 ceph_assert(p
!= inode_map
.end());
8721 ldout(cct
, 8) << __func__
<< " exit(" << ino
<< ") = " << r
<< dendl
;
8725 int Client::lookup_ino(inodeno_t ino
, const UserPerm
& perms
, Inode
**inode
)
8727 std::lock_guard
lock(client_lock
);
8728 return _lookup_ino(ino
, perms
, inode
);
8732 * Find the parent inode of `ino` and insert it into
8733 * our cache. Conditionally also set `parent` to a referenced
8734 * Inode* if caller provides non-NULL value.
8736 int Client::_lookup_parent(Inode
*ino
, const UserPerm
& perms
, Inode
**parent
)
8738 ldout(cct
, 8) << __func__
<< " enter(" << ino
->ino
<< ")" << dendl
;
8740 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
8741 filepath
path(ino
->ino
);
8742 req
->set_filepath(path
);
8745 int r
= make_request(req
, perms
, &target
, NULL
, rand() % mdsmap
->get_num_in_mds());
8746 // Give caller a reference to the parent ino if they provided a pointer.
8747 if (parent
!= NULL
) {
8749 *parent
= target
.get();
8751 ldout(cct
, 8) << __func__
<< " found parent " << (*parent
)->ino
<< dendl
;
8756 ldout(cct
, 8) << __func__
<< " exit(" << ino
->ino
<< ") = " << r
<< dendl
;
8761 * Populate the parent dentry for `ino`, provided it is
8762 * a child of `parent`.
8764 int Client::_lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
8766 ceph_assert(parent
->is_dir());
8767 ldout(cct
, 3) << __func__
<< " enter(" << ino
->ino
<< ")" << dendl
;
8772 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
8773 req
->set_filepath2(filepath(parent
->ino
));
8774 req
->set_filepath(filepath(ino
->ino
));
8775 req
->set_inode(ino
);
8777 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
8778 ldout(cct
, 3) << __func__
<< " exit(" << ino
->ino
<< ") = " << r
<< dendl
;
8782 int Client::lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
8784 std::lock_guard
lock(client_lock
);
8785 return _lookup_name(ino
, parent
, perms
);
8788 Fh
*Client::_create_fh(Inode
*in
, int flags
, int cmode
, const UserPerm
& perms
)
8791 Fh
*f
= new Fh(in
, flags
, cmode
, fd_gen
, perms
);
8793 ldout(cct
, 10) << __func__
<< " " << in
->ino
<< " mode " << cmode
<< dendl
;
8795 if (in
->snapid
!= CEPH_NOSNAP
) {
8796 in
->snap_cap_refs
++;
8797 ldout(cct
, 5) << "open success, fh is " << f
<< " combined IMMUTABLE SNAP caps "
8798 << ccap_string(in
->caps_issued()) << dendl
;
8801 const auto& conf
= cct
->_conf
;
8802 f
->readahead
.set_trigger_requests(1);
8803 f
->readahead
.set_min_readahead_size(conf
->client_readahead_min
);
8804 uint64_t max_readahead
= Readahead::NO_LIMIT
;
8805 if (conf
->client_readahead_max_bytes
) {
8806 max_readahead
= std::min(max_readahead
, (uint64_t)conf
->client_readahead_max_bytes
);
8808 if (conf
->client_readahead_max_periods
) {
8809 max_readahead
= std::min(max_readahead
, in
->layout
.get_period()*(uint64_t)conf
->client_readahead_max_periods
);
8811 f
->readahead
.set_max_readahead_size(max_readahead
);
8812 vector
<uint64_t> alignments
;
8813 alignments
.push_back(in
->layout
.get_period());
8814 alignments
.push_back(in
->layout
.stripe_unit
);
8815 f
->readahead
.set_alignments(alignments
);
8820 int Client::_release_fh(Fh
*f
)
8822 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
8823 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
8824 Inode
*in
= f
->inode
.get();
8825 ldout(cct
, 8) << __func__
<< " " << f
<< " mode " << f
->mode
<< " on " << *in
<< dendl
;
8829 if (in
->snapid
== CEPH_NOSNAP
) {
8830 if (in
->put_open_ref(f
->mode
)) {
8831 _flush(in
, new C_Client_FlushComplete(this, in
));
8835 ceph_assert(in
->snap_cap_refs
> 0);
8836 in
->snap_cap_refs
--;
8839 _release_filelocks(f
);
8841 // Finally, read any async err (i.e. from flushes)
8842 int err
= f
->take_async_err();
8844 ldout(cct
, 1) << __func__
<< " " << f
<< " on inode " << *in
<< " caught async_err = "
8845 << cpp_strerror(err
) << dendl
;
8847 ldout(cct
, 10) << __func__
<< " " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
8855 void Client::_put_fh(Fh
*f
)
8857 int left
= f
->put();
8863 int Client::_open(Inode
*in
, int flags
, mode_t mode
, Fh
**fhp
,
8864 const UserPerm
& perms
)
8866 if (in
->snapid
!= CEPH_NOSNAP
&&
8867 (flags
& (O_WRONLY
| O_RDWR
| O_CREAT
| O_TRUNC
| O_APPEND
))) {
8871 // use normalized flags to generate cmode
8872 int cflags
= ceph_flags_sys2wire(flags
);
8873 if (cct
->_conf
.get_val
<bool>("client_force_lazyio"))
8874 cflags
|= CEPH_O_LAZY
;
8876 int cmode
= ceph_flags_to_mode(cflags
);
8877 int want
= ceph_caps_for_mode(cmode
);
8880 in
->get_open_ref(cmode
); // make note of pending open, since it effects _wanted_ caps.
8882 if ((flags
& O_TRUNC
) == 0 && in
->caps_issued_mask(want
)) {
8884 check_caps(in
, CHECK_CAPS_NODELAY
);
8887 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
8889 in
->make_nosnap_relative_path(path
);
8890 req
->set_filepath(path
);
8891 req
->head
.args
.open
.flags
= cflags
& ~CEPH_O_CREAT
;
8892 req
->head
.args
.open
.mode
= mode
;
8893 req
->head
.args
.open
.pool
= -1;
8894 if (cct
->_conf
->client_debug_getattr_caps
)
8895 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
8897 req
->head
.args
.open
.mask
= 0;
8898 req
->head
.args
.open
.old_size
= in
->size
; // for O_TRUNC
8900 result
= make_request(req
, perms
);
8903 * NFS expects that delegations will be broken on a conflicting open,
8904 * not just when there is actual conflicting access to the file. SMB leases
8905 * and oplocks also have similar semantics.
8907 * Ensure that clients that have delegations enabled will wait on minimal
8908 * caps during open, just to ensure that other clients holding delegations
8909 * return theirs first.
8911 if (deleg_timeout
&& result
== 0) {
8914 if (cmode
& CEPH_FILE_MODE_WR
)
8915 need
|= CEPH_CAP_FILE_WR
;
8916 if (cmode
& CEPH_FILE_MODE_RD
)
8917 need
|= CEPH_CAP_FILE_RD
;
8919 Fh
fh(in
, flags
, cmode
, fd_gen
, perms
);
8920 result
= get_caps(&fh
, need
, want
, &have
, -1);
8922 ldout(cct
, 8) << "Unable to get caps after open of inode " << *in
<<
8923 " . Denying open: " <<
8924 cpp_strerror(result
) << dendl
;
8925 in
->put_open_ref(cmode
);
8927 put_cap_ref(in
, need
);
8935 *fhp
= _create_fh(in
, flags
, cmode
, perms
);
8937 in
->put_open_ref(cmode
);
8945 int Client::_renew_caps(Inode
*in
)
8947 int wanted
= in
->caps_file_wanted();
8948 if (in
->is_any_caps() &&
8949 ((wanted
& CEPH_CAP_ANY_WR
) == 0 || in
->auth_cap
)) {
8950 check_caps(in
, CHECK_CAPS_NODELAY
);
8955 if ((wanted
& CEPH_CAP_FILE_RD
) && (wanted
& CEPH_CAP_FILE_WR
))
8957 else if (wanted
& CEPH_CAP_FILE_RD
)
8959 else if (wanted
& CEPH_CAP_FILE_WR
)
8962 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
8964 in
->make_nosnap_relative_path(path
);
8965 req
->set_filepath(path
);
8966 req
->head
.args
.open
.flags
= flags
;
8967 req
->head
.args
.open
.pool
= -1;
8968 if (cct
->_conf
->client_debug_getattr_caps
)
8969 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
8971 req
->head
.args
.open
.mask
= 0;
8974 // duplicate in case Cap goes away; not sure if that race is a concern?
8975 const UserPerm
*pperm
= in
->get_best_perms();
8979 int ret
= make_request(req
, perms
);
8983 int Client::close(int fd
)
8985 ldout(cct
, 3) << "close enter(" << fd
<< ")" << dendl
;
8986 std::lock_guard
lock(client_lock
);
8987 tout(cct
) << "close" << std::endl
;
8988 tout(cct
) << fd
<< std::endl
;
8993 Fh
*fh
= get_filehandle(fd
);
8996 int err
= _release_fh(fh
);
8999 ldout(cct
, 3) << "close exit(" << fd
<< ")" << dendl
;
9007 loff_t
Client::lseek(int fd
, loff_t offset
, int whence
)
9009 std::lock_guard
lock(client_lock
);
9010 tout(cct
) << "lseek" << std::endl
;
9011 tout(cct
) << fd
<< std::endl
;
9012 tout(cct
) << offset
<< std::endl
;
9013 tout(cct
) << whence
<< std::endl
;
9018 Fh
*f
= get_filehandle(fd
);
9021 #if defined(__linux__) && defined(O_PATH)
9022 if (f
->flags
& O_PATH
)
9025 return _lseek(f
, offset
, whence
);
9028 loff_t
Client::_lseek(Fh
*f
, loff_t offset
, int whence
)
9030 Inode
*in
= f
->inode
.get();
9031 bool whence_check
= false;
9036 whence_check
= true;
9041 whence_check
= true;
9047 whence_check
= true;
9053 int r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
9064 pos
= f
->pos
+ offset
;
9068 pos
= in
->size
+ offset
;
9073 if (offset
< 0 || static_cast<uint64_t>(offset
) >= in
->size
)
9081 if (offset
< 0 || static_cast<uint64_t>(offset
) >= in
->size
)
9088 ldout(cct
, 1) << __func__
<< ": invalid whence value " << whence
<< dendl
;
9098 ldout(cct
, 8) << "_lseek(" << f
<< ", " << offset
<< ", " << whence
<< ") = " << f
->pos
<< dendl
;
9103 void Client::lock_fh_pos(Fh
*f
)
9105 ldout(cct
, 10) << __func__
<< " " << f
<< dendl
;
9107 if (f
->pos_locked
|| !f
->pos_waiters
.empty()) {
9108 ceph::condition_variable cond
;
9109 f
->pos_waiters
.push_back(&cond
);
9110 ldout(cct
, 10) << __func__
<< " BLOCKING on " << f
<< dendl
;
9111 std::unique_lock l
{client_lock
, std::adopt_lock
};
9112 cond
.wait(l
, [f
, me
=&cond
] {
9113 return !f
->pos_locked
&& f
->pos_waiters
.front() == me
;
9116 ldout(cct
, 10) << __func__
<< " UNBLOCKING on " << f
<< dendl
;
9117 ceph_assert(f
->pos_waiters
.front() == &cond
);
9118 f
->pos_waiters
.pop_front();
9121 f
->pos_locked
= true;
9124 void Client::unlock_fh_pos(Fh
*f
)
9126 ldout(cct
, 10) << __func__
<< " " << f
<< dendl
;
9127 f
->pos_locked
= false;
9130 int Client::uninline_data(Inode
*in
, Context
*onfinish
)
9132 if (!in
->inline_data
.length()) {
9133 onfinish
->complete(0);
9138 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (long long unsigned)in
->ino
);
9139 object_t oid
= oid_buf
;
9141 ObjectOperation create_ops
;
9142 create_ops
.create(false);
9144 objecter
->mutate(oid
,
9145 OSDMap::file_to_object_locator(in
->layout
),
9147 in
->snaprealm
->get_snap_context(),
9148 ceph::real_clock::now(),
9152 bufferlist inline_version_bl
;
9153 encode(in
->inline_version
, inline_version_bl
);
9155 ObjectOperation uninline_ops
;
9156 uninline_ops
.cmpxattr("inline_version",
9157 CEPH_OSD_CMPXATTR_OP_GT
,
9158 CEPH_OSD_CMPXATTR_MODE_U64
,
9160 bufferlist inline_data
= in
->inline_data
;
9161 uninline_ops
.write(0, inline_data
, in
->truncate_size
, in
->truncate_seq
);
9162 uninline_ops
.setxattr("inline_version", stringify(in
->inline_version
));
9164 objecter
->mutate(oid
,
9165 OSDMap::file_to_object_locator(in
->layout
),
9167 in
->snaprealm
->get_snap_context(),
9168 ceph::real_clock::now(),
9177 // blocking osd interface
9179 int Client::read(int fd
, char *buf
, loff_t size
, loff_t offset
)
9181 std::unique_lock
lock(client_lock
);
9182 tout(cct
) << "read" << std::endl
;
9183 tout(cct
) << fd
<< std::endl
;
9184 tout(cct
) << size
<< std::endl
;
9185 tout(cct
) << offset
<< std::endl
;
9190 Fh
*f
= get_filehandle(fd
);
9193 #if defined(__linux__) && defined(O_PATH)
9194 if (f
->flags
& O_PATH
)
9198 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9199 size
= std::min(size
, (loff_t
)INT_MAX
);
9200 int r
= _read(f
, offset
, size
, &bl
);
9201 ldout(cct
, 3) << "read(" << fd
<< ", " << (void*)buf
<< ", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
9204 bl
.begin().copy(bl
.length(), buf
);
9210 int Client::preadv(int fd
, const struct iovec
*iov
, int iovcnt
, loff_t offset
)
9214 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, false);
9217 int64_t Client::_read(Fh
*f
, int64_t offset
, uint64_t size
, bufferlist
*bl
)
9220 bool movepos
= false;
9221 std::unique_ptr
<C_SaferCond
> onuninline
;
9223 const auto& conf
= cct
->_conf
;
9224 Inode
*in
= f
->inode
.get();
9226 utime_t start
= ceph_clock_now();
9228 if ((f
->mode
& CEPH_FILE_MODE_RD
) == 0)
9230 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9237 loff_t start_pos
= offset
;
9239 if (in
->inline_version
== 0) {
9240 r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
9244 ceph_assert(in
->inline_version
> 0);
9248 if (f
->mode
& CEPH_FILE_MODE_LAZY
)
9249 want
= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
;
9251 want
= CEPH_CAP_FILE_CACHE
;
9252 r
= get_caps(f
, CEPH_CAP_FILE_RD
, want
, &have
, -1);
9256 if (f
->flags
& O_DIRECT
)
9257 have
&= ~(CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
);
9259 if (in
->inline_version
< CEPH_INLINE_NONE
) {
9260 if (!(have
& CEPH_CAP_FILE_CACHE
)) {
9261 onuninline
.reset(new C_SaferCond("Client::_read_uninline_data flock"));
9262 uninline_data(in
, onuninline
.get());
9264 uint32_t len
= in
->inline_data
.length();
9265 uint64_t endoff
= offset
+ size
;
9266 if (endoff
> in
->size
)
9270 if (endoff
<= len
) {
9271 bl
->substr_of(in
->inline_data
, offset
, endoff
- offset
);
9273 bl
->substr_of(in
->inline_data
, offset
, len
- offset
);
9274 bl
->append_zero(endoff
- len
);
9276 r
= endoff
- offset
;
9277 } else if ((uint64_t)offset
< endoff
) {
9278 bl
->append_zero(endoff
- offset
);
9279 r
= endoff
- offset
;
9287 if (!conf
->client_debug_force_sync_read
&&
9289 (have
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
))) {
9291 if (f
->flags
& O_RSYNC
) {
9292 _flush_range(in
, offset
, size
);
9294 r
= _read_async(f
, offset
, size
, bl
);
9298 if (f
->flags
& O_DIRECT
)
9299 _flush_range(in
, offset
, size
);
9301 bool checkeof
= false;
9302 r
= _read_sync(f
, offset
, size
, bl
, &checkeof
);
9309 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
9312 r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
9317 if ((uint64_t)offset
< in
->size
)
9323 ceph_assert(r
>= 0);
9326 f
->pos
= start_pos
+ r
;
9329 lat
= ceph_clock_now();
9331 logger
->tinc(l_c_read
, lat
);
9337 client_lock
.unlock();
9338 int ret
= onuninline
->wait();
9340 if (ret
>= 0 || ret
== -ECANCELED
) {
9341 in
->inline_data
.clear();
9342 in
->inline_version
= CEPH_INLINE_NONE
;
9343 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9349 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
9357 Client::C_Readahead::C_Readahead(Client
*c
, Fh
*f
) :
9360 f
->readahead
.inc_pending();
9363 Client::C_Readahead::~C_Readahead() {
9364 f
->readahead
.dec_pending();
9368 void Client::C_Readahead::finish(int r
) {
9369 lgeneric_subdout(client
->cct
, client
, 20) << "client." << client
->get_nodeid() << " " << "C_Readahead on " << f
->inode
<< dendl
;
9370 client
->put_cap_ref(f
->inode
.get(), CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
9373 int Client::_read_async(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
)
9375 const auto& conf
= cct
->_conf
;
9376 Inode
*in
= f
->inode
.get();
9378 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
9380 // trim read based on file size?
9381 if (off
>= in
->size
)
9385 if (off
+ len
> in
->size
) {
9386 len
= in
->size
- off
;
9389 ldout(cct
, 10) << " min_bytes=" << f
->readahead
.get_min_readahead_size()
9390 << " max_bytes=" << f
->readahead
.get_max_readahead_size()
9391 << " max_periods=" << conf
->client_readahead_max_periods
<< dendl
;
9393 // read (and possibly block)
9395 C_SaferCond
onfinish("Client::_read_async flock");
9396 r
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
9397 off
, len
, bl
, 0, &onfinish
);
9399 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
9400 client_lock
.unlock();
9401 r
= onfinish
.wait();
9403 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
9406 if(f
->readahead
.get_min_readahead_size() > 0) {
9407 pair
<uint64_t, uint64_t> readahead_extent
= f
->readahead
.update(off
, len
, in
->size
);
9408 if (readahead_extent
.second
> 0) {
9409 ldout(cct
, 20) << "readahead " << readahead_extent
.first
<< "~" << readahead_extent
.second
9410 << " (caller wants " << off
<< "~" << len
<< ")" << dendl
;
9411 Context
*onfinish2
= new C_Readahead(this, f
);
9412 int r2
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
9413 readahead_extent
.first
, readahead_extent
.second
,
9414 NULL
, 0, onfinish2
);
9416 ldout(cct
, 20) << "readahead initiated, c " << onfinish2
<< dendl
;
9417 get_cap_ref(in
, CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
9419 ldout(cct
, 20) << "readahead was no-op, already cached" << dendl
;
9428 int Client::_read_sync(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
,
9431 Inode
*in
= f
->inode
.get();
9436 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << off
<< "~" << len
<< dendl
;
9439 C_SaferCond
onfinish("Client::_read_sync flock");
9443 filer
->read_trunc(in
->ino
, &in
->layout
, in
->snapid
,
9445 in
->truncate_size
, in
->truncate_seq
,
9447 client_lock
.unlock();
9448 int r
= onfinish
.wait();
9451 // if we get ENOENT from OSD, assume 0 bytes returned
9462 bl
->claim_append(tbl
);
9465 if (r
>= 0 && r
< wanted
) {
9466 if (pos
< in
->size
) {
9467 // zero up to known EOF
9468 int64_t some
= in
->size
- pos
;
9471 auto z
= buffer::ptr_node::create(some
);
9473 bl
->push_back(std::move(z
));
9490 * we keep count of uncommitted sync writes on the inode, so that
9493 void Client::_sync_write_commit(Inode
*in
)
9495 ceph_assert(unsafe_sync_write
> 0);
9496 unsafe_sync_write
--;
9498 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9500 ldout(cct
, 15) << __func__
<< " unsafe_sync_write = " << unsafe_sync_write
<< dendl
;
9501 if (unsafe_sync_write
== 0 && unmounting
) {
9502 ldout(cct
, 10) << __func__
<< " -- no more unsafe writes, unmount can proceed" << dendl
;
9503 mount_cond
.notify_all();
9507 int Client::write(int fd
, const char *buf
, loff_t size
, loff_t offset
)
9509 std::lock_guard
lock(client_lock
);
9510 tout(cct
) << "write" << std::endl
;
9511 tout(cct
) << fd
<< std::endl
;
9512 tout(cct
) << size
<< std::endl
;
9513 tout(cct
) << offset
<< std::endl
;
9518 Fh
*fh
= get_filehandle(fd
);
9521 #if defined(__linux__) && defined(O_PATH)
9522 if (fh
->flags
& O_PATH
)
9525 /* We can't return bytes written larger than INT_MAX, clamp size to that */
9526 size
= std::min(size
, (loff_t
)INT_MAX
);
9527 int r
= _write(fh
, offset
, size
, buf
, NULL
, false);
9528 ldout(cct
, 3) << "write(" << fd
<< ", \"...\", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
9532 int Client::pwritev(int fd
, const struct iovec
*iov
, int iovcnt
, int64_t offset
)
9536 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, true);
9539 int64_t Client::_preadv_pwritev_locked(Fh
*fh
, const struct iovec
*iov
,
9540 unsigned iovcnt
, int64_t offset
, bool write
,
9543 #if defined(__linux__) && defined(O_PATH)
9544 if (fh
->flags
& O_PATH
)
9547 loff_t totallen
= 0;
9548 for (unsigned i
= 0; i
< iovcnt
; i
++) {
9549 totallen
+= iov
[i
].iov_len
;
9553 * Some of the API functions take 64-bit size values, but only return
9554 * 32-bit signed integers. Clamp the I/O sizes in those functions so that
9555 * we don't do I/Os larger than the values we can return.
9558 totallen
= std::min(totallen
, (loff_t
)INT_MAX
);
9561 int64_t w
= _write(fh
, offset
, totallen
, NULL
, iov
, iovcnt
);
9562 ldout(cct
, 3) << "pwritev(" << fh
<< ", \"...\", " << totallen
<< ", " << offset
<< ") = " << w
<< dendl
;
9566 int64_t r
= _read(fh
, offset
, totallen
, &bl
);
9567 ldout(cct
, 3) << "preadv(" << fh
<< ", " << offset
<< ") = " << r
<< dendl
;
9571 auto iter
= bl
.cbegin();
9572 for (unsigned j
= 0, resid
= r
; j
< iovcnt
&& resid
> 0; j
++) {
9574 * This piece of code aims to handle the case that bufferlist does not have enough data
9575 * to fill in the iov
9577 const auto round_size
= std::min
<unsigned>(resid
, iov
[j
].iov_len
);
9578 iter
.copy(round_size
, reinterpret_cast<char*>(iov
[j
].iov_base
));
9579 resid
-= round_size
;
9580 /* iter is self-updating */
9586 int Client::_preadv_pwritev(int fd
, const struct iovec
*iov
, unsigned iovcnt
, int64_t offset
, bool write
)
9588 std::lock_guard
lock(client_lock
);
9589 tout(cct
) << fd
<< std::endl
;
9590 tout(cct
) << offset
<< std::endl
;
9595 Fh
*fh
= get_filehandle(fd
);
9598 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, offset
, write
, true);
9601 int64_t Client::_write(Fh
*f
, int64_t offset
, uint64_t size
, const char *buf
,
9602 const struct iovec
*iov
, int iovcnt
)
9606 if ((uint64_t)(offset
+size
) > mdsmap
->get_max_filesize()) //too large!
9609 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
9610 Inode
*in
= f
->inode
.get();
9612 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
9616 ceph_assert(in
->snapid
== CEPH_NOSNAP
);
9618 // was Fh opened as writeable?
9619 if ((f
->mode
& CEPH_FILE_MODE_WR
) == 0)
9622 // use/adjust fd pos?
9626 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
9627 * change out from under us.
9629 if (f
->flags
& O_APPEND
) {
9630 auto r
= _lseek(f
, 0, SEEK_END
);
9642 uint64_t endoff
= offset
+ size
;
9643 if (endoff
> in
->size
&& is_quota_bytes_exceeded(in
, endoff
- in
->size
,
9648 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9650 ldout(cct
, 10) << "cur file size is " << in
->size
<< dendl
;
9653 utime_t start
= ceph_clock_now();
9655 if (in
->inline_version
== 0) {
9656 int r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
9659 ceph_assert(in
->inline_version
> 0);
9662 // copy into fresh buffer (since our write may be resub, async)
9666 bl
.append(buf
, size
);
9668 for (int i
= 0; i
< iovcnt
; i
++) {
9669 if (iov
[i
].iov_len
> 0) {
9670 bl
.append((const char *)iov
[i
].iov_base
, iov
[i
].iov_len
);
9676 uint64_t totalwritten
;
9678 if (f
->mode
& CEPH_FILE_MODE_LAZY
)
9679 want
= CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
;
9681 want
= CEPH_CAP_FILE_BUFFER
;
9682 int r
= get_caps(f
, CEPH_CAP_FILE_WR
|CEPH_CAP_AUTH_SHARED
, want
, &have
, endoff
);
9686 /* clear the setuid/setgid bits, if any */
9687 if (unlikely(in
->mode
& (S_ISUID
|S_ISGID
)) && size
> 0) {
9688 struct ceph_statx stx
= { 0 };
9690 put_cap_ref(in
, CEPH_CAP_AUTH_SHARED
);
9691 r
= __setattrx(in
, &stx
, CEPH_SETATTR_KILL_SGUID
, f
->actor_perms
);
9695 put_cap_ref(in
, CEPH_CAP_AUTH_SHARED
);
9698 if (f
->flags
& O_DIRECT
)
9699 have
&= ~(CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
);
9701 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
9703 std::unique_ptr
<C_SaferCond
> onuninline
= nullptr;
9705 if (in
->inline_version
< CEPH_INLINE_NONE
) {
9706 if (endoff
> cct
->_conf
->client_max_inline_size
||
9707 endoff
> CEPH_INLINE_MAX_SIZE
||
9708 !(have
& CEPH_CAP_FILE_BUFFER
)) {
9709 onuninline
.reset(new C_SaferCond("Client::_write_uninline_data flock"));
9710 uninline_data(in
, onuninline
.get());
9712 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9714 uint32_t len
= in
->inline_data
.length();
9717 in
->inline_data
.begin(endoff
).copy(len
- endoff
, bl
); // XXX
9720 in
->inline_data
.splice(offset
, len
- offset
);
9721 else if (offset
> len
)
9722 in
->inline_data
.append_zero(offset
- len
);
9724 in
->inline_data
.append(bl
);
9725 in
->inline_version
++;
9727 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9733 if (cct
->_conf
->client_oc
&&
9734 (have
& (CEPH_CAP_FILE_BUFFER
| CEPH_CAP_FILE_LAZYIO
))) {
9735 // do buffered write
9736 if (!in
->oset
.dirty_or_tx
)
9737 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
9739 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9741 // async, caching, non-blocking.
9742 r
= objectcacher
->file_write(&in
->oset
, &in
->layout
,
9743 in
->snaprealm
->get_snap_context(),
9744 offset
, size
, bl
, ceph::real_clock::now(),
9746 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9751 // flush cached write if O_SYNC is set on file fh
9752 // O_DSYNC == O_SYNC on linux < 2.6.33
9753 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
9754 if ((f
->flags
& O_SYNC
) || (f
->flags
& O_DSYNC
)) {
9755 _flush_range(in
, offset
, size
);
9758 if (f
->flags
& O_DIRECT
)
9759 _flush_range(in
, offset
, size
);
9761 // simple, non-atomic sync write
9762 C_SaferCond
onfinish("Client::_write flock");
9763 unsafe_sync_write
++;
9764 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
); // released by onsafe callback
9766 filer
->write_trunc(in
->ino
, &in
->layout
, in
->snaprealm
->get_snap_context(),
9767 offset
, size
, bl
, ceph::real_clock::now(), 0,
9768 in
->truncate_size
, in
->truncate_seq
,
9770 client_lock
.unlock();
9771 r
= onfinish
.wait();
9773 _sync_write_commit(in
);
9778 // if we get here, write was successful, update client metadata
9781 lat
= ceph_clock_now();
9783 logger
->tinc(l_c_wrlat
, lat
);
9790 totalwritten
= size
;
9791 r
= (int64_t)totalwritten
;
9794 if (totalwritten
+ offset
> in
->size
) {
9795 in
->size
= totalwritten
+ offset
;
9796 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9798 if (is_quota_bytes_approaching(in
, f
->actor_perms
)) {
9799 check_caps(in
, CHECK_CAPS_NODELAY
);
9800 } else if (is_max_size_approaching(in
)) {
9804 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", extending file size" << dendl
;
9806 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", leaving file size at " << in
->size
<< dendl
;
9810 in
->mtime
= in
->ctime
= ceph_clock_now();
9812 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9816 if (nullptr != onuninline
) {
9817 client_lock
.unlock();
9818 int uninline_ret
= onuninline
->wait();
9821 if (uninline_ret
>= 0 || uninline_ret
== -ECANCELED
) {
9822 in
->inline_data
.clear();
9823 in
->inline_version
= CEPH_INLINE_NONE
;
9824 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9830 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
9834 int Client::_flush(Fh
*f
)
9836 Inode
*in
= f
->inode
.get();
9837 int err
= f
->take_async_err();
9839 ldout(cct
, 1) << __func__
<< ": " << f
<< " on inode " << *in
<< " caught async_err = "
9840 << cpp_strerror(err
) << dendl
;
9842 ldout(cct
, 10) << __func__
<< ": " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
9848 int Client::truncate(const char *relpath
, loff_t length
, const UserPerm
& perms
)
9850 struct ceph_statx stx
;
9851 stx
.stx_size
= length
;
9852 return setattrx(relpath
, &stx
, CEPH_SETATTR_SIZE
, perms
);
9855 int Client::ftruncate(int fd
, loff_t length
, const UserPerm
& perms
)
9857 std::lock_guard
lock(client_lock
);
9858 tout(cct
) << __func__
<< std::endl
;
9859 tout(cct
) << fd
<< std::endl
;
9860 tout(cct
) << length
<< std::endl
;
9865 Fh
*f
= get_filehandle(fd
);
9868 #if defined(__linux__) && defined(O_PATH)
9869 if (f
->flags
& O_PATH
)
9873 attr
.st_size
= length
;
9874 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_SIZE
, perms
);
9877 int Client::fsync(int fd
, bool syncdataonly
)
9879 std::lock_guard
lock(client_lock
);
9880 tout(cct
) << "fsync" << std::endl
;
9881 tout(cct
) << fd
<< std::endl
;
9882 tout(cct
) << syncdataonly
<< std::endl
;
9887 Fh
*f
= get_filehandle(fd
);
9890 #if defined(__linux__) && defined(O_PATH)
9891 if (f
->flags
& O_PATH
)
9894 int r
= _fsync(f
, syncdataonly
);
9896 // The IOs in this fsync were okay, but maybe something happened
9897 // in the background that we shoudl be reporting?
9898 r
= f
->take_async_err();
9899 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
9900 << ") = 0, async_err = " << r
<< dendl
;
9902 // Assume that an error we encountered during fsync, even reported
9903 // synchronously, would also have applied the error to the Fh, and we
9904 // should clear it here to avoid returning the same error again on next
9906 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
<< ") = "
9908 f
->take_async_err();
9913 int Client::_fsync(Inode
*in
, bool syncdataonly
)
9916 std::unique_ptr
<C_SaferCond
> object_cacher_completion
= nullptr;
9917 ceph_tid_t flush_tid
= 0;
9920 utime_t start
= ceph_clock_now();
9922 ldout(cct
, 8) << "_fsync on " << *in
<< " " << (syncdataonly
? "(dataonly)":"(data+metadata)") << dendl
;
9924 if (cct
->_conf
->client_oc
) {
9925 object_cacher_completion
.reset(new C_SaferCond("Client::_fsync::lock"));
9926 tmp_ref
= in
; // take a reference; C_SaferCond doesn't and _flush won't either
9927 _flush(in
, object_cacher_completion
.get());
9928 ldout(cct
, 15) << "using return-valued form of _fsync" << dendl
;
9931 if (!syncdataonly
&& in
->dirty_caps
) {
9932 check_caps(in
, CHECK_CAPS_NODELAY
|CHECK_CAPS_SYNCHRONOUS
);
9933 if (in
->flushing_caps
)
9934 flush_tid
= last_flush_tid
;
9935 } else ldout(cct
, 10) << "no metadata needs to commit" << dendl
;
9937 if (!syncdataonly
&& !in
->unsafe_ops
.empty()) {
9940 MetaRequest
*req
= in
->unsafe_ops
.back();
9941 ldout(cct
, 15) << "waiting on unsafe requests, last tid " << req
->get_tid() << dendl
;
9944 wait_on_list(req
->waitfor_safe
);
9948 if (nullptr != object_cacher_completion
) { // wait on a real reply instead of guessing
9949 client_lock
.unlock();
9950 ldout(cct
, 15) << "waiting on data to flush" << dendl
;
9951 r
= object_cacher_completion
->wait();
9953 ldout(cct
, 15) << "got " << r
<< " from flush writeback" << dendl
;
9955 // FIXME: this can starve
9956 while (in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] > 0) {
9957 ldout(cct
, 10) << "ino " << in
->ino
<< " has " << in
->cap_refs
[CEPH_CAP_FILE_BUFFER
]
9958 << " uncommitted, waiting" << dendl
;
9959 wait_on_list(in
->waitfor_commit
);
9965 wait_sync_caps(in
, flush_tid
);
9967 ldout(cct
, 10) << "ino " << in
->ino
<< " has no uncommitted writes" << dendl
;
9969 ldout(cct
, 8) << "ino " << in
->ino
<< " failed to commit to disk! "
9970 << cpp_strerror(-r
) << dendl
;
9973 lat
= ceph_clock_now();
9975 logger
->tinc(l_c_fsync
, lat
);
9980 int Client::_fsync(Fh
*f
, bool syncdataonly
)
9982 ldout(cct
, 8) << "_fsync(" << f
<< ", " << (syncdataonly
? "dataonly)":"data+metadata)") << dendl
;
9983 return _fsync(f
->inode
.get(), syncdataonly
);
9986 int Client::fstat(int fd
, struct stat
*stbuf
, const UserPerm
& perms
, int mask
)
9988 std::lock_guard
lock(client_lock
);
9989 tout(cct
) << "fstat mask " << hex
<< mask
<< dec
<< std::endl
;
9990 tout(cct
) << fd
<< std::endl
;
9995 Fh
*f
= get_filehandle(fd
);
9998 int r
= _getattr(f
->inode
, mask
, perms
);
10001 fill_stat(f
->inode
, stbuf
, NULL
);
10002 ldout(cct
, 5) << "fstat(" << fd
<< ", " << stbuf
<< ") = " << r
<< dendl
;
10006 int Client::fstatx(int fd
, struct ceph_statx
*stx
, const UserPerm
& perms
,
10007 unsigned int want
, unsigned int flags
)
10009 std::lock_guard
lock(client_lock
);
10010 tout(cct
) << "fstatx flags " << hex
<< flags
<< " want " << want
<< dec
<< std::endl
;
10011 tout(cct
) << fd
<< std::endl
;
10016 Fh
*f
= get_filehandle(fd
);
10020 unsigned mask
= statx_to_mask(flags
, want
);
10023 if (mask
&& !f
->inode
->caps_issued_mask(mask
, true)) {
10024 r
= _getattr(f
->inode
, mask
, perms
);
10026 ldout(cct
, 3) << "fstatx exit on error!" << dendl
;
10031 fill_statx(f
->inode
, mask
, stx
);
10032 ldout(cct
, 3) << "fstatx(" << fd
<< ", " << stx
<< ") = " << r
<< dendl
;
10036 // not written yet, but i want to link!
10038 int Client::chdir(const char *relpath
, std::string
&new_cwd
,
10039 const UserPerm
& perms
)
10041 std::lock_guard
lock(client_lock
);
10042 tout(cct
) << "chdir" << std::endl
;
10043 tout(cct
) << relpath
<< std::endl
;
10048 filepath
path(relpath
);
10050 int r
= path_walk(path
, &in
, perms
);
10054 if (!(in
.get()->is_dir()))
10059 ldout(cct
, 3) << "chdir(" << relpath
<< ") cwd now " << cwd
->ino
<< dendl
;
10061 _getcwd(new_cwd
, perms
);
10065 void Client::_getcwd(string
& dir
, const UserPerm
& perms
)
10068 ldout(cct
, 10) << __func__
<< " " << *cwd
<< dendl
;
10070 Inode
*in
= cwd
.get();
10071 while (in
!= root
) {
10072 ceph_assert(in
->dentries
.size() < 2); // dirs can't be hard-linked
10074 // A cwd or ancestor is unlinked
10075 if (in
->dentries
.empty()) {
10079 Dentry
*dn
= in
->get_first_parent();
10084 ldout(cct
, 10) << __func__
<< " looking up parent for " << *in
<< dendl
;
10085 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
10086 filepath
path(in
->ino
);
10087 req
->set_filepath(path
);
10088 req
->set_inode(in
);
10089 int res
= make_request(req
, perms
);
10098 path
.push_front_dentry(dn
->name
);
10099 in
= dn
->dir
->parent_inode
;
10102 dir
+= path
.get_path();
10105 void Client::getcwd(string
& dir
, const UserPerm
& perms
)
10107 std::lock_guard
l(client_lock
);
10109 _getcwd(dir
, perms
);
10112 int Client::statfs(const char *path
, struct statvfs
*stbuf
,
10113 const UserPerm
& perms
)
10115 std::lock_guard
l(client_lock
);
10116 tout(cct
) << __func__
<< std::endl
;
10117 unsigned long int total_files_on_fs
;
10125 const vector
<int64_t> &data_pools
= mdsmap
->get_data_pools();
10126 if (data_pools
.size() == 1) {
10127 objecter
->get_fs_stats(stats
, data_pools
[0], &cond
);
10129 objecter
->get_fs_stats(stats
, boost::optional
<int64_t>(), &cond
);
10132 client_lock
.unlock();
10133 int rval
= cond
.wait();
10135 total_files_on_fs
= root
->rstat
.rfiles
+ root
->rstat
.rsubdirs
;
10136 client_lock
.lock();
10139 ldout(cct
, 1) << "underlying call to statfs returned error: "
10140 << cpp_strerror(rval
)
10145 memset(stbuf
, 0, sizeof(*stbuf
));
10148 * we're going to set a block size of 4MB so we can represent larger
10149 * FSes without overflowing. Additionally convert the space
10150 * measurements from KB to bytes while making them in terms of
10151 * blocks. We use 4MB only because it is big enough, and because it
10152 * actually *is* the (ceph) default block size.
10154 const int CEPH_BLOCK_SHIFT
= 22;
10155 stbuf
->f_frsize
= 1 << CEPH_BLOCK_SHIFT
;
10156 stbuf
->f_bsize
= 1 << CEPH_BLOCK_SHIFT
;
10157 stbuf
->f_files
= total_files_on_fs
;
10158 stbuf
->f_ffree
= 0;
10159 stbuf
->f_favail
= -1;
10160 stbuf
->f_fsid
= -1; // ??
10161 stbuf
->f_flag
= 0; // ??
10162 stbuf
->f_namemax
= NAME_MAX
;
10164 // Usually quota_root will == root_ancestor, but if the mount root has no
10165 // quota but we can see a parent of it that does have a quota, we'll
10166 // respect that one instead.
10167 ceph_assert(root
!= nullptr);
10168 Inode
*quota_root
= root
->quota
.is_enable() ? root
: get_quota_root(root
, perms
);
10170 // get_quota_root should always give us something
10171 // because client quotas are always enabled
10172 ceph_assert(quota_root
!= nullptr);
10174 if (quota_root
&& cct
->_conf
->client_quota_df
&& quota_root
->quota
.max_bytes
) {
10176 // Skip the getattr if any sessions are stale, as we don't want to
10177 // block `df` if this client has e.g. been evicted, or if the MDS cluster
10179 if (!_any_stale_sessions()) {
10180 int r
= _getattr(quota_root
, 0, perms
, true);
10182 // Ignore return value: error getting latest inode metadata is not a good
10183 // reason to break "df".
10184 lderr(cct
) << "Error in getattr on quota root 0x"
10185 << std::hex
<< quota_root
->ino
<< std::dec
10186 << " statfs result may be outdated" << dendl
;
10190 // Special case: if there is a size quota set on the Inode acting
10191 // as the root for this client mount, then report the quota status
10192 // as the filesystem statistics.
10193 const fsblkcnt_t total
= quota_root
->quota
.max_bytes
>> CEPH_BLOCK_SHIFT
;
10194 const fsblkcnt_t used
= quota_root
->rstat
.rbytes
>> CEPH_BLOCK_SHIFT
;
10195 // It is possible for a quota to be exceeded: arithmetic here must
10196 // handle case where used > total.
10197 const fsblkcnt_t free
= total
> used
? total
- used
: 0;
10199 stbuf
->f_blocks
= total
;
10200 stbuf
->f_bfree
= free
;
10201 stbuf
->f_bavail
= free
;
10203 // General case: report the cluster statistics returned from RADOS. Because
10204 // multiple pools may be used without one filesystem namespace via
10205 // layouts, this is the most correct thing we can do.
10206 stbuf
->f_blocks
= stats
.kb
>> (CEPH_BLOCK_SHIFT
- 10);
10207 stbuf
->f_bfree
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
10208 stbuf
->f_bavail
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
10214 int Client::_do_filelock(Inode
*in
, Fh
*fh
, int lock_type
, int op
, int sleep
,
10215 struct flock
*fl
, uint64_t owner
, bool removing
)
10217 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
10218 << (lock_type
== CEPH_LOCK_FCNTL
? " fcntl" : " flock")
10219 << " type " << fl
->l_type
<< " owner " << owner
10220 << " " << fl
->l_start
<< "~" << fl
->l_len
<< dendl
;
10222 if (in
->flags
& I_ERROR_FILELOCK
)
10226 if (F_RDLCK
== fl
->l_type
)
10227 lock_cmd
= CEPH_LOCK_SHARED
;
10228 else if (F_WRLCK
== fl
->l_type
)
10229 lock_cmd
= CEPH_LOCK_EXCL
;
10230 else if (F_UNLCK
== fl
->l_type
)
10231 lock_cmd
= CEPH_LOCK_UNLOCK
;
10235 if (op
!= CEPH_MDS_OP_SETFILELOCK
|| lock_cmd
== CEPH_LOCK_UNLOCK
)
10239 * Set the most significant bit, so that MDS knows the 'owner'
10240 * is sufficient to identify the owner of lock. (old code uses
10241 * both 'owner' and 'pid')
10243 owner
|= (1ULL << 63);
10245 MetaRequest
*req
= new MetaRequest(op
);
10247 in
->make_nosnap_relative_path(path
);
10248 req
->set_filepath(path
);
10249 req
->set_inode(in
);
10251 req
->head
.args
.filelock_change
.rule
= lock_type
;
10252 req
->head
.args
.filelock_change
.type
= lock_cmd
;
10253 req
->head
.args
.filelock_change
.owner
= owner
;
10254 req
->head
.args
.filelock_change
.pid
= fl
->l_pid
;
10255 req
->head
.args
.filelock_change
.start
= fl
->l_start
;
10256 req
->head
.args
.filelock_change
.length
= fl
->l_len
;
10257 req
->head
.args
.filelock_change
.wait
= sleep
;
10262 if (sleep
&& switch_interrupt_cb
) {
10263 // enable interrupt
10264 switch_interrupt_cb(callback_handle
, req
->get());
10265 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
10266 // disable interrupt
10267 switch_interrupt_cb(callback_handle
, NULL
);
10268 if (ret
== 0 && req
->aborted()) {
10269 // effect of this lock request has been revoked by the 'lock intr' request
10270 ret
= req
->get_abort_code();
10274 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
10278 if (op
== CEPH_MDS_OP_GETFILELOCK
) {
10279 ceph_filelock filelock
;
10280 auto p
= bl
.cbegin();
10281 decode(filelock
, p
);
10283 if (CEPH_LOCK_SHARED
== filelock
.type
)
10284 fl
->l_type
= F_RDLCK
;
10285 else if (CEPH_LOCK_EXCL
== filelock
.type
)
10286 fl
->l_type
= F_WRLCK
;
10288 fl
->l_type
= F_UNLCK
;
10290 fl
->l_whence
= SEEK_SET
;
10291 fl
->l_start
= filelock
.start
;
10292 fl
->l_len
= filelock
.length
;
10293 fl
->l_pid
= filelock
.pid
;
10294 } else if (op
== CEPH_MDS_OP_SETFILELOCK
) {
10295 ceph_lock_state_t
*lock_state
;
10296 if (lock_type
== CEPH_LOCK_FCNTL
) {
10297 if (!in
->fcntl_locks
)
10298 in
->fcntl_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
));
10299 lock_state
= in
->fcntl_locks
.get();
10300 } else if (lock_type
== CEPH_LOCK_FLOCK
) {
10301 if (!in
->flock_locks
)
10302 in
->flock_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
));
10303 lock_state
= in
->flock_locks
.get();
10308 _update_lock_state(fl
, owner
, lock_state
);
10311 if (lock_type
== CEPH_LOCK_FCNTL
) {
10312 if (!fh
->fcntl_locks
)
10313 fh
->fcntl_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
));
10314 lock_state
= fh
->fcntl_locks
.get();
10316 if (!fh
->flock_locks
)
10317 fh
->flock_locks
.reset(new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
));
10318 lock_state
= fh
->flock_locks
.get();
10320 _update_lock_state(fl
, owner
, lock_state
);
10328 int Client::_interrupt_filelock(MetaRequest
*req
)
10330 // Set abort code, but do not kick. The abort code prevents the request
10331 // from being re-sent.
10332 req
->abort(-EINTR
);
10334 return 0; // haven't sent the request
10336 Inode
*in
= req
->inode();
10339 if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FLOCK
)
10340 lock_type
= CEPH_LOCK_FLOCK_INTR
;
10341 else if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FCNTL
)
10342 lock_type
= CEPH_LOCK_FCNTL_INTR
;
10348 MetaRequest
*intr_req
= new MetaRequest(CEPH_MDS_OP_SETFILELOCK
);
10350 in
->make_nosnap_relative_path(path
);
10351 intr_req
->set_filepath(path
);
10352 intr_req
->set_inode(in
);
10353 intr_req
->head
.args
.filelock_change
= req
->head
.args
.filelock_change
;
10354 intr_req
->head
.args
.filelock_change
.rule
= lock_type
;
10355 intr_req
->head
.args
.filelock_change
.type
= CEPH_LOCK_UNLOCK
;
10357 UserPerm
perms(req
->get_uid(), req
->get_gid());
10358 return make_request(intr_req
, perms
, NULL
, NULL
, -1);
10361 void Client::_encode_filelocks(Inode
*in
, bufferlist
& bl
)
10363 if (!in
->fcntl_locks
&& !in
->flock_locks
)
10366 unsigned nr_fcntl_locks
= in
->fcntl_locks
? in
->fcntl_locks
->held_locks
.size() : 0;
10367 encode(nr_fcntl_locks
, bl
);
10368 if (nr_fcntl_locks
) {
10369 auto &lock_state
= in
->fcntl_locks
;
10370 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
10371 p
!= lock_state
->held_locks
.end();
10373 encode(p
->second
, bl
);
10376 unsigned nr_flock_locks
= in
->flock_locks
? in
->flock_locks
->held_locks
.size() : 0;
10377 encode(nr_flock_locks
, bl
);
10378 if (nr_flock_locks
) {
10379 auto &lock_state
= in
->flock_locks
;
10380 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
10381 p
!= lock_state
->held_locks
.end();
10383 encode(p
->second
, bl
);
10386 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< ", " << nr_fcntl_locks
10387 << " fcntl locks, " << nr_flock_locks
<< " flock locks" << dendl
;
10390 void Client::_release_filelocks(Fh
*fh
)
10392 if (!fh
->fcntl_locks
&& !fh
->flock_locks
)
10395 Inode
*in
= fh
->inode
.get();
10396 ldout(cct
, 10) << __func__
<< " " << fh
<< " ino " << in
->ino
<< dendl
;
10398 list
<ceph_filelock
> activated_locks
;
10400 list
<pair
<int, ceph_filelock
> > to_release
;
10402 if (fh
->fcntl_locks
) {
10403 auto &lock_state
= fh
->fcntl_locks
;
10404 for(auto p
= lock_state
->held_locks
.begin(); p
!= lock_state
->held_locks
.end(); ) {
10406 if (in
->flags
& I_ERROR_FILELOCK
) {
10407 lock_state
->remove_lock(q
->second
, activated_locks
);
10409 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FCNTL
, q
->second
));
10412 lock_state
.reset();
10414 if (fh
->flock_locks
) {
10415 auto &lock_state
= fh
->flock_locks
;
10416 for(auto p
= lock_state
->held_locks
.begin(); p
!= lock_state
->held_locks
.end(); ) {
10418 if (in
->flags
& I_ERROR_FILELOCK
) {
10419 lock_state
->remove_lock(q
->second
, activated_locks
);
10421 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FLOCK
, q
->second
));
10424 lock_state
.reset();
10427 if ((in
->flags
& I_ERROR_FILELOCK
) && !in
->has_any_filelocks())
10428 in
->flags
&= ~I_ERROR_FILELOCK
;
10430 if (to_release
.empty())
10434 memset(&fl
, 0, sizeof(fl
));
10435 fl
.l_whence
= SEEK_SET
;
10436 fl
.l_type
= F_UNLCK
;
10438 for (list
<pair
<int, ceph_filelock
> >::iterator p
= to_release
.begin();
10439 p
!= to_release
.end();
10441 fl
.l_start
= p
->second
.start
;
10442 fl
.l_len
= p
->second
.length
;
10443 fl
.l_pid
= p
->second
.pid
;
10444 _do_filelock(in
, fh
, p
->first
, CEPH_MDS_OP_SETFILELOCK
, 0, &fl
,
10445 p
->second
.owner
, true);
10449 void Client::_update_lock_state(struct flock
*fl
, uint64_t owner
,
10450 ceph_lock_state_t
*lock_state
)
10453 if (F_RDLCK
== fl
->l_type
)
10454 lock_cmd
= CEPH_LOCK_SHARED
;
10455 else if (F_WRLCK
== fl
->l_type
)
10456 lock_cmd
= CEPH_LOCK_EXCL
;
10458 lock_cmd
= CEPH_LOCK_UNLOCK
;;
10460 ceph_filelock filelock
;
10461 filelock
.start
= fl
->l_start
;
10462 filelock
.length
= fl
->l_len
;
10463 filelock
.client
= 0;
10464 // see comment in _do_filelock()
10465 filelock
.owner
= owner
| (1ULL << 63);
10466 filelock
.pid
= fl
->l_pid
;
10467 filelock
.type
= lock_cmd
;
10469 if (filelock
.type
== CEPH_LOCK_UNLOCK
) {
10470 list
<ceph_filelock
> activated_locks
;
10471 lock_state
->remove_lock(filelock
, activated_locks
);
10473 bool r
= lock_state
->add_lock(filelock
, false, false, NULL
);
10478 int Client::_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
10480 Inode
*in
= fh
->inode
.get();
10481 ldout(cct
, 10) << "_getlk " << fh
<< " ino " << in
->ino
<< dendl
;
10482 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_GETFILELOCK
, 0, fl
, owner
);
10486 int Client::_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
10488 Inode
*in
= fh
->inode
.get();
10489 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< dendl
;
10490 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_SETFILELOCK
, sleep
, fl
, owner
);
10491 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
10495 int Client::_flock(Fh
*fh
, int cmd
, uint64_t owner
)
10497 Inode
*in
= fh
->inode
.get();
10498 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< dendl
;
10500 int sleep
= !(cmd
& LOCK_NB
);
10519 memset(&fl
, 0, sizeof(fl
));
10521 fl
.l_whence
= SEEK_SET
;
10523 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FLOCK
, CEPH_MDS_OP_SETFILELOCK
, sleep
, &fl
, owner
);
10524 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
10528 int Client::ll_statfs(Inode
*in
, struct statvfs
*stbuf
, const UserPerm
& perms
)
10530 /* Since the only thing this does is wrap a call to statfs, and
10531 statfs takes a lock, it doesn't seem we have a need to split it
10533 return statfs(0, stbuf
, perms
);
10536 void Client::ll_register_callbacks(struct ceph_client_callback_args
*args
)
10540 std::lock_guard
l(client_lock
);
10541 ldout(cct
, 10) << __func__
<< " cb " << args
->handle
10542 << " invalidate_ino_cb " << args
->ino_cb
10543 << " invalidate_dentry_cb " << args
->dentry_cb
10544 << " switch_interrupt_cb " << args
->switch_intr_cb
10545 << " remount_cb " << args
->remount_cb
10547 callback_handle
= args
->handle
;
10548 if (args
->ino_cb
) {
10549 ino_invalidate_cb
= args
->ino_cb
;
10550 async_ino_invalidator
.start();
10552 if (args
->dentry_cb
) {
10553 dentry_invalidate_cb
= args
->dentry_cb
;
10554 async_dentry_invalidator
.start();
10556 if (args
->switch_intr_cb
) {
10557 switch_interrupt_cb
= args
->switch_intr_cb
;
10558 interrupt_finisher
.start();
10560 if (args
->remount_cb
) {
10561 remount_cb
= args
->remount_cb
;
10562 remount_finisher
.start();
10564 if (args
->ino_release_cb
) {
10565 ino_release_cb
= args
->ino_release_cb
;
10566 async_ino_releasor
.start();
10568 if (args
->umask_cb
)
10569 umask_cb
= args
->umask_cb
;
10572 int Client::test_dentry_handling(bool can_invalidate
)
10576 can_invalidate_dentries
= can_invalidate
;
10578 if (can_invalidate_dentries
) {
10579 ceph_assert(dentry_invalidate_cb
);
10580 ldout(cct
, 1) << "using dentry_invalidate_cb" << dendl
;
10583 ceph_assert(remount_cb
);
10584 ldout(cct
, 1) << "using remount_cb" << dendl
;
10585 r
= _do_remount(false);
10591 int Client::_sync_fs()
10593 ldout(cct
, 10) << __func__
<< dendl
;
10596 std::unique_ptr
<C_SaferCond
> cond
= nullptr;
10597 if (cct
->_conf
->client_oc
) {
10598 cond
.reset(new C_SaferCond("Client::_sync_fs:lock"));
10599 objectcacher
->flush_all(cond
.get());
10604 ceph_tid_t flush_tid
= last_flush_tid
;
10606 // wait for unsafe mds requests
10607 wait_unsafe_requests();
10609 wait_sync_caps(flush_tid
);
10611 if (nullptr != cond
) {
10612 client_lock
.unlock();
10613 ldout(cct
, 15) << __func__
<< " waiting on data to flush" << dendl
;
10615 ldout(cct
, 15) << __func__
<< " flush finished" << dendl
;
10616 client_lock
.lock();
10622 int Client::sync_fs()
10624 std::lock_guard
l(client_lock
);
10632 int64_t Client::drop_caches()
10634 std::lock_guard
l(client_lock
);
10635 return objectcacher
->release_all();
10638 int Client::_lazyio(Fh
*fh
, int enable
)
10640 Inode
*in
= fh
->inode
.get();
10641 ldout(cct
, 20) << __func__
<< " " << *in
<< " " << !!enable
<< dendl
;
10643 if (!!(fh
->mode
& CEPH_FILE_MODE_LAZY
) == !!enable
)
10646 int orig_mode
= fh
->mode
;
10648 fh
->mode
|= CEPH_FILE_MODE_LAZY
;
10649 in
->get_open_ref(fh
->mode
);
10650 in
->put_open_ref(orig_mode
);
10651 check_caps(in
, CHECK_CAPS_NODELAY
);
10653 fh
->mode
&= ~CEPH_FILE_MODE_LAZY
;
10654 in
->get_open_ref(fh
->mode
);
10655 in
->put_open_ref(orig_mode
);
10662 int Client::lazyio(int fd
, int enable
)
10664 std::lock_guard
l(client_lock
);
10665 Fh
*f
= get_filehandle(fd
);
10669 return _lazyio(f
, enable
);
10672 int Client::ll_lazyio(Fh
*fh
, int enable
)
10674 std::lock_guard
lock(client_lock
);
10675 ldout(cct
, 3) << __func__
<< " " << fh
<< " " << fh
->inode
->ino
<< " " << !!enable
<< dendl
;
10676 tout(cct
) << __func__
<< std::endl
;
10678 return _lazyio(fh
, enable
);
10681 int Client::lazyio_propagate(int fd
, loff_t offset
, size_t count
)
10683 std::lock_guard
l(client_lock
);
10684 ldout(cct
, 3) << "op: client->lazyio_propagate(" << fd
10685 << ", " << offset
<< ", " << count
<< ")" << dendl
;
10687 Fh
*f
= get_filehandle(fd
);
10697 int Client::lazyio_synchronize(int fd
, loff_t offset
, size_t count
)
10699 std::lock_guard
l(client_lock
);
10700 ldout(cct
, 3) << "op: client->lazyio_synchronize(" << fd
10701 << ", " << offset
<< ", " << count
<< ")" << dendl
;
10703 Fh
*f
= get_filehandle(fd
);
10706 Inode
*in
= f
->inode
.get();
10709 if (_release(in
)) {
10710 int r
=_getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
10718 // =============================
10721 int Client::mksnap(const char *relpath
, const char *name
, const UserPerm
& perm
)
10723 std::lock_guard
l(client_lock
);
10728 filepath
path(relpath
);
10730 int r
= path_walk(path
, &in
, perm
);
10733 if (cct
->_conf
->client_permissions
) {
10734 r
= may_create(in
.get(), perm
);
10738 Inode
*snapdir
= open_snapdir(in
.get());
10739 return _mkdir(snapdir
, name
, 0, perm
);
10742 int Client::rmsnap(const char *relpath
, const char *name
, const UserPerm
& perms
)
10744 std::lock_guard
l(client_lock
);
10749 filepath
path(relpath
);
10751 int r
= path_walk(path
, &in
, perms
);
10754 if (cct
->_conf
->client_permissions
) {
10755 r
= may_delete(in
.get(), NULL
, perms
);
10759 Inode
*snapdir
= open_snapdir(in
.get());
10760 return _rmdir(snapdir
, name
, perms
);
10763 // =============================
10766 int Client::get_caps_issued(int fd
) {
10768 std::lock_guard
lock(client_lock
);
10773 Fh
*f
= get_filehandle(fd
);
10777 return f
->inode
->caps_issued();
10780 int Client::get_caps_issued(const char *path
, const UserPerm
& perms
)
10782 std::lock_guard
lock(client_lock
);
10789 int r
= path_walk(p
, &in
, perms
, true);
10792 return in
->caps_issued();
10795 // =========================================
10798 Inode
*Client::open_snapdir(Inode
*diri
)
10801 vinodeno_t
vino(diri
->ino
, CEPH_SNAPDIR
);
10802 if (!inode_map
.count(vino
)) {
10803 in
= new Inode(this, vino
, &diri
->layout
);
10805 in
->ino
= diri
->ino
;
10806 in
->snapid
= CEPH_SNAPDIR
;
10807 in
->mode
= diri
->mode
;
10808 in
->uid
= diri
->uid
;
10809 in
->gid
= diri
->gid
;
10811 in
->mtime
= diri
->mtime
;
10812 in
->ctime
= diri
->ctime
;
10813 in
->btime
= diri
->btime
;
10814 in
->atime
= diri
->atime
;
10815 in
->size
= diri
->size
;
10816 in
->change_attr
= diri
->change_attr
;
10818 in
->dirfragtree
.clear();
10819 in
->snapdir_parent
= diri
;
10820 diri
->flags
|= I_SNAPDIR_OPEN
;
10821 inode_map
[vino
] = in
;
10822 if (use_faked_inos())
10823 _assign_faked_ino(in
);
10824 ldout(cct
, 10) << "open_snapdir created snapshot inode " << *in
<< dendl
;
10826 in
= inode_map
[vino
];
10827 ldout(cct
, 10) << "open_snapdir had snapshot inode " << *in
<< dendl
;
10832 int Client::ll_lookup(Inode
*parent
, const char *name
, struct stat
*attr
,
10833 Inode
**out
, const UserPerm
& perms
)
10835 std::lock_guard
lock(client_lock
);
10836 vinodeno_t vparent
= _get_vino(parent
);
10837 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
<< dendl
;
10838 tout(cct
) << __func__
<< std::endl
;
10839 tout(cct
) << name
<< std::endl
;
10845 if (!fuse_default_permissions
) {
10846 if (strcmp(name
, ".") && strcmp(name
, "..")) {
10847 r
= may_lookup(parent
, perms
);
10853 string
dname(name
);
10856 r
= _lookup(parent
, dname
, CEPH_STAT_CAP_INODE_ALL
, &in
, perms
);
10863 fill_stat(in
, attr
);
10867 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
10868 << " -> " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
10869 tout(cct
) << attr
->st_ino
<< std::endl
;
10874 int Client::ll_lookup_inode(
10875 struct inodeno_t ino
,
10876 const UserPerm
& perms
,
10879 ceph_assert(inode
!= NULL
);
10880 std::lock_guard
lock(client_lock
);
10881 ldout(cct
, 3) << "ll_lookup_inode " << ino
<< dendl
;
10886 // Num1: get inode and *inode
10887 int r
= _lookup_ino(ino
, perms
, inode
);
10891 ceph_assert(*inode
!= NULL
);
10893 if (!(*inode
)->dentries
.empty()) {
10894 ldout(cct
, 8) << __func__
<< " dentry already present" << dendl
;
10898 if ((*inode
)->is_root()) {
10899 ldout(cct
, 8) << "ino is root, no parent" << dendl
;
10903 // Num2: Request the parent inode, so that we can look up the name
10905 r
= _lookup_parent(*inode
, perms
, &parent
);
10907 _ll_forget(*inode
, 1);
10911 ceph_assert(parent
!= NULL
);
10913 // Num3: Finally, get the name (dentry) of the requested inode
10914 r
= _lookup_name(*inode
, parent
, perms
);
10916 // Unexpected error
10917 _ll_forget(parent
, 1);
10918 _ll_forget(*inode
, 1);
10922 _ll_forget(parent
, 1);
10926 int Client::ll_lookupx(Inode
*parent
, const char *name
, Inode
**out
,
10927 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
10928 const UserPerm
& perms
)
10930 std::lock_guard
lock(client_lock
);
10931 vinodeno_t vparent
= _get_vino(parent
);
10932 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
<< dendl
;
10933 tout(cct
) << "ll_lookupx" << std::endl
;
10934 tout(cct
) << name
<< std::endl
;
10940 if (!fuse_default_permissions
) {
10941 r
= may_lookup(parent
, perms
);
10946 string
dname(name
);
10949 unsigned mask
= statx_to_mask(flags
, want
);
10950 r
= _lookup(parent
, dname
, mask
, &in
, perms
);
10956 fill_statx(in
, mask
, stx
);
10960 ldout(cct
, 3) << __func__
<< " " << vparent
<< " " << name
10961 << " -> " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
10962 tout(cct
) << stx
->stx_ino
<< std::endl
;
10967 int Client::ll_walk(const char* name
, Inode
**out
, struct ceph_statx
*stx
,
10968 unsigned int want
, unsigned int flags
, const UserPerm
& perms
)
10970 std::lock_guard
lock(client_lock
);
10975 filepath
fp(name
, 0);
10978 unsigned mask
= statx_to_mask(flags
, want
);
10980 ldout(cct
, 3) << __func__
<< " " << name
<< dendl
;
10981 tout(cct
) << __func__
<< std::endl
;
10982 tout(cct
) << name
<< std::endl
;
10984 rc
= path_walk(fp
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
);
10986 /* zero out mask, just in case... */
10993 fill_statx(in
, mask
, stx
);
11000 void Client::_ll_get(Inode
*in
)
11002 if (in
->ll_ref
== 0) {
11004 if (in
->is_dir() && !in
->dentries
.empty()) {
11005 ceph_assert(in
->dentries
.size() == 1); // dirs can't be hard-linked
11006 in
->get_first_parent()->get(); // pin dentry
11008 if (in
->snapid
!= CEPH_NOSNAP
)
11009 ll_snap_ref
[in
->snapid
]++;
11012 ldout(cct
, 20) << __func__
<< " " << in
<< " " << in
->ino
<< " -> " << in
->ll_ref
<< dendl
;
11015 int Client::_ll_put(Inode
*in
, uint64_t num
)
11018 ldout(cct
, 20) << __func__
<< " " << in
<< " " << in
->ino
<< " " << num
<< " -> " << in
->ll_ref
<< dendl
;
11019 if (in
->ll_ref
== 0) {
11020 if (in
->is_dir() && !in
->dentries
.empty()) {
11021 ceph_assert(in
->dentries
.size() == 1); // dirs can't be hard-linked
11022 in
->get_first_parent()->put(); // unpin dentry
11024 if (in
->snapid
!= CEPH_NOSNAP
) {
11025 auto p
= ll_snap_ref
.find(in
->snapid
);
11026 ceph_assert(p
!= ll_snap_ref
.end());
11027 ceph_assert(p
->second
> 0);
11028 if (--p
->second
== 0)
11029 ll_snap_ref
.erase(p
);
11038 void Client::_ll_drop_pins()
11040 ldout(cct
, 10) << __func__
<< dendl
;
11041 std::set
<InodeRef
> to_be_put
; //this set will be deconstructed item by item when exit
11042 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator next
;
11043 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
11044 it
!= inode_map
.end();
11046 Inode
*in
= it
->second
;
11050 to_be_put
.insert(in
);
11051 _ll_put(in
, in
->ll_ref
);
11056 bool Client::_ll_forget(Inode
*in
, uint64_t count
)
11058 inodeno_t ino
= in
->ino
;
11060 ldout(cct
, 8) << __func__
<< " " << ino
<< " " << count
<< dendl
;
11061 tout(cct
) << __func__
<< std::endl
;
11062 tout(cct
) << ino
.val
<< std::endl
;
11063 tout(cct
) << count
<< std::endl
;
11065 // Ignore forget if we're no longer mounted
11069 if (ino
== 1) return true; // ignore forget on root.
11072 if (in
->ll_ref
< count
) {
11073 ldout(cct
, 1) << "WARNING: ll_forget on " << ino
<< " " << count
11074 << ", which only has ll_ref=" << in
->ll_ref
<< dendl
;
11075 _ll_put(in
, in
->ll_ref
);
11078 if (_ll_put(in
, count
) == 0)
11085 bool Client::ll_forget(Inode
*in
, uint64_t count
)
11087 std::lock_guard
lock(client_lock
);
11088 return _ll_forget(in
, count
);
11091 bool Client::ll_put(Inode
*in
)
11093 /* ll_forget already takes the lock */
11094 return ll_forget(in
, 1);
11097 int Client::ll_get_snap_ref(snapid_t snap
)
11099 std::lock_guard
lock(client_lock
);
11100 auto p
= ll_snap_ref
.find(snap
);
11101 if (p
!= ll_snap_ref
.end())
11106 snapid_t
Client::ll_get_snapid(Inode
*in
)
11108 std::lock_guard
lock(client_lock
);
11112 Inode
*Client::ll_get_inode(ino_t ino
)
11114 std::lock_guard
lock(client_lock
);
11119 vinodeno_t vino
= _map_faked_ino(ino
);
11120 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
11121 if (p
== inode_map
.end())
11123 Inode
*in
= p
->second
;
11128 Inode
*Client::ll_get_inode(vinodeno_t vino
)
11130 std::lock_guard
lock(client_lock
);
11135 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
11136 if (p
== inode_map
.end())
11138 Inode
*in
= p
->second
;
11143 int Client::_ll_getattr(Inode
*in
, int caps
, const UserPerm
& perms
)
11145 vinodeno_t vino
= _get_vino(in
);
11147 ldout(cct
, 8) << __func__
<< " " << vino
<< dendl
;
11148 tout(cct
) << __func__
<< std::endl
;
11149 tout(cct
) << vino
.ino
.val
<< std::endl
;
11151 if (vino
.snapid
< CEPH_NOSNAP
)
11154 return _getattr(in
, caps
, perms
);
11157 int Client::ll_getattr(Inode
*in
, struct stat
*attr
, const UserPerm
& perms
)
11159 std::lock_guard
lock(client_lock
);
11164 int res
= _ll_getattr(in
, CEPH_STAT_CAP_INODE_ALL
, perms
);
11167 fill_stat(in
, attr
);
11168 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
11172 int Client::ll_getattrx(Inode
*in
, struct ceph_statx
*stx
, unsigned int want
,
11173 unsigned int flags
, const UserPerm
& perms
)
11175 std::lock_guard
lock(client_lock
);
11181 unsigned mask
= statx_to_mask(flags
, want
);
11183 if (mask
&& !in
->caps_issued_mask(mask
, true))
11184 res
= _ll_getattr(in
, mask
, perms
);
11187 fill_statx(in
, mask
, stx
);
11188 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
11192 int Client::_ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
11193 const UserPerm
& perms
, InodeRef
*inp
)
11195 vinodeno_t vino
= _get_vino(in
);
11197 ldout(cct
, 8) << __func__
<< " " << vino
<< " mask " << hex
<< mask
<< dec
11199 tout(cct
) << __func__
<< std::endl
;
11200 tout(cct
) << vino
.ino
.val
<< std::endl
;
11201 tout(cct
) << stx
->stx_mode
<< std::endl
;
11202 tout(cct
) << stx
->stx_uid
<< std::endl
;
11203 tout(cct
) << stx
->stx_gid
<< std::endl
;
11204 tout(cct
) << stx
->stx_size
<< std::endl
;
11205 tout(cct
) << stx
->stx_mtime
<< std::endl
;
11206 tout(cct
) << stx
->stx_atime
<< std::endl
;
11207 tout(cct
) << stx
->stx_btime
<< std::endl
;
11208 tout(cct
) << mask
<< std::endl
;
11210 if (!fuse_default_permissions
) {
11211 int res
= may_setattr(in
, stx
, mask
, perms
);
11216 mask
&= ~(CEPH_SETATTR_MTIME_NOW
| CEPH_SETATTR_ATIME_NOW
);
11218 return __setattrx(in
, stx
, mask
, perms
, inp
);
11221 int Client::ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
11222 const UserPerm
& perms
)
11224 std::lock_guard
lock(client_lock
);
11229 InodeRef
target(in
);
11230 int res
= _ll_setattrx(in
, stx
, mask
, perms
, &target
);
11232 ceph_assert(in
== target
.get());
11233 fill_statx(in
, in
->caps_issued(), stx
);
11236 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
11240 int Client::ll_setattr(Inode
*in
, struct stat
*attr
, int mask
,
11241 const UserPerm
& perms
)
11243 struct ceph_statx stx
;
11244 stat_to_statx(attr
, &stx
);
11246 std::lock_guard
lock(client_lock
);
11251 InodeRef
target(in
);
11252 int res
= _ll_setattrx(in
, &stx
, mask
, perms
, &target
);
11254 ceph_assert(in
== target
.get());
11255 fill_stat(in
, attr
);
11258 ldout(cct
, 3) << __func__
<< " " << _get_vino(in
) << " = " << res
<< dendl
;
11266 int Client::getxattr(const char *path
, const char *name
, void *value
, size_t size
,
11267 const UserPerm
& perms
)
11269 std::lock_guard
lock(client_lock
);
11275 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
11278 return _getxattr(in
, name
, value
, size
, perms
);
11281 int Client::lgetxattr(const char *path
, const char *name
, void *value
, size_t size
,
11282 const UserPerm
& perms
)
11284 std::lock_guard
lock(client_lock
);
11290 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
11293 return _getxattr(in
, name
, value
, size
, perms
);
11296 int Client::fgetxattr(int fd
, const char *name
, void *value
, size_t size
,
11297 const UserPerm
& perms
)
11299 std::lock_guard
lock(client_lock
);
11304 Fh
*f
= get_filehandle(fd
);
11307 return _getxattr(f
->inode
, name
, value
, size
, perms
);
11310 int Client::listxattr(const char *path
, char *list
, size_t size
,
11311 const UserPerm
& perms
)
11313 std::lock_guard
lock(client_lock
);
11319 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
11322 return Client::_listxattr(in
.get(), list
, size
, perms
);
11325 int Client::llistxattr(const char *path
, char *list
, size_t size
,
11326 const UserPerm
& perms
)
11328 std::lock_guard
lock(client_lock
);
11334 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
11337 return Client::_listxattr(in
.get(), list
, size
, perms
);
11340 int Client::flistxattr(int fd
, char *list
, size_t size
, const UserPerm
& perms
)
11342 std::lock_guard
lock(client_lock
);
11347 Fh
*f
= get_filehandle(fd
);
11350 return Client::_listxattr(f
->inode
.get(), list
, size
, perms
);
11353 int Client::removexattr(const char *path
, const char *name
,
11354 const UserPerm
& perms
)
11356 std::lock_guard
lock(client_lock
);
11362 int r
= Client::path_walk(path
, &in
, perms
, true);
11365 return _removexattr(in
, name
, perms
);
11368 int Client::lremovexattr(const char *path
, const char *name
,
11369 const UserPerm
& perms
)
11371 std::lock_guard
lock(client_lock
);
11377 int r
= Client::path_walk(path
, &in
, perms
, false);
11380 return _removexattr(in
, name
, perms
);
11383 int Client::fremovexattr(int fd
, const char *name
, const UserPerm
& perms
)
11385 std::lock_guard
lock(client_lock
);
11390 Fh
*f
= get_filehandle(fd
);
11393 return _removexattr(f
->inode
, name
, perms
);
11396 int Client::setxattr(const char *path
, const char *name
, const void *value
,
11397 size_t size
, int flags
, const UserPerm
& perms
)
11399 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11401 std::lock_guard
lock(client_lock
);
11407 int r
= Client::path_walk(path
, &in
, perms
, true);
11410 return _setxattr(in
, name
, value
, size
, flags
, perms
);
11413 int Client::lsetxattr(const char *path
, const char *name
, const void *value
,
11414 size_t size
, int flags
, const UserPerm
& perms
)
11416 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11418 std::lock_guard
lock(client_lock
);
11424 int r
= Client::path_walk(path
, &in
, perms
, false);
11427 return _setxattr(in
, name
, value
, size
, flags
, perms
);
11430 int Client::fsetxattr(int fd
, const char *name
, const void *value
, size_t size
,
11431 int flags
, const UserPerm
& perms
)
11433 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11435 std::lock_guard
lock(client_lock
);
11440 Fh
*f
= get_filehandle(fd
);
11443 return _setxattr(f
->inode
, name
, value
, size
, flags
, perms
);
11446 int Client::_getxattr(Inode
*in
, const char *name
, void *value
, size_t size
,
11447 const UserPerm
& perms
)
11451 const VXattr
*vxattr
= _match_vxattr(in
, name
);
11455 // Do a force getattr to get the latest quota before returning
11456 // a value to userspace.
11458 if (vxattr
->flags
& VXATTR_RSTAT
) {
11459 flags
|= CEPH_STAT_RSTAT
;
11461 r
= _getattr(in
, flags
, perms
, true);
11463 // Error from getattr!
11467 // call pointer-to-member function
11469 if (!(vxattr
->exists_cb
&& !(this->*(vxattr
->exists_cb
))(in
))) {
11470 r
= (this->*(vxattr
->getxattr_cb
))(in
, buf
, sizeof(buf
));
11476 if (r
> (int)size
) {
11478 } else if (r
> 0) {
11479 memcpy(value
, buf
, r
);
11485 if (acl_type
== NO_ACL
&& !strncmp(name
, "system.", 7)) {
11490 r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
11494 if (in
->xattrs
.count(n
)) {
11495 r
= in
->xattrs
[n
].length();
11496 if (r
> 0 && size
!= 0) {
11497 if (size
>= (unsigned)r
)
11498 memcpy(value
, in
->xattrs
[n
].c_str(), r
);
11505 ldout(cct
, 8) << "_getxattr(" << in
->ino
<< ", \"" << name
<< "\", " << size
<< ") = " << r
<< dendl
;
11509 int Client::_getxattr(InodeRef
&in
, const char *name
, void *value
, size_t size
,
11510 const UserPerm
& perms
)
11512 if (cct
->_conf
->client_permissions
) {
11513 int r
= xattr_permission(in
.get(), name
, MAY_READ
, perms
);
11517 return _getxattr(in
.get(), name
, value
, size
, perms
);
11520 int Client::ll_getxattr(Inode
*in
, const char *name
, void *value
,
11521 size_t size
, const UserPerm
& perms
)
11523 std::lock_guard
lock(client_lock
);
11528 vinodeno_t vino
= _get_vino(in
);
11530 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
11531 tout(cct
) << __func__
<< std::endl
;
11532 tout(cct
) << vino
.ino
.val
<< std::endl
;
11533 tout(cct
) << name
<< std::endl
;
11535 if (!fuse_default_permissions
) {
11536 int r
= xattr_permission(in
, name
, MAY_READ
, perms
);
11541 return _getxattr(in
, name
, value
, size
, perms
);
11544 int Client::_listxattr(Inode
*in
, char *name
, size_t size
,
11545 const UserPerm
& perms
)
11547 bool len_only
= (size
== 0);
11548 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
11554 for (const auto& p
: in
->xattrs
) {
11555 size_t this_len
= p
.first
.length() + 1;
11560 if (this_len
> size
) {
11565 memcpy(name
, p
.first
.c_str(), this_len
);
11570 ldout(cct
, 8) << __func__
<< "(" << in
->ino
<< ", " << size
<< ") = " << r
<< dendl
;
11574 int Client::ll_listxattr(Inode
*in
, char *names
, size_t size
,
11575 const UserPerm
& perms
)
11577 std::lock_guard
lock(client_lock
);
11582 vinodeno_t vino
= _get_vino(in
);
11584 ldout(cct
, 3) << __func__
<< " " << vino
<< " size " << size
<< dendl
;
11585 tout(cct
) << __func__
<< std::endl
;
11586 tout(cct
) << vino
.ino
.val
<< std::endl
;
11587 tout(cct
) << size
<< std::endl
;
11589 return _listxattr(in
, names
, size
, perms
);
11592 int Client::_do_setxattr(Inode
*in
, const char *name
, const void *value
,
11593 size_t size
, int flags
, const UserPerm
& perms
)
11596 int xattr_flags
= 0;
11598 xattr_flags
|= CEPH_XATTR_REMOVE
;
11599 if (flags
& XATTR_CREATE
)
11600 xattr_flags
|= CEPH_XATTR_CREATE
;
11601 if (flags
& XATTR_REPLACE
)
11602 xattr_flags
|= CEPH_XATTR_REPLACE
;
11604 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETXATTR
);
11606 in
->make_nosnap_relative_path(path
);
11607 req
->set_filepath(path
);
11608 req
->set_string2(name
);
11609 req
->set_inode(in
);
11610 req
->head
.args
.setxattr
.flags
= xattr_flags
;
11613 assert (value
|| size
== 0);
11614 bl
.append((const char*)value
, size
);
11617 int res
= make_request(req
, perms
);
11620 ldout(cct
, 3) << __func__
<< "(" << in
->ino
<< ", \"" << name
<< "\") = " <<
11625 int Client::_setxattr(Inode
*in
, const char *name
, const void *value
,
11626 size_t size
, int flags
, const UserPerm
& perms
)
11628 if (in
->snapid
!= CEPH_NOSNAP
) {
11634 } else if (value
== NULL
) {
11638 bool posix_acl_xattr
= false;
11639 if (acl_type
== POSIX_ACL
)
11640 posix_acl_xattr
= !strncmp(name
, "system.", 7);
11642 if (strncmp(name
, "user.", 5) &&
11643 strncmp(name
, "security.", 9) &&
11644 strncmp(name
, "trusted.", 8) &&
11645 strncmp(name
, "ceph.", 5) &&
11647 return -EOPNOTSUPP
;
11649 bool check_realm
= false;
11651 if (posix_acl_xattr
) {
11652 if (!strcmp(name
, ACL_EA_ACCESS
)) {
11653 mode_t new_mode
= in
->mode
;
11655 int ret
= posix_acl_equiv_mode(value
, size
, &new_mode
);
11662 if (new_mode
!= in
->mode
) {
11663 struct ceph_statx stx
;
11664 stx
.stx_mode
= new_mode
;
11665 ret
= _do_setattr(in
, &stx
, CEPH_SETATTR_MODE
, perms
, NULL
);
11670 } else if (!strcmp(name
, ACL_EA_DEFAULT
)) {
11672 if (!S_ISDIR(in
->mode
))
11674 int ret
= posix_acl_check(value
, size
);
11683 return -EOPNOTSUPP
;
11686 const VXattr
*vxattr
= _match_vxattr(in
, name
);
11688 if (vxattr
->readonly
)
11689 return -EOPNOTSUPP
;
11690 if (vxattr
->name
.compare(0, 10, "ceph.quota") == 0 && value
)
11691 check_realm
= true;
11695 int ret
= _do_setxattr(in
, name
, value
, size
, flags
, perms
);
11696 if (ret
>= 0 && check_realm
) {
11697 // check if snaprealm was created for quota inode
11698 if (in
->quota
.is_enable() &&
11699 !(in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
))
11706 int Client::_setxattr(InodeRef
&in
, const char *name
, const void *value
,
11707 size_t size
, int flags
, const UserPerm
& perms
)
11709 if (cct
->_conf
->client_permissions
) {
11710 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
11714 return _setxattr(in
.get(), name
, value
, size
, flags
, perms
);
11717 int Client::_setxattr_check_data_pool(string
& name
, string
& value
, const OSDMap
*osdmap
)
11720 if (name
== "layout") {
11721 string::iterator begin
= value
.begin();
11722 string::iterator end
= value
.end();
11723 keys_and_values
<string::iterator
> p
; // create instance of parser
11724 std::map
<string
, string
> m
; // map to receive results
11725 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
11730 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
11731 if (q
->first
== "pool") {
11736 } else if (name
== "layout.pool") {
11740 if (tmp
.length()) {
11743 pool
= boost::lexical_cast
<unsigned>(tmp
);
11744 if (!osdmap
->have_pg_pool(pool
))
11746 } catch (boost::bad_lexical_cast
const&) {
11747 pool
= osdmap
->lookup_pg_pool_name(tmp
);
11757 void Client::_setxattr_maybe_wait_for_osdmap(const char *name
, const void *value
, size_t size
)
11759 // For setting pool of layout, MetaRequest need osdmap epoch.
11760 // There is a race which create a new data pool but client and mds both don't have.
11761 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11762 if (strcmp(name
, "ceph.file.layout.pool") == 0 || strcmp(name
, "ceph.dir.layout.pool") == 0 ||
11763 strcmp(name
, "ceph.file.layout") == 0 || strcmp(name
, "ceph.dir.layout") == 0) {
11764 string
rest(strstr(name
, "layout"));
11765 string
v((const char*)value
, size
);
11766 int r
= objecter
->with_osdmap([&](const OSDMap
& o
) {
11767 return _setxattr_check_data_pool(rest
, v
, &o
);
11770 if (r
== -ENOENT
) {
11772 objecter
->wait_for_latest_osdmap(&ctx
);
11778 int Client::ll_setxattr(Inode
*in
, const char *name
, const void *value
,
11779 size_t size
, int flags
, const UserPerm
& perms
)
11781 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11783 std::lock_guard
lock(client_lock
);
11788 vinodeno_t vino
= _get_vino(in
);
11790 ldout(cct
, 3) << __func__
<< " " << vino
<< " " << name
<< " size " << size
<< dendl
;
11791 tout(cct
) << __func__
<< std::endl
;
11792 tout(cct
) << vino
.ino
.val
<< std::endl
;
11793 tout(cct
) << name
<< std::endl
;
11795 if (!fuse_default_permissions
) {
11796 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
11800 return _setxattr(in
, name
, value
, size
, flags
, perms
);
11803 int Client::_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
11805 if (in
->snapid
!= CEPH_NOSNAP
) {
11809 // same xattrs supported by kernel client
11810 if (strncmp(name
, "user.", 5) &&
11811 strncmp(name
, "system.", 7) &&
11812 strncmp(name
, "security.", 9) &&
11813 strncmp(name
, "trusted.", 8) &&
11814 strncmp(name
, "ceph.", 5))
11815 return -EOPNOTSUPP
;
11817 const VXattr
*vxattr
= _match_vxattr(in
, name
);
11818 if (vxattr
&& vxattr
->readonly
)
11819 return -EOPNOTSUPP
;
11821 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_RMXATTR
);
11823 in
->make_nosnap_relative_path(path
);
11824 req
->set_filepath(path
);
11825 req
->set_filepath2(name
);
11826 req
->set_inode(in
);
11828 int res
= make_request(req
, perms
);
11831 ldout(cct
, 8) << "_removexattr(" << in
->ino
<< ", \"" << name
<< "\") = " << res
<< dendl
;
11835 int Client::_removexattr(InodeRef
&in
, const char *name
, const UserPerm
& perms
)
11837 if (cct
->_conf
->client_permissions
) {
11838 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
11842 return _removexattr(in
.get(), name
, perms
);
11845 int Client::ll_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
11847 std::lock_guard
lock(client_lock
);
11852 vinodeno_t vino
= _get_vino(in
);
11854 ldout(cct
, 3) << "ll_removexattr " << vino
<< " " << name
<< dendl
;
11855 tout(cct
) << "ll_removexattr" << std::endl
;
11856 tout(cct
) << vino
.ino
.val
<< std::endl
;
11857 tout(cct
) << name
<< std::endl
;
11859 if (!fuse_default_permissions
) {
11860 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
11865 return _removexattr(in
, name
, perms
);
11868 bool Client::_vxattrcb_quota_exists(Inode
*in
)
11870 return in
->quota
.is_enable() &&
11871 (in
->snapid
!= CEPH_NOSNAP
||
11872 (in
->snaprealm
&& in
->snaprealm
->ino
== in
->ino
));
11874 size_t Client::_vxattrcb_quota(Inode
*in
, char *val
, size_t size
)
11876 return snprintf(val
, size
,
11877 "max_bytes=%lld max_files=%lld",
11878 (long long int)in
->quota
.max_bytes
,
11879 (long long int)in
->quota
.max_files
);
11881 size_t Client::_vxattrcb_quota_max_bytes(Inode
*in
, char *val
, size_t size
)
11883 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_bytes
);
11885 size_t Client::_vxattrcb_quota_max_files(Inode
*in
, char *val
, size_t size
)
11887 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_files
);
11890 bool Client::_vxattrcb_layout_exists(Inode
*in
)
11892 return in
->layout
!= file_layout_t();
11894 size_t Client::_vxattrcb_layout(Inode
*in
, char *val
, size_t size
)
11896 int r
= snprintf(val
, size
,
11897 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
11898 (unsigned long long)in
->layout
.stripe_unit
,
11899 (unsigned long long)in
->layout
.stripe_count
,
11900 (unsigned long long)in
->layout
.object_size
);
11901 objecter
->with_osdmap([&](const OSDMap
& o
) {
11902 if (o
.have_pg_pool(in
->layout
.pool_id
))
11903 r
+= snprintf(val
+ r
, size
- r
, "%s",
11904 o
.get_pool_name(in
->layout
.pool_id
).c_str());
11906 r
+= snprintf(val
+ r
, size
- r
, "%" PRIu64
,
11907 (uint64_t)in
->layout
.pool_id
);
11909 if (in
->layout
.pool_ns
.length())
11910 r
+= snprintf(val
+ r
, size
- r
, " pool_namespace=%s",
11911 in
->layout
.pool_ns
.c_str());
11914 size_t Client::_vxattrcb_layout_stripe_unit(Inode
*in
, char *val
, size_t size
)
11916 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_unit
);
11918 size_t Client::_vxattrcb_layout_stripe_count(Inode
*in
, char *val
, size_t size
)
11920 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.stripe_count
);
11922 size_t Client::_vxattrcb_layout_object_size(Inode
*in
, char *val
, size_t size
)
11924 return snprintf(val
, size
, "%llu", (unsigned long long)in
->layout
.object_size
);
11926 size_t Client::_vxattrcb_layout_pool(Inode
*in
, char *val
, size_t size
)
11929 objecter
->with_osdmap([&](const OSDMap
& o
) {
11930 if (o
.have_pg_pool(in
->layout
.pool_id
))
11931 r
= snprintf(val
, size
, "%s", o
.get_pool_name(
11932 in
->layout
.pool_id
).c_str());
11934 r
= snprintf(val
, size
, "%" PRIu64
, (uint64_t)in
->layout
.pool_id
);
11938 size_t Client::_vxattrcb_layout_pool_namespace(Inode
*in
, char *val
, size_t size
)
11940 return snprintf(val
, size
, "%s", in
->layout
.pool_ns
.c_str());
11942 size_t Client::_vxattrcb_dir_entries(Inode
*in
, char *val
, size_t size
)
11944 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->dirstat
.nfiles
+ in
->dirstat
.nsubdirs
));
11946 size_t Client::_vxattrcb_dir_files(Inode
*in
, char *val
, size_t size
)
11948 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nfiles
);
11950 size_t Client::_vxattrcb_dir_subdirs(Inode
*in
, char *val
, size_t size
)
11952 return snprintf(val
, size
, "%llu", (unsigned long long)in
->dirstat
.nsubdirs
);
11954 size_t Client::_vxattrcb_dir_rentries(Inode
*in
, char *val
, size_t size
)
11956 return snprintf(val
, size
, "%llu", (unsigned long long)(in
->rstat
.rfiles
+ in
->rstat
.rsubdirs
));
11958 size_t Client::_vxattrcb_dir_rfiles(Inode
*in
, char *val
, size_t size
)
11960 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rfiles
);
11962 size_t Client::_vxattrcb_dir_rsubdirs(Inode
*in
, char *val
, size_t size
)
11964 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rsubdirs
);
11966 size_t Client::_vxattrcb_dir_rbytes(Inode
*in
, char *val
, size_t size
)
11968 return snprintf(val
, size
, "%llu", (unsigned long long)in
->rstat
.rbytes
);
11970 size_t Client::_vxattrcb_dir_rctime(Inode
*in
, char *val
, size_t size
)
11972 return snprintf(val
, size
, "%ld.%09ld", (long)in
->rstat
.rctime
.sec(),
11973 (long)in
->rstat
.rctime
.nsec());
11975 bool Client::_vxattrcb_dir_pin_exists(Inode
*in
)
11977 return in
->dir_pin
!= -ENODATA
;
11979 size_t Client::_vxattrcb_dir_pin(Inode
*in
, char *val
, size_t size
)
11981 return snprintf(val
, size
, "%ld", (long)in
->dir_pin
);
11984 bool Client::_vxattrcb_snap_btime_exists(Inode
*in
)
11986 return !in
->snap_btime
.is_zero();
11989 size_t Client::_vxattrcb_snap_btime(Inode
*in
, char *val
, size_t size
)
11991 return snprintf(val
, size
, "%llu.%09lu",
11992 (long long unsigned)in
->snap_btime
.sec(),
11993 (long unsigned)in
->snap_btime
.nsec());
11996 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
11997 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
11999 #define XATTR_NAME_CEPH(_type, _name) \
12001 name: CEPH_XATTR_NAME(_type, _name), \
12002 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
12007 #define XATTR_NAME_CEPH2(_type, _name, _flags) \
12009 name: CEPH_XATTR_NAME(_type, _name), \
12010 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
12015 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
12017 name: CEPH_XATTR_NAME2(_type, _name, _field), \
12018 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
12020 exists_cb: &Client::_vxattrcb_layout_exists, \
12023 #define XATTR_QUOTA_FIELD(_type, _name) \
12025 name: CEPH_XATTR_NAME(_type, _name), \
12026 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
12028 exists_cb: &Client::_vxattrcb_quota_exists, \
12032 const Client::VXattr
Client::_dir_vxattrs
[] = {
12034 name
: "ceph.dir.layout",
12035 getxattr_cb
: &Client::_vxattrcb_layout
,
12037 exists_cb
: &Client::_vxattrcb_layout_exists
,
12040 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_unit
),
12041 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_count
),
12042 XATTR_LAYOUT_FIELD(dir
, layout
, object_size
),
12043 XATTR_LAYOUT_FIELD(dir
, layout
, pool
),
12044 XATTR_LAYOUT_FIELD(dir
, layout
, pool_namespace
),
12045 XATTR_NAME_CEPH(dir
, entries
),
12046 XATTR_NAME_CEPH(dir
, files
),
12047 XATTR_NAME_CEPH(dir
, subdirs
),
12048 XATTR_NAME_CEPH2(dir
, rentries
, VXATTR_RSTAT
),
12049 XATTR_NAME_CEPH2(dir
, rfiles
, VXATTR_RSTAT
),
12050 XATTR_NAME_CEPH2(dir
, rsubdirs
, VXATTR_RSTAT
),
12051 XATTR_NAME_CEPH2(dir
, rbytes
, VXATTR_RSTAT
),
12052 XATTR_NAME_CEPH2(dir
, rctime
, VXATTR_RSTAT
),
12054 name
: "ceph.quota",
12055 getxattr_cb
: &Client::_vxattrcb_quota
,
12057 exists_cb
: &Client::_vxattrcb_quota_exists
,
12060 XATTR_QUOTA_FIELD(quota
, max_bytes
),
12061 XATTR_QUOTA_FIELD(quota
, max_files
),
12063 name
: "ceph.dir.pin",
12064 getxattr_cb
: &Client::_vxattrcb_dir_pin
,
12066 exists_cb
: &Client::_vxattrcb_dir_pin_exists
,
12070 name
: "ceph.snap.btime",
12071 getxattr_cb
: &Client::_vxattrcb_snap_btime
,
12073 exists_cb
: &Client::_vxattrcb_snap_btime_exists
,
12076 { name
: "" } /* Required table terminator */
12079 const Client::VXattr
Client::_file_vxattrs
[] = {
12081 name
: "ceph.file.layout",
12082 getxattr_cb
: &Client::_vxattrcb_layout
,
12084 exists_cb
: &Client::_vxattrcb_layout_exists
,
12087 XATTR_LAYOUT_FIELD(file
, layout
, stripe_unit
),
12088 XATTR_LAYOUT_FIELD(file
, layout
, stripe_count
),
12089 XATTR_LAYOUT_FIELD(file
, layout
, object_size
),
12090 XATTR_LAYOUT_FIELD(file
, layout
, pool
),
12091 XATTR_LAYOUT_FIELD(file
, layout
, pool_namespace
),
12093 name
: "ceph.snap.btime",
12094 getxattr_cb
: &Client::_vxattrcb_snap_btime
,
12096 exists_cb
: &Client::_vxattrcb_snap_btime_exists
,
12099 { name
: "" } /* Required table terminator */
12102 const Client::VXattr
*Client::_get_vxattrs(Inode
*in
)
12105 return _dir_vxattrs
;
12106 else if (in
->is_file())
12107 return _file_vxattrs
;
12111 const Client::VXattr
*Client::_match_vxattr(Inode
*in
, const char *name
)
12113 if (strncmp(name
, "ceph.", 5) == 0) {
12114 const VXattr
*vxattr
= _get_vxattrs(in
);
12116 while (!vxattr
->name
.empty()) {
12117 if (vxattr
->name
== name
)
12126 int Client::ll_readlink(Inode
*in
, char *buf
, size_t buflen
, const UserPerm
& perms
)
12128 std::lock_guard
lock(client_lock
);
12133 vinodeno_t vino
= _get_vino(in
);
12135 ldout(cct
, 3) << "ll_readlink " << vino
<< dendl
;
12136 tout(cct
) << "ll_readlink" << std::endl
;
12137 tout(cct
) << vino
.ino
.val
<< std::endl
;
12139 for (auto dn
: in
->dentries
) {
12143 int r
= _readlink(in
, buf
, buflen
); // FIXME: no permission checking!
12144 ldout(cct
, 3) << "ll_readlink " << vino
<< " = " << r
<< dendl
;
12148 int Client::_mknod(Inode
*dir
, const char *name
, mode_t mode
, dev_t rdev
,
12149 const UserPerm
& perms
, InodeRef
*inp
)
12151 ldout(cct
, 8) << "_mknod(" << dir
->ino
<< " " << name
<< ", 0" << oct
12152 << mode
<< dec
<< ", " << rdev
<< ", uid " << perms
.uid()
12153 << ", gid " << perms
.gid() << ")" << dendl
;
12155 if (strlen(name
) > NAME_MAX
)
12156 return -ENAMETOOLONG
;
12158 if (dir
->snapid
!= CEPH_NOSNAP
) {
12161 if (is_quota_files_exceeded(dir
, perms
)) {
12165 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_MKNOD
);
12168 dir
->make_nosnap_relative_path(path
);
12169 path
.push_dentry(name
);
12170 req
->set_filepath(path
);
12171 req
->set_inode(dir
);
12172 req
->head
.args
.mknod
.rdev
= rdev
;
12173 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12174 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12176 bufferlist xattrs_bl
;
12177 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
12180 req
->head
.args
.mknod
.mode
= mode
;
12181 if (xattrs_bl
.length() > 0)
12182 req
->set_data(xattrs_bl
);
12185 res
= get_or_create(dir
, name
, &de
);
12188 req
->set_dentry(de
);
12190 res
= make_request(req
, perms
, inp
);
12194 ldout(cct
, 8) << "mknod(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
12202 int Client::ll_mknod(Inode
*parent
, const char *name
, mode_t mode
,
12203 dev_t rdev
, struct stat
*attr
, Inode
**out
,
12204 const UserPerm
& perms
)
12206 std::lock_guard
lock(client_lock
);
12211 vinodeno_t vparent
= _get_vino(parent
);
12213 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
<< dendl
;
12214 tout(cct
) << "ll_mknod" << std::endl
;
12215 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12216 tout(cct
) << name
<< std::endl
;
12217 tout(cct
) << mode
<< std::endl
;
12218 tout(cct
) << rdev
<< std::endl
;
12220 if (!fuse_default_permissions
) {
12221 int r
= may_create(parent
, perms
);
12227 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
12229 fill_stat(in
, attr
);
12232 tout(cct
) << attr
->st_ino
<< std::endl
;
12233 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
12234 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
12239 int Client::ll_mknodx(Inode
*parent
, const char *name
, mode_t mode
,
12240 dev_t rdev
, Inode
**out
,
12241 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
12242 const UserPerm
& perms
)
12244 unsigned caps
= statx_to_mask(flags
, want
);
12245 std::lock_guard
lock(client_lock
);
12250 vinodeno_t vparent
= _get_vino(parent
);
12252 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
<< dendl
;
12253 tout(cct
) << "ll_mknodx" << std::endl
;
12254 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12255 tout(cct
) << name
<< std::endl
;
12256 tout(cct
) << mode
<< std::endl
;
12257 tout(cct
) << rdev
<< std::endl
;
12259 if (!fuse_default_permissions
) {
12260 int r
= may_create(parent
, perms
);
12266 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
12268 fill_statx(in
, caps
, stx
);
12271 tout(cct
) << stx
->stx_ino
<< std::endl
;
12272 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
12273 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12278 int Client::_create(Inode
*dir
, const char *name
, int flags
, mode_t mode
,
12279 InodeRef
*inp
, Fh
**fhp
, int stripe_unit
, int stripe_count
,
12280 int object_size
, const char *data_pool
, bool *created
,
12281 const UserPerm
& perms
)
12283 ldout(cct
, 8) << "_create(" << dir
->ino
<< " " << name
<< ", 0" << oct
<<
12284 mode
<< dec
<< ")" << dendl
;
12286 if (strlen(name
) > NAME_MAX
)
12287 return -ENAMETOOLONG
;
12288 if (dir
->snapid
!= CEPH_NOSNAP
) {
12291 if (is_quota_files_exceeded(dir
, perms
)) {
12295 // use normalized flags to generate cmode
12296 int cflags
= ceph_flags_sys2wire(flags
);
12297 if (cct
->_conf
.get_val
<bool>("client_force_lazyio"))
12298 cflags
|= CEPH_O_LAZY
;
12300 int cmode
= ceph_flags_to_mode(cflags
);
12302 int64_t pool_id
= -1;
12303 if (data_pool
&& *data_pool
) {
12304 pool_id
= objecter
->with_osdmap(
12305 std::mem_fn(&OSDMap::lookup_pg_pool_name
), data_pool
);
12308 if (pool_id
> 0xffffffffll
)
12309 return -ERANGE
; // bummer!
12312 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_CREATE
);
12315 dir
->make_nosnap_relative_path(path
);
12316 path
.push_dentry(name
);
12317 req
->set_filepath(path
);
12318 req
->set_inode(dir
);
12319 req
->head
.args
.open
.flags
= cflags
| CEPH_O_CREAT
;
12321 req
->head
.args
.open
.stripe_unit
= stripe_unit
;
12322 req
->head
.args
.open
.stripe_count
= stripe_count
;
12323 req
->head
.args
.open
.object_size
= object_size
;
12324 if (cct
->_conf
->client_debug_getattr_caps
)
12325 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
12327 req
->head
.args
.open
.mask
= 0;
12328 req
->head
.args
.open
.pool
= pool_id
;
12329 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12330 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12333 bufferlist xattrs_bl
;
12334 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
12337 req
->head
.args
.open
.mode
= mode
;
12338 if (xattrs_bl
.length() > 0)
12339 req
->set_data(xattrs_bl
);
12342 res
= get_or_create(dir
, name
, &de
);
12345 req
->set_dentry(de
);
12347 res
= make_request(req
, perms
, inp
, created
);
12352 /* If the caller passed a value in fhp, do the open */
12354 (*inp
)->get_open_ref(cmode
);
12355 *fhp
= _create_fh(inp
->get(), flags
, cmode
, perms
);
12361 ldout(cct
, 8) << "create(" << path
<< ", 0" << oct
<< mode
<< dec
12362 << " layout " << stripe_unit
12363 << ' ' << stripe_count
12364 << ' ' << object_size
12365 <<") = " << res
<< dendl
;
12374 int Client::_mkdir(Inode
*dir
, const char *name
, mode_t mode
, const UserPerm
& perm
,
12377 ldout(cct
, 8) << "_mkdir(" << dir
->ino
<< " " << name
<< ", 0" << oct
12378 << mode
<< dec
<< ", uid " << perm
.uid()
12379 << ", gid " << perm
.gid() << ")" << dendl
;
12381 if (strlen(name
) > NAME_MAX
)
12382 return -ENAMETOOLONG
;
12384 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
12387 if (is_quota_files_exceeded(dir
, perm
)) {
12390 MetaRequest
*req
= new MetaRequest(dir
->snapid
== CEPH_SNAPDIR
?
12391 CEPH_MDS_OP_MKSNAP
: CEPH_MDS_OP_MKDIR
);
12394 dir
->make_nosnap_relative_path(path
);
12395 path
.push_dentry(name
);
12396 req
->set_filepath(path
);
12397 req
->set_inode(dir
);
12398 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12399 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12402 bufferlist xattrs_bl
;
12403 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perm
);
12406 req
->head
.args
.mkdir
.mode
= mode
;
12407 if (xattrs_bl
.length() > 0)
12408 req
->set_data(xattrs_bl
);
12411 res
= get_or_create(dir
, name
, &de
);
12414 req
->set_dentry(de
);
12416 ldout(cct
, 10) << "_mkdir: making request" << dendl
;
12417 res
= make_request(req
, perm
, inp
);
12418 ldout(cct
, 10) << "_mkdir result is " << res
<< dendl
;
12422 ldout(cct
, 8) << "_mkdir(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
12430 int Client::ll_mkdir(Inode
*parent
, const char *name
, mode_t mode
,
12431 struct stat
*attr
, Inode
**out
, const UserPerm
& perm
)
12433 std::lock_guard
lock(client_lock
);
12438 vinodeno_t vparent
= _get_vino(parent
);
12440 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
<< dendl
;
12441 tout(cct
) << "ll_mkdir" << std::endl
;
12442 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12443 tout(cct
) << name
<< std::endl
;
12444 tout(cct
) << mode
<< std::endl
;
12446 if (!fuse_default_permissions
) {
12447 int r
= may_create(parent
, perm
);
12453 int r
= _mkdir(parent
, name
, mode
, perm
, &in
);
12455 fill_stat(in
, attr
);
12458 tout(cct
) << attr
->st_ino
<< std::endl
;
12459 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
12460 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
12465 int Client::ll_mkdirx(Inode
*parent
, const char *name
, mode_t mode
, Inode
**out
,
12466 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
12467 const UserPerm
& perms
)
12469 std::lock_guard
lock(client_lock
);
12474 vinodeno_t vparent
= _get_vino(parent
);
12476 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
<< dendl
;
12477 tout(cct
) << "ll_mkdirx" << std::endl
;
12478 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12479 tout(cct
) << name
<< std::endl
;
12480 tout(cct
) << mode
<< std::endl
;
12482 if (!fuse_default_permissions
) {
12483 int r
= may_create(parent
, perms
);
12489 int r
= _mkdir(parent
, name
, mode
, perms
, &in
);
12491 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
12497 tout(cct
) << stx
->stx_ino
<< std::endl
;
12498 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
12499 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12504 int Client::_symlink(Inode
*dir
, const char *name
, const char *target
,
12505 const UserPerm
& perms
, InodeRef
*inp
)
12507 ldout(cct
, 8) << "_symlink(" << dir
->ino
<< " " << name
<< ", " << target
12508 << ", uid " << perms
.uid() << ", gid " << perms
.gid() << ")"
12511 if (strlen(name
) > NAME_MAX
)
12512 return -ENAMETOOLONG
;
12514 if (dir
->snapid
!= CEPH_NOSNAP
) {
12517 if (is_quota_files_exceeded(dir
, perms
)) {
12521 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SYMLINK
);
12524 dir
->make_nosnap_relative_path(path
);
12525 path
.push_dentry(name
);
12526 req
->set_filepath(path
);
12527 req
->set_inode(dir
);
12528 req
->set_string2(target
);
12529 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12530 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12533 int res
= get_or_create(dir
, name
, &de
);
12536 req
->set_dentry(de
);
12538 res
= make_request(req
, perms
, inp
);
12541 ldout(cct
, 8) << "_symlink(\"" << path
<< "\", \"" << target
<< "\") = " <<
12550 int Client::ll_symlink(Inode
*parent
, const char *name
, const char *value
,
12551 struct stat
*attr
, Inode
**out
, const UserPerm
& perms
)
12553 std::lock_guard
lock(client_lock
);
12558 vinodeno_t vparent
= _get_vino(parent
);
12560 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
<< " -> " << value
12562 tout(cct
) << "ll_symlink" << std::endl
;
12563 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12564 tout(cct
) << name
<< std::endl
;
12565 tout(cct
) << value
<< std::endl
;
12567 if (!fuse_default_permissions
) {
12568 int r
= may_create(parent
, perms
);
12574 int r
= _symlink(parent
, name
, value
, perms
, &in
);
12576 fill_stat(in
, attr
);
12579 tout(cct
) << attr
->st_ino
<< std::endl
;
12580 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
12581 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
12586 int Client::ll_symlinkx(Inode
*parent
, const char *name
, const char *value
,
12587 Inode
**out
, struct ceph_statx
*stx
, unsigned want
,
12588 unsigned flags
, const UserPerm
& perms
)
12590 std::lock_guard
lock(client_lock
);
12595 vinodeno_t vparent
= _get_vino(parent
);
12597 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
<< " -> " << value
12599 tout(cct
) << "ll_symlinkx" << std::endl
;
12600 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12601 tout(cct
) << name
<< std::endl
;
12602 tout(cct
) << value
<< std::endl
;
12604 if (!fuse_default_permissions
) {
12605 int r
= may_create(parent
, perms
);
12611 int r
= _symlink(parent
, name
, value
, perms
, &in
);
12613 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
12616 tout(cct
) << stx
->stx_ino
<< std::endl
;
12617 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
12618 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12623 int Client::_unlink(Inode
*dir
, const char *name
, const UserPerm
& perm
)
12625 ldout(cct
, 8) << "_unlink(" << dir
->ino
<< " " << name
12626 << " uid " << perm
.uid() << " gid " << perm
.gid()
12629 if (dir
->snapid
!= CEPH_NOSNAP
) {
12633 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_UNLINK
);
12636 dir
->make_nosnap_relative_path(path
);
12637 path
.push_dentry(name
);
12638 req
->set_filepath(path
);
12644 int res
= get_or_create(dir
, name
, &de
);
12647 req
->set_dentry(de
);
12648 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12649 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12651 res
= _lookup(dir
, name
, 0, &otherin
, perm
);
12655 in
= otherin
.get();
12656 req
->set_other_inode(in
);
12657 in
->break_all_delegs();
12658 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
12660 req
->set_inode(dir
);
12662 res
= make_request(req
, perm
);
12665 ldout(cct
, 8) << "unlink(" << path
<< ") = " << res
<< dendl
;
12673 int Client::ll_unlink(Inode
*in
, const char *name
, const UserPerm
& perm
)
12675 std::lock_guard
lock(client_lock
);
12680 vinodeno_t vino
= _get_vino(in
);
12682 ldout(cct
, 3) << "ll_unlink " << vino
<< " " << name
<< dendl
;
12683 tout(cct
) << "ll_unlink" << std::endl
;
12684 tout(cct
) << vino
.ino
.val
<< std::endl
;
12685 tout(cct
) << name
<< std::endl
;
12687 if (!fuse_default_permissions
) {
12688 int r
= may_delete(in
, name
, perm
);
12692 return _unlink(in
, name
, perm
);
12695 int Client::_rmdir(Inode
*dir
, const char *name
, const UserPerm
& perms
)
12697 ldout(cct
, 8) << "_rmdir(" << dir
->ino
<< " " << name
<< " uid "
12698 << perms
.uid() << " gid " << perms
.gid() << ")" << dendl
;
12700 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
12704 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_RMSNAP
: CEPH_MDS_OP_RMDIR
;
12705 MetaRequest
*req
= new MetaRequest(op
);
12707 dir
->make_nosnap_relative_path(path
);
12708 path
.push_dentry(name
);
12709 req
->set_filepath(path
);
12710 req
->set_inode(dir
);
12712 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12713 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12714 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
12719 int res
= get_or_create(dir
, name
, &de
);
12722 if (op
== CEPH_MDS_OP_RMDIR
)
12723 req
->set_dentry(de
);
12727 res
= _lookup(dir
, name
, 0, &in
, perms
);
12731 if (op
== CEPH_MDS_OP_RMSNAP
) {
12732 unlink(de
, true, true);
12735 req
->set_other_inode(in
.get());
12737 res
= make_request(req
, perms
);
12740 ldout(cct
, 8) << "rmdir(" << path
<< ") = " << res
<< dendl
;
12748 int Client::ll_rmdir(Inode
*in
, const char *name
, const UserPerm
& perms
)
12750 std::lock_guard
lock(client_lock
);
12755 vinodeno_t vino
= _get_vino(in
);
12757 ldout(cct
, 3) << "ll_rmdir " << vino
<< " " << name
<< dendl
;
12758 tout(cct
) << "ll_rmdir" << std::endl
;
12759 tout(cct
) << vino
.ino
.val
<< std::endl
;
12760 tout(cct
) << name
<< std::endl
;
12762 if (!fuse_default_permissions
) {
12763 int r
= may_delete(in
, name
, perms
);
12768 return _rmdir(in
, name
, perms
);
12771 int Client::_rename(Inode
*fromdir
, const char *fromname
, Inode
*todir
, const char *toname
, const UserPerm
& perm
)
12773 ldout(cct
, 8) << "_rename(" << fromdir
->ino
<< " " << fromname
<< " to "
12774 << todir
->ino
<< " " << toname
12775 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")"
12778 if (fromdir
->snapid
!= todir
->snapid
)
12781 int op
= CEPH_MDS_OP_RENAME
;
12782 if (fromdir
->snapid
!= CEPH_NOSNAP
) {
12783 if (fromdir
== todir
&& fromdir
->snapid
== CEPH_SNAPDIR
)
12784 op
= CEPH_MDS_OP_RENAMESNAP
;
12790 MetaRequest
*req
= new MetaRequest(op
);
12793 fromdir
->make_nosnap_relative_path(from
);
12794 from
.push_dentry(fromname
);
12796 todir
->make_nosnap_relative_path(to
);
12797 to
.push_dentry(toname
);
12798 req
->set_filepath(to
);
12799 req
->set_filepath2(from
);
12802 int res
= get_or_create(fromdir
, fromname
, &oldde
);
12806 res
= get_or_create(todir
, toname
, &de
);
12810 if (op
== CEPH_MDS_OP_RENAME
) {
12811 req
->set_old_dentry(oldde
);
12812 req
->old_dentry_drop
= CEPH_CAP_FILE_SHARED
;
12813 req
->old_dentry_unless
= CEPH_CAP_FILE_EXCL
;
12815 req
->set_dentry(de
);
12816 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12817 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12819 InodeRef oldin
, otherin
;
12820 Inode
*fromdir_root
= nullptr;
12821 Inode
*todir_root
= nullptr;
12823 bool quota_check
= false;
12824 if (fromdir
!= todir
) {
12826 fromdir
->quota
.is_enable() ? fromdir
: get_quota_root(fromdir
, perm
);
12828 todir
->quota
.is_enable() ? todir
: get_quota_root(todir
, perm
);
12830 if (todir_root
->quota
.is_enable() && fromdir_root
!= todir_root
) {
12831 // use CEPH_STAT_RSTAT mask to force send getattr or lookup request
12832 // to auth MDS to get latest rstat for todir_root and source dir
12833 // even if their dentry caches and inode caps are satisfied.
12834 res
= _getattr(todir_root
, CEPH_STAT_RSTAT
, perm
, true);
12838 quota_check
= true;
12839 if (oldde
->inode
&& oldde
->inode
->is_dir()) {
12840 mask
|= CEPH_STAT_RSTAT
;
12845 res
= _lookup(fromdir
, fromname
, mask
, &oldin
, perm
);
12849 Inode
*oldinode
= oldin
.get();
12850 oldinode
->break_all_delegs();
12851 req
->set_old_inode(oldinode
);
12852 req
->old_inode_drop
= CEPH_CAP_LINK_SHARED
;
12855 int64_t old_bytes
, old_files
;
12856 if (oldinode
->is_dir()) {
12857 old_bytes
= oldinode
->rstat
.rbytes
;
12858 old_files
= oldinode
->rstat
.rsize();
12860 old_bytes
= oldinode
->size
;
12864 bool quota_exceed
= false;
12865 if (todir_root
&& todir_root
->quota
.max_bytes
&&
12866 (old_bytes
+ todir_root
->rstat
.rbytes
) >= todir_root
->quota
.max_bytes
) {
12867 ldout(cct
, 10) << "_rename (" << oldinode
->ino
<< " bytes="
12868 << old_bytes
<< ") to (" << todir
->ino
12869 << ") will exceed quota on " << *todir_root
<< dendl
;
12870 quota_exceed
= true;
12873 if (todir_root
&& todir_root
->quota
.max_files
&&
12874 (old_files
+ todir_root
->rstat
.rsize()) >= todir_root
->quota
.max_files
) {
12875 ldout(cct
, 10) << "_rename (" << oldinode
->ino
<< " files="
12876 << old_files
<< ") to (" << todir
->ino
12877 << ") will exceed quota on " << *todir_root
<< dendl
;
12878 quota_exceed
= true;
12881 if (quota_exceed
) {
12882 res
= (oldinode
->is_dir()) ? -EXDEV
: -EDQUOT
;
12887 res
= _lookup(todir
, toname
, 0, &otherin
, perm
);
12891 Inode
*in
= otherin
.get();
12892 req
->set_other_inode(in
);
12893 in
->break_all_delegs();
12895 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
12903 req
->set_inode(todir
);
12905 // renamesnap reply contains no tracedn, so we need to invalidate
12907 unlink(oldde
, true, true);
12908 unlink(de
, true, true);
12910 req
->set_inode(todir
);
12913 res
= make_request(req
, perm
, &target
);
12914 ldout(cct
, 10) << "rename result is " << res
<< dendl
;
12916 // renamed item from our cache
12919 ldout(cct
, 8) << "_rename(" << from
<< ", " << to
<< ") = " << res
<< dendl
;
12927 int Client::ll_rename(Inode
*parent
, const char *name
, Inode
*newparent
,
12928 const char *newname
, const UserPerm
& perm
)
12930 std::lock_guard
lock(client_lock
);
12935 vinodeno_t vparent
= _get_vino(parent
);
12936 vinodeno_t vnewparent
= _get_vino(newparent
);
12938 ldout(cct
, 3) << "ll_rename " << vparent
<< " " << name
<< " to "
12939 << vnewparent
<< " " << newname
<< dendl
;
12940 tout(cct
) << "ll_rename" << std::endl
;
12941 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12942 tout(cct
) << name
<< std::endl
;
12943 tout(cct
) << vnewparent
.ino
.val
<< std::endl
;
12944 tout(cct
) << newname
<< std::endl
;
12946 if (!fuse_default_permissions
) {
12947 int r
= may_delete(parent
, name
, perm
);
12950 r
= may_delete(newparent
, newname
, perm
);
12951 if (r
< 0 && r
!= -ENOENT
)
12955 return _rename(parent
, name
, newparent
, newname
, perm
);
12958 int Client::_link(Inode
*in
, Inode
*dir
, const char *newname
, const UserPerm
& perm
, InodeRef
*inp
)
12960 ldout(cct
, 8) << "_link(" << in
->ino
<< " to " << dir
->ino
<< " " << newname
12961 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")" << dendl
;
12963 if (strlen(newname
) > NAME_MAX
)
12964 return -ENAMETOOLONG
;
12966 if (in
->snapid
!= CEPH_NOSNAP
|| dir
->snapid
!= CEPH_NOSNAP
) {
12969 if (is_quota_files_exceeded(dir
, perm
)) {
12973 in
->break_all_delegs();
12974 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LINK
);
12976 filepath
path(newname
, dir
->ino
);
12977 req
->set_filepath(path
);
12978 filepath
existing(in
->ino
);
12979 req
->set_filepath2(existing
);
12981 req
->set_inode(dir
);
12982 req
->inode_drop
= CEPH_CAP_FILE_SHARED
;
12983 req
->inode_unless
= CEPH_CAP_FILE_EXCL
;
12986 int res
= get_or_create(dir
, newname
, &de
);
12989 req
->set_dentry(de
);
12991 res
= make_request(req
, perm
, inp
);
12992 ldout(cct
, 10) << "link result is " << res
<< dendl
;
12995 ldout(cct
, 8) << "link(" << existing
<< ", " << path
<< ") = " << res
<< dendl
;
13003 int Client::ll_link(Inode
*in
, Inode
*newparent
, const char *newname
,
13004 const UserPerm
& perm
)
13006 std::lock_guard
lock(client_lock
);
13011 vinodeno_t vino
= _get_vino(in
);
13012 vinodeno_t vnewparent
= _get_vino(newparent
);
13014 ldout(cct
, 3) << "ll_link " << vino
<< " to " << vnewparent
<< " " <<
13016 tout(cct
) << "ll_link" << std::endl
;
13017 tout(cct
) << vino
.ino
.val
<< std::endl
;
13018 tout(cct
) << vnewparent
<< std::endl
;
13019 tout(cct
) << newname
<< std::endl
;
13023 if (!fuse_default_permissions
) {
13024 if (S_ISDIR(in
->mode
))
13027 int r
= may_hardlink(in
, perm
);
13031 r
= may_create(newparent
, perm
);
13036 return _link(in
, newparent
, newname
, perm
, &target
);
13039 int Client::ll_num_osds(void)
13041 std::lock_guard
lock(client_lock
);
13042 return objecter
->with_osdmap(std::mem_fn(&OSDMap::get_num_osds
));
13045 int Client::ll_osdaddr(int osd
, uint32_t *addr
)
13047 std::lock_guard
lock(client_lock
);
13050 bool exists
= objecter
->with_osdmap([&](const OSDMap
& o
) {
13051 if (!o
.exists(osd
))
13053 g
= o
.get_addrs(osd
).front();
13058 uint32_t nb_addr
= (g
.in4_addr()).sin_addr
.s_addr
;
13059 *addr
= ntohl(nb_addr
);
13063 uint32_t Client::ll_stripe_unit(Inode
*in
)
13065 std::lock_guard
lock(client_lock
);
13066 return in
->layout
.stripe_unit
;
13069 uint64_t Client::ll_snap_seq(Inode
*in
)
13071 std::lock_guard
lock(client_lock
);
13072 return in
->snaprealm
->seq
;
13075 int Client::ll_file_layout(Inode
*in
, file_layout_t
*layout
)
13077 std::lock_guard
lock(client_lock
);
13078 *layout
= in
->layout
;
13082 int Client::ll_file_layout(Fh
*fh
, file_layout_t
*layout
)
13084 return ll_file_layout(fh
->inode
.get(), layout
);
13087 /* Currently we cannot take advantage of redundancy in reads, since we
13088 would have to go through all possible placement groups (a
13089 potentially quite large number determined by a hash), and use CRUSH
13090 to calculate the appropriate set of OSDs for each placement group,
13091 then index into that. An array with one entry per OSD is much more
13092 tractable and works for demonstration purposes. */
13094 int Client::ll_get_stripe_osd(Inode
*in
, uint64_t blockno
,
13095 file_layout_t
* layout
)
13097 std::lock_guard
lock(client_lock
);
13099 inodeno_t ino
= in
->ino
;
13100 uint32_t object_size
= layout
->object_size
;
13101 uint32_t su
= layout
->stripe_unit
;
13102 uint32_t stripe_count
= layout
->stripe_count
;
13103 uint64_t stripes_per_object
= object_size
/ su
;
13104 uint64_t stripeno
= 0, stripepos
= 0;
13107 stripeno
= blockno
/ stripe_count
; // which horizontal stripe (Y)
13108 stripepos
= blockno
% stripe_count
; // which object in the object set (X)
13110 uint64_t objectsetno
= stripeno
/ stripes_per_object
; // which object set
13111 uint64_t objectno
= objectsetno
* stripe_count
+ stripepos
; // object id
13113 object_t oid
= file_object_t(ino
, objectno
);
13114 return objecter
->with_osdmap([&](const OSDMap
& o
) {
13115 ceph_object_layout olayout
=
13116 o
.file_to_object_layout(oid
, *layout
);
13117 pg_t pg
= (pg_t
)olayout
.ol_pgid
;
13120 o
.pg_to_acting_osds(pg
, &osds
, &primary
);
13125 /* Return the offset of the block, internal to the object */
13127 uint64_t Client::ll_get_internal_offset(Inode
*in
, uint64_t blockno
)
13129 std::lock_guard
lock(client_lock
);
13130 file_layout_t
*layout
=&(in
->layout
);
13131 uint32_t object_size
= layout
->object_size
;
13132 uint32_t su
= layout
->stripe_unit
;
13133 uint64_t stripes_per_object
= object_size
/ su
;
13135 return (blockno
% stripes_per_object
) * su
;
13138 int Client::ll_opendir(Inode
*in
, int flags
, dir_result_t
** dirpp
,
13139 const UserPerm
& perms
)
13141 std::lock_guard
lock(client_lock
);
13146 vinodeno_t vino
= _get_vino(in
);
13148 ldout(cct
, 3) << "ll_opendir " << vino
<< dendl
;
13149 tout(cct
) << "ll_opendir" << std::endl
;
13150 tout(cct
) << vino
.ino
.val
<< std::endl
;
13152 if (!fuse_default_permissions
) {
13153 int r
= may_open(in
, flags
, perms
);
13158 int r
= _opendir(in
, dirpp
, perms
);
13159 tout(cct
) << (unsigned long)*dirpp
<< std::endl
;
13161 ldout(cct
, 3) << "ll_opendir " << vino
<< " = " << r
<< " (" << *dirpp
<< ")"
13166 int Client::ll_releasedir(dir_result_t
*dirp
)
13168 std::lock_guard
lock(client_lock
);
13169 ldout(cct
, 3) << "ll_releasedir " << dirp
<< dendl
;
13170 tout(cct
) << "ll_releasedir" << std::endl
;
13171 tout(cct
) << (unsigned long)dirp
<< std::endl
;
13180 int Client::ll_fsyncdir(dir_result_t
*dirp
)
13182 std::lock_guard
lock(client_lock
);
13183 ldout(cct
, 3) << "ll_fsyncdir " << dirp
<< dendl
;
13184 tout(cct
) << "ll_fsyncdir" << std::endl
;
13185 tout(cct
) << (unsigned long)dirp
<< std::endl
;
13190 return _fsync(dirp
->inode
.get(), false);
13193 int Client::ll_open(Inode
*in
, int flags
, Fh
**fhp
, const UserPerm
& perms
)
13195 ceph_assert(!(flags
& O_CREAT
));
13197 std::lock_guard
lock(client_lock
);
13202 vinodeno_t vino
= _get_vino(in
);
13204 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) << dendl
;
13205 tout(cct
) << "ll_open" << std::endl
;
13206 tout(cct
) << vino
.ino
.val
<< std::endl
;
13207 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
13210 if (!fuse_default_permissions
) {
13211 r
= may_open(in
, flags
, perms
);
13216 r
= _open(in
, flags
, 0, fhp
/* may be NULL */, perms
);
13219 Fh
*fhptr
= fhp
? *fhp
: NULL
;
13221 ll_unclosed_fh_set
.insert(fhptr
);
13223 tout(cct
) << (unsigned long)fhptr
<< std::endl
;
13224 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) <<
13225 " = " << r
<< " (" << fhptr
<< ")" << dendl
;
13229 int Client::_ll_create(Inode
*parent
, const char *name
, mode_t mode
,
13230 int flags
, InodeRef
*in
, int caps
, Fh
**fhp
,
13231 const UserPerm
& perms
)
13235 vinodeno_t vparent
= _get_vino(parent
);
13237 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
13238 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << ", uid " << perms
.uid()
13239 << ", gid " << perms
.gid() << dendl
;
13240 tout(cct
) << "ll_create" << std::endl
;
13241 tout(cct
) << vparent
.ino
.val
<< std::endl
;
13242 tout(cct
) << name
<< std::endl
;
13243 tout(cct
) << mode
<< std::endl
;
13244 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
13246 bool created
= false;
13247 int r
= _lookup(parent
, name
, caps
, in
, perms
);
13249 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
13252 if (r
== -ENOENT
&& (flags
& O_CREAT
)) {
13253 if (!fuse_default_permissions
) {
13254 r
= may_create(parent
, perms
);
13258 r
= _create(parent
, name
, flags
, mode
, in
, fhp
, 0, 0, 0, NULL
, &created
,
13269 ldout(cct
, 20) << "_ll_create created = " << created
<< dendl
;
13271 if (!fuse_default_permissions
) {
13272 r
= may_open(in
->get(), flags
, perms
);
13275 int release_r
= _release_fh(*fhp
);
13276 ceph_assert(release_r
== 0); // during create, no async data ops should have happened
13281 if (*fhp
== NULL
) {
13282 r
= _open(in
->get(), flags
, mode
, fhp
, perms
);
13290 ll_unclosed_fh_set
.insert(*fhp
);
13295 Inode
*inode
= in
->get();
13296 if (use_faked_inos())
13297 ino
= inode
->faked_ino
;
13302 tout(cct
) << (unsigned long)*fhp
<< std::endl
;
13303 tout(cct
) << ino
<< std::endl
;
13304 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
13305 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << " = " << r
<< " (" <<
13306 *fhp
<< " " << hex
<< ino
<< dec
<< ")" << dendl
;
13311 int Client::ll_create(Inode
*parent
, const char *name
, mode_t mode
,
13312 int flags
, struct stat
*attr
, Inode
**outp
, Fh
**fhp
,
13313 const UserPerm
& perms
)
13315 std::lock_guard
lock(client_lock
);
13321 int r
= _ll_create(parent
, name
, mode
, flags
, &in
, CEPH_STAT_CAP_INODE_ALL
,
13326 // passing an Inode in outp requires an additional ref
13331 fill_stat(in
, attr
);
13339 int Client::ll_createx(Inode
*parent
, const char *name
, mode_t mode
,
13340 int oflags
, Inode
**outp
, Fh
**fhp
,
13341 struct ceph_statx
*stx
, unsigned want
, unsigned lflags
,
13342 const UserPerm
& perms
)
13344 unsigned caps
= statx_to_mask(lflags
, want
);
13345 std::lock_guard
lock(client_lock
);
13351 int r
= _ll_create(parent
, name
, mode
, oflags
, &in
, caps
, fhp
, perms
);
13355 // passing an Inode in outp requires an additional ref
13360 fill_statx(in
, caps
, stx
);
13369 loff_t
Client::ll_lseek(Fh
*fh
, loff_t offset
, int whence
)
13371 std::lock_guard
lock(client_lock
);
13372 tout(cct
) << "ll_lseek" << std::endl
;
13373 tout(cct
) << offset
<< std::endl
;
13374 tout(cct
) << whence
<< std::endl
;
13379 return _lseek(fh
, offset
, whence
);
13382 int Client::ll_read(Fh
*fh
, loff_t off
, loff_t len
, bufferlist
*bl
)
13384 std::lock_guard
lock(client_lock
);
13385 ldout(cct
, 3) << "ll_read " << fh
<< " " << fh
->inode
->ino
<< " " << " " << off
<< "~" << len
<< dendl
;
13386 tout(cct
) << "ll_read" << std::endl
;
13387 tout(cct
) << (unsigned long)fh
<< std::endl
;
13388 tout(cct
) << off
<< std::endl
;
13389 tout(cct
) << len
<< std::endl
;
13394 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13395 len
= std::min(len
, (loff_t
)INT_MAX
);
13396 int r
= _read(fh
, off
, len
, bl
);
13397 ldout(cct
, 3) << "ll_read " << fh
<< " " << off
<< "~" << len
<< " = " << r
13402 int Client::ll_read_block(Inode
*in
, uint64_t blockid
,
13406 file_layout_t
* layout
)
13408 std::lock_guard
lock(client_lock
);
13413 vinodeno_t vino
= _get_vino(in
);
13414 object_t oid
= file_object_t(vino
.ino
, blockid
);
13415 C_SaferCond onfinish
;
13418 objecter
->read(oid
,
13419 object_locator_t(layout
->pool_id
),
13424 CEPH_OSD_FLAG_READ
,
13427 client_lock
.unlock();
13428 int r
= onfinish
.wait();
13429 client_lock
.lock();
13432 bl
.begin().copy(bl
.length(), buf
);
13439 /* It appears that the OSD doesn't return success unless the entire
13440 buffer was written, return the write length on success. */
13442 int Client::ll_write_block(Inode
*in
, uint64_t blockid
,
13443 char* buf
, uint64_t offset
,
13444 uint64_t length
, file_layout_t
* layout
,
13445 uint64_t snapseq
, uint32_t sync
)
13447 vinodeno_t vino
= ll_get_vino(in
);
13449 std::unique_ptr
<C_SaferCond
> onsafe
= nullptr;
13454 if (true || sync
) {
13455 /* if write is stable, the epilogue is waiting on
13457 onsafe
.reset(new C_SaferCond("Client::ll_write_block flock"));
13459 object_t oid
= file_object_t(vino
.ino
, blockid
);
13460 SnapContext fakesnap
;
13461 ceph::bufferlist bl
;
13463 bl
.push_back(buffer::copy(buf
, length
));
13466 ldout(cct
, 1) << "ll_block_write for " << vino
.ino
<< "." << blockid
13469 fakesnap
.seq
= snapseq
;
13471 /* lock just in time */
13472 client_lock
.lock();
13474 client_lock
.unlock();
13478 objecter
->write(oid
,
13479 object_locator_t(layout
->pool_id
),
13484 ceph::real_clock::now(),
13488 client_lock
.unlock();
13489 if (nullptr != onsafe
) {
13490 r
= onsafe
->wait();
13500 int Client::ll_commit_blocks(Inode
*in
,
13504 std::lock_guard
lock(client_lock
);
13506 BarrierContext *bctx;
13507 vinodeno_t vino = _get_vino(in);
13508 uint64_t ino = vino.ino;
13510 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
13511 << offset << " to " << length << dendl;
13517 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
13518 if (p != barriers.end()) {
13519 barrier_interval civ(offset, offset + length);
13520 p->second->commit_barrier(civ);
13526 int Client::ll_write(Fh
*fh
, loff_t off
, loff_t len
, const char *data
)
13528 std::lock_guard
lock(client_lock
);
13529 ldout(cct
, 3) << "ll_write " << fh
<< " " << fh
->inode
->ino
<< " " << off
<<
13530 "~" << len
<< dendl
;
13531 tout(cct
) << "ll_write" << std::endl
;
13532 tout(cct
) << (unsigned long)fh
<< std::endl
;
13533 tout(cct
) << off
<< std::endl
;
13534 tout(cct
) << len
<< std::endl
;
13539 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13540 len
= std::min(len
, (loff_t
)INT_MAX
);
13541 int r
= _write(fh
, off
, len
, data
, NULL
, 0);
13542 ldout(cct
, 3) << "ll_write " << fh
<< " " << off
<< "~" << len
<< " = " << r
13547 int64_t Client::ll_writev(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
13549 std::lock_guard
lock(client_lock
);
13552 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, true, false);
13555 int64_t Client::ll_readv(struct Fh
*fh
, const struct iovec
*iov
, int iovcnt
, int64_t off
)
13557 std::lock_guard
lock(client_lock
);
13560 return _preadv_pwritev_locked(fh
, iov
, iovcnt
, off
, false, false);
13563 int Client::ll_flush(Fh
*fh
)
13565 std::lock_guard
lock(client_lock
);
13566 ldout(cct
, 3) << "ll_flush " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
13567 tout(cct
) << "ll_flush" << std::endl
;
13568 tout(cct
) << (unsigned long)fh
<< std::endl
;
13576 int Client::ll_fsync(Fh
*fh
, bool syncdataonly
)
13578 std::lock_guard
lock(client_lock
);
13579 ldout(cct
, 3) << "ll_fsync " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
13580 tout(cct
) << "ll_fsync" << std::endl
;
13581 tout(cct
) << (unsigned long)fh
<< std::endl
;
13586 int r
= _fsync(fh
, syncdataonly
);
13588 // If we're returning an error, clear it from the FH
13589 fh
->take_async_err();
13594 int Client::ll_sync_inode(Inode
*in
, bool syncdataonly
)
13596 std::lock_guard
lock(client_lock
);
13597 ldout(cct
, 3) << "ll_sync_inode " << *in
<< " " << dendl
;
13598 tout(cct
) << "ll_sync_inode" << std::endl
;
13599 tout(cct
) << (unsigned long)in
<< std::endl
;
13604 return _fsync(in
, syncdataonly
);
13607 #ifdef FALLOC_FL_PUNCH_HOLE
13609 int Client::_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
13611 if (offset
< 0 || length
<= 0)
13614 if (mode
& ~(FALLOC_FL_KEEP_SIZE
| FALLOC_FL_PUNCH_HOLE
))
13615 return -EOPNOTSUPP
;
13617 if ((mode
& FALLOC_FL_PUNCH_HOLE
) && !(mode
& FALLOC_FL_KEEP_SIZE
))
13618 return -EOPNOTSUPP
;
13620 Inode
*in
= fh
->inode
.get();
13622 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
) &&
13623 !(mode
& FALLOC_FL_PUNCH_HOLE
)) {
13627 if (in
->snapid
!= CEPH_NOSNAP
)
13630 if ((fh
->mode
& CEPH_FILE_MODE_WR
) == 0)
13633 uint64_t size
= offset
+ length
;
13634 if (!(mode
& (FALLOC_FL_PUNCH_HOLE
| FALLOC_FL_KEEP_SIZE
)) &&
13636 is_quota_bytes_exceeded(in
, size
- in
->size
, fh
->actor_perms
)) {
13641 int r
= get_caps(fh
, CEPH_CAP_FILE_WR
, CEPH_CAP_FILE_BUFFER
, &have
, -1);
13645 std::unique_ptr
<C_SaferCond
> onuninline
= nullptr;
13646 if (mode
& FALLOC_FL_PUNCH_HOLE
) {
13647 if (in
->inline_version
< CEPH_INLINE_NONE
&&
13648 (have
& CEPH_CAP_FILE_BUFFER
)) {
13650 auto inline_iter
= in
->inline_data
.cbegin();
13651 int len
= in
->inline_data
.length();
13652 if (offset
< len
) {
13654 inline_iter
.copy(offset
, bl
);
13656 if (offset
+ size
> len
)
13657 size
= len
- offset
;
13659 bl
.append_zero(size
);
13660 if (offset
+ size
< len
) {
13661 inline_iter
+= size
;
13662 inline_iter
.copy(len
- offset
- size
, bl
);
13664 in
->inline_data
= bl
;
13665 in
->inline_version
++;
13667 in
->mtime
= in
->ctime
= ceph_clock_now();
13669 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13671 if (in
->inline_version
< CEPH_INLINE_NONE
) {
13672 onuninline
.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
13673 uninline_data(in
, onuninline
.get());
13676 C_SaferCond
onfinish("Client::_punch_hole flock");
13678 unsafe_sync_write
++;
13679 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
13681 _invalidate_inode_cache(in
, offset
, length
);
13682 filer
->zero(in
->ino
, &in
->layout
,
13683 in
->snaprealm
->get_snap_context(),
13685 ceph::real_clock::now(),
13686 0, true, &onfinish
);
13687 in
->mtime
= in
->ctime
= ceph_clock_now();
13689 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13691 client_lock
.unlock();
13693 client_lock
.lock();
13694 _sync_write_commit(in
);
13696 } else if (!(mode
& FALLOC_FL_KEEP_SIZE
)) {
13697 uint64_t size
= offset
+ length
;
13698 if (size
> in
->size
) {
13700 in
->mtime
= in
->ctime
= ceph_clock_now();
13702 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13704 if (is_quota_bytes_approaching(in
, fh
->actor_perms
)) {
13705 check_caps(in
, CHECK_CAPS_NODELAY
);
13706 } else if (is_max_size_approaching(in
)) {
13712 if (nullptr != onuninline
) {
13713 client_lock
.unlock();
13714 int ret
= onuninline
->wait();
13715 client_lock
.lock();
13717 if (ret
>= 0 || ret
== -ECANCELED
) {
13718 in
->inline_data
.clear();
13719 in
->inline_version
= CEPH_INLINE_NONE
;
13720 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13726 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
13731 int Client::_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
13733 return -EOPNOTSUPP
;
13739 int Client::ll_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
13741 std::lock_guard
lock(client_lock
);
13742 ldout(cct
, 3) << __func__
<< " " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
13743 tout(cct
) << __func__
<< " " << mode
<< " " << offset
<< " " << length
<< std::endl
;
13744 tout(cct
) << (unsigned long)fh
<< std::endl
;
13749 return _fallocate(fh
, mode
, offset
, length
);
13752 int Client::fallocate(int fd
, int mode
, loff_t offset
, loff_t length
)
13754 std::lock_guard
lock(client_lock
);
13755 tout(cct
) << __func__
<< " " << " " << fd
<< mode
<< " " << offset
<< " " << length
<< std::endl
;
13760 Fh
*fh
= get_filehandle(fd
);
13763 #if defined(__linux__) && defined(O_PATH)
13764 if (fh
->flags
& O_PATH
)
13767 return _fallocate(fh
, mode
, offset
, length
);
13770 int Client::ll_release(Fh
*fh
)
13772 std::lock_guard
lock(client_lock
);
13777 ldout(cct
, 3) << __func__
<< " (fh)" << fh
<< " " << fh
->inode
->ino
<< " " <<
13779 tout(cct
) << __func__
<< " (fh)" << std::endl
;
13780 tout(cct
) << (unsigned long)fh
<< std::endl
;
13782 if (ll_unclosed_fh_set
.count(fh
))
13783 ll_unclosed_fh_set
.erase(fh
);
13784 return _release_fh(fh
);
13787 int Client::ll_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
13789 std::lock_guard
lock(client_lock
);
13791 ldout(cct
, 3) << "ll_getlk (fh)" << fh
<< " " << fh
->inode
->ino
<< dendl
;
13792 tout(cct
) << "ll_getk (fh)" << (unsigned long)fh
<< std::endl
;
13797 return _getlk(fh
, fl
, owner
);
13800 int Client::ll_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
13802 std::lock_guard
lock(client_lock
);
13804 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
13805 tout(cct
) << __func__
<< " (fh)" << (unsigned long)fh
<< std::endl
;
13810 return _setlk(fh
, fl
, owner
, sleep
);
13813 int Client::ll_flock(Fh
*fh
, int cmd
, uint64_t owner
)
13815 std::lock_guard
lock(client_lock
);
13817 ldout(cct
, 3) << __func__
<< " (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
13818 tout(cct
) << __func__
<< " (fh)" << (unsigned long)fh
<< std::endl
;
13823 return _flock(fh
, cmd
, owner
);
13826 int Client::set_deleg_timeout(uint32_t timeout
)
13828 std::lock_guard
lock(client_lock
);
13831 * The whole point is to prevent blacklisting so we must time out the
13832 * delegation before the session autoclose timeout kicks in.
13834 if (timeout
>= mdsmap
->get_session_autoclose())
13837 deleg_timeout
= timeout
;
13841 int Client::ll_delegation(Fh
*fh
, unsigned cmd
, ceph_deleg_cb_t cb
, void *priv
)
13845 std::lock_guard
lock(client_lock
);
13850 Inode
*inode
= fh
->inode
.get();
13853 case CEPH_DELEGATION_NONE
:
13854 inode
->unset_deleg(fh
);
13859 ret
= inode
->set_deleg(fh
, cmd
, cb
, priv
);
13860 } catch (std::bad_alloc
&) {
13868 class C_Client_RequestInterrupt
: public Context
{
13873 C_Client_RequestInterrupt(Client
*c
, MetaRequest
*r
) : client(c
), req(r
) {
13876 void finish(int r
) override
{
13877 std::lock_guard
l(client
->client_lock
);
13878 ceph_assert(req
->head
.op
== CEPH_MDS_OP_SETFILELOCK
);
13879 client
->_interrupt_filelock(req
);
13880 client
->put_request(req
);
13884 void Client::ll_interrupt(void *d
)
13886 MetaRequest
*req
= static_cast<MetaRequest
*>(d
);
13887 ldout(cct
, 3) << __func__
<< " tid " << req
->get_tid() << dendl
;
13888 tout(cct
) << __func__
<< " tid " << req
->get_tid() << std::endl
;
13889 interrupt_finisher
.queue(new C_Client_RequestInterrupt(this, req
));
13892 // =========================================
13895 // expose file layouts
13897 int Client::describe_layout(const char *relpath
, file_layout_t
*lp
,
13898 const UserPerm
& perms
)
13900 std::lock_guard
lock(client_lock
);
13905 filepath
path(relpath
);
13907 int r
= path_walk(path
, &in
, perms
);
13913 ldout(cct
, 3) << __func__
<< "(" << relpath
<< ") = 0" << dendl
;
13917 int Client::fdescribe_layout(int fd
, file_layout_t
*lp
)
13919 std::lock_guard
lock(client_lock
);
13924 Fh
*f
= get_filehandle(fd
);
13927 Inode
*in
= f
->inode
.get();
13931 ldout(cct
, 3) << __func__
<< "(" << fd
<< ") = 0" << dendl
;
13935 int64_t Client::get_default_pool_id()
13937 std::lock_guard
lock(client_lock
);
13942 /* first data pool is the default */
13943 return mdsmap
->get_first_data_pool();
13948 int64_t Client::get_pool_id(const char *pool_name
)
13950 std::lock_guard
lock(client_lock
);
13955 return objecter
->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name
),
13959 string
Client::get_pool_name(int64_t pool
)
13961 std::lock_guard
lock(client_lock
);
13966 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
13967 return o
.have_pg_pool(pool
) ? o
.get_pool_name(pool
) : string();
13971 int Client::get_pool_replication(int64_t pool
)
13973 std::lock_guard
lock(client_lock
);
13978 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
13979 return o
.have_pg_pool(pool
) ? o
.get_pg_pool(pool
)->get_size() : -ENOENT
;
13983 int Client::get_file_extent_osds(int fd
, loff_t off
, loff_t
*len
, vector
<int>& osds
)
13985 std::lock_guard
lock(client_lock
);
13990 Fh
*f
= get_filehandle(fd
);
13993 Inode
*in
= f
->inode
.get();
13995 vector
<ObjectExtent
> extents
;
13996 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, 1, in
->truncate_size
, extents
);
13997 ceph_assert(extents
.size() == 1);
13999 objecter
->with_osdmap([&](const OSDMap
& o
) {
14000 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
14001 o
.pg_to_acting_osds(pg
, osds
);
14008 * Return the remainder of the extent (stripe unit)
14010 * If length = 1 is passed to Striper::file_to_extents we get a single
14011 * extent back, but its length is one so we still need to compute the length
14012 * to the end of the stripe unit.
14014 * If length = su then we may get 1 or 2 objects back in the extents vector
14015 * which would have to be examined. Even then, the offsets are local to the
14016 * object, so matching up to the file offset is extra work.
14018 * It seems simpler to stick with length = 1 and manually compute the
14022 uint64_t su
= in
->layout
.stripe_unit
;
14023 *len
= su
- (off
% su
);
14029 int Client::get_osd_crush_location(int id
, vector
<pair
<string
, string
> >& path
)
14031 std::lock_guard
lock(client_lock
);
14038 return objecter
->with_osdmap([&](const OSDMap
& o
) {
14039 return o
.crush
->get_full_location_ordered(id
, path
);
14043 int Client::get_file_stripe_address(int fd
, loff_t offset
,
14044 vector
<entity_addr_t
>& address
)
14046 std::lock_guard
lock(client_lock
);
14051 Fh
*f
= get_filehandle(fd
);
14054 Inode
*in
= f
->inode
.get();
14057 vector
<ObjectExtent
> extents
;
14058 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, 1,
14059 in
->truncate_size
, extents
);
14060 ceph_assert(extents
.size() == 1);
14062 // now we have the object and its 'layout'
14063 return objecter
->with_osdmap([&](const OSDMap
& o
) {
14064 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
14066 o
.pg_to_acting_osds(pg
, osds
);
14069 for (unsigned i
= 0; i
< osds
.size(); i
++) {
14070 entity_addr_t addr
= o
.get_addrs(osds
[i
]).front();
14071 address
.push_back(addr
);
14077 int Client::get_osd_addr(int osd
, entity_addr_t
& addr
)
14079 std::lock_guard
lock(client_lock
);
14084 return objecter
->with_osdmap([&](const OSDMap
& o
) {
14085 if (!o
.exists(osd
))
14088 addr
= o
.get_addrs(osd
).front();
14093 int Client::enumerate_layout(int fd
, vector
<ObjectExtent
>& result
,
14094 loff_t length
, loff_t offset
)
14096 std::lock_guard
lock(client_lock
);
14101 Fh
*f
= get_filehandle(fd
);
14104 Inode
*in
= f
->inode
.get();
14106 // map to a list of extents
14107 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, length
, in
->truncate_size
, result
);
14109 ldout(cct
, 3) << __func__
<< "(" << fd
<< ", " << length
<< ", " << offset
<< ") = 0" << dendl
;
14114 /* find an osd with the same ip. -ENXIO if none. */
14115 int Client::get_local_osd()
14117 std::lock_guard
lock(client_lock
);
14122 objecter
->with_osdmap([this](const OSDMap
& o
) {
14123 if (o
.get_epoch() != local_osd_epoch
) {
14124 local_osd
= o
.find_osd_on_ip(messenger
->get_myaddrs().front());
14125 local_osd_epoch
= o
.get_epoch();
14136 // ===============================
14138 void Client::ms_handle_connect(Connection
*con
)
14140 ldout(cct
, 10) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
14143 bool Client::ms_handle_reset(Connection
*con
)
14145 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
14149 void Client::ms_handle_remote_reset(Connection
*con
)
14151 ldout(cct
, 0) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
14152 std::lock_guard
l(client_lock
);
14153 switch (con
->get_peer_type()) {
14154 case CEPH_ENTITY_TYPE_MDS
:
14156 // kludge to figure out which mds this is; fixme with a Connection* state
14157 mds_rank_t mds
= MDS_RANK_NONE
;
14158 MetaSession
*s
= NULL
;
14159 for (auto &p
: mds_sessions
) {
14160 if (mdsmap
->get_addrs(p
.first
) == con
->get_peer_addrs()) {
14166 assert (s
!= NULL
);
14167 switch (s
->state
) {
14168 case MetaSession::STATE_CLOSING
:
14169 ldout(cct
, 1) << "reset from mds we were closing; we'll call that closed" << dendl
;
14170 _closed_mds_session(s
);
14173 case MetaSession::STATE_OPENING
:
14175 ldout(cct
, 1) << "reset from mds we were opening; retrying" << dendl
;
14176 list
<Context
*> waiters
;
14177 waiters
.swap(s
->waiting_for_open
);
14178 _closed_mds_session(s
);
14179 MetaSession
*news
= _get_or_open_mds_session(mds
);
14180 news
->waiting_for_open
.swap(waiters
);
14184 case MetaSession::STATE_OPEN
:
14186 objecter
->maybe_request_map(); /* to check if we are blacklisted */
14187 if (cct
->_conf
.get_val
<bool>("client_reconnect_stale")) {
14188 ldout(cct
, 1) << "reset from mds we were open; close mds session for reconnect" << dendl
;
14189 _closed_mds_session(s
);
14191 ldout(cct
, 1) << "reset from mds we were open; mark session as stale" << dendl
;
14192 s
->state
= MetaSession::STATE_STALE
;
14197 case MetaSession::STATE_NEW
:
14198 case MetaSession::STATE_CLOSED
:
14208 bool Client::ms_handle_refused(Connection
*con
)
14210 ldout(cct
, 1) << __func__
<< " on " << con
->get_peer_addr() << dendl
;
14214 Inode
*Client::get_quota_root(Inode
*in
, const UserPerm
& perms
)
14216 Inode
*quota_in
= root_ancestor
;
14217 SnapRealm
*realm
= in
->snaprealm
;
14219 ldout(cct
, 10) << __func__
<< " realm " << realm
->ino
<< dendl
;
14220 if (realm
->ino
!= in
->ino
) {
14221 auto p
= inode_map
.find(vinodeno_t(realm
->ino
, CEPH_NOSNAP
));
14222 if (p
== inode_map
.end())
14225 if (p
->second
->quota
.is_enable()) {
14226 quota_in
= p
->second
;
14230 realm
= realm
->pparent
;
14232 ldout(cct
, 10) << __func__
<< " " << in
->vino() << " -> " << quota_in
->vino() << dendl
;
14237 * Traverse quota ancestors of the Inode, return true
14238 * if any of them passes the passed function
14240 bool Client::check_quota_condition(Inode
*in
, const UserPerm
& perms
,
14241 std::function
<bool (const Inode
&in
)> test
)
14244 ceph_assert(in
!= NULL
);
14249 if (in
== root_ancestor
) {
14250 // We're done traversing, drop out
14253 // Continue up the tree
14254 in
= get_quota_root(in
, perms
);
14261 bool Client::is_quota_files_exceeded(Inode
*in
, const UserPerm
& perms
)
14263 return check_quota_condition(in
, perms
,
14264 [](const Inode
&in
) {
14265 return in
.quota
.max_files
&& in
.rstat
.rsize() >= in
.quota
.max_files
;
14269 bool Client::is_quota_bytes_exceeded(Inode
*in
, int64_t new_bytes
,
14270 const UserPerm
& perms
)
14272 return check_quota_condition(in
, perms
,
14273 [&new_bytes
](const Inode
&in
) {
14274 return in
.quota
.max_bytes
&& (in
.rstat
.rbytes
+ new_bytes
)
14275 > in
.quota
.max_bytes
;
14279 bool Client::is_quota_bytes_approaching(Inode
*in
, const UserPerm
& perms
)
14281 ceph_assert(in
->size
>= in
->reported_size
);
14282 const uint64_t size
= in
->size
- in
->reported_size
;
14283 return check_quota_condition(in
, perms
,
14284 [&size
](const Inode
&in
) {
14285 if (in
.quota
.max_bytes
) {
14286 if (in
.rstat
.rbytes
>= in
.quota
.max_bytes
) {
14290 const uint64_t space
= in
.quota
.max_bytes
- in
.rstat
.rbytes
;
14291 return (space
>> 4) < size
;
14305 int Client::check_pool_perm(Inode
*in
, int need
)
14307 if (!cct
->_conf
->client_check_pool_perm
)
14310 int64_t pool_id
= in
->layout
.pool_id
;
14311 std::string pool_ns
= in
->layout
.pool_ns
;
14312 std::pair
<int64_t, std::string
> perm_key(pool_id
, pool_ns
);
14315 auto it
= pool_perms
.find(perm_key
);
14316 if (it
== pool_perms
.end())
14318 if (it
->second
== POOL_CHECKING
) {
14319 // avoid concurrent checkings
14320 wait_on_list(waiting_for_pool_perm
);
14323 ceph_assert(have
& POOL_CHECKED
);
14329 if (in
->snapid
!= CEPH_NOSNAP
) {
14330 // pool permission check needs to write to the first object. But for snapshot,
14331 // head of the first object may have alread been deleted. To avoid creating
14332 // orphan object, skip the check for now.
14336 pool_perms
[perm_key
] = POOL_CHECKING
;
14339 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (unsigned long long)in
->ino
);
14340 object_t oid
= oid_buf
;
14342 SnapContext nullsnapc
;
14344 C_SaferCond rd_cond
;
14345 ObjectOperation rd_op
;
14346 rd_op
.stat(NULL
, (ceph::real_time
*)nullptr, NULL
);
14348 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), rd_op
,
14349 nullsnapc
, ceph::real_clock::now(), 0, &rd_cond
);
14351 C_SaferCond wr_cond
;
14352 ObjectOperation wr_op
;
14353 wr_op
.create(true);
14355 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), wr_op
,
14356 nullsnapc
, ceph::real_clock::now(), 0, &wr_cond
);
14358 client_lock
.unlock();
14359 int rd_ret
= rd_cond
.wait();
14360 int wr_ret
= wr_cond
.wait();
14361 client_lock
.lock();
14363 bool errored
= false;
14365 if (rd_ret
== 0 || rd_ret
== -ENOENT
)
14367 else if (rd_ret
!= -EPERM
) {
14368 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14369 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
14373 if (wr_ret
== 0 || wr_ret
== -EEXIST
)
14374 have
|= POOL_WRITE
;
14375 else if (wr_ret
!= -EPERM
) {
14376 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14377 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
14382 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
14383 // Raise EIO because actual error code might be misleading for
14384 // userspace filesystem user.
14385 pool_perms
.erase(perm_key
);
14386 signal_cond_list(waiting_for_pool_perm
);
14390 pool_perms
[perm_key
] = have
| POOL_CHECKED
;
14391 signal_cond_list(waiting_for_pool_perm
);
14394 if ((need
& CEPH_CAP_FILE_RD
) && !(have
& POOL_READ
)) {
14395 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14396 << " need " << ccap_string(need
) << ", but no read perm" << dendl
;
14399 if ((need
& CEPH_CAP_FILE_WR
) && !(have
& POOL_WRITE
)) {
14400 ldout(cct
, 10) << __func__
<< " on pool " << pool_id
<< " ns " << pool_ns
14401 << " need " << ccap_string(need
) << ", but no write perm" << dendl
;
14408 int Client::_posix_acl_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
14410 if (acl_type
== POSIX_ACL
) {
14411 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
14412 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
14414 return posix_acl_permits(access_acl
, in
->uid
, in
->gid
, perms
, want
);
14420 int Client::_posix_acl_chmod(Inode
*in
, mode_t mode
, const UserPerm
& perms
)
14422 if (acl_type
== NO_ACL
)
14425 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
14429 if (acl_type
== POSIX_ACL
) {
14430 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
14431 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
14432 bufferptr
acl(access_acl
.c_str(), access_acl
.length());
14433 r
= posix_acl_access_chmod(acl
, mode
);
14436 r
= _do_setxattr(in
, ACL_EA_ACCESS
, acl
.c_str(), acl
.length(), 0, perms
);
14442 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< " result=" << r
<< dendl
;
14446 int Client::_posix_acl_create(Inode
*dir
, mode_t
*mode
, bufferlist
& xattrs_bl
,
14447 const UserPerm
& perms
)
14449 if (acl_type
== NO_ACL
)
14452 if (S_ISLNK(*mode
))
14455 int r
= _getattr(dir
, CEPH_STAT_CAP_XATTR
, perms
, dir
->xattr_version
== 0);
14459 if (acl_type
== POSIX_ACL
) {
14460 if (dir
->xattrs
.count(ACL_EA_DEFAULT
)) {
14461 map
<string
, bufferptr
> xattrs
;
14463 const bufferptr
& default_acl
= dir
->xattrs
[ACL_EA_DEFAULT
];
14464 bufferptr
acl(default_acl
.c_str(), default_acl
.length());
14465 r
= posix_acl_inherit_mode(acl
, mode
);
14470 r
= posix_acl_equiv_mode(acl
.c_str(), acl
.length(), mode
);
14474 xattrs
[ACL_EA_ACCESS
] = acl
;
14477 if (S_ISDIR(*mode
))
14478 xattrs
[ACL_EA_DEFAULT
] = dir
->xattrs
[ACL_EA_DEFAULT
];
14482 encode(xattrs
, xattrs_bl
);
14485 *mode
&= ~umask_cb(callback_handle
);
14490 ldout(cct
, 10) << __func__
<< " dir ino " << dir
->ino
<< " result=" << r
<< dendl
;
14494 void Client::set_filer_flags(int flags
)
14496 std::lock_guard
l(client_lock
);
14497 ceph_assert(flags
== 0 ||
14498 flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
14499 objecter
->add_global_op_flags(flags
);
14502 void Client::clear_filer_flags(int flags
)
14504 std::lock_guard
l(client_lock
);
14505 ceph_assert(flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
14506 objecter
->clear_global_op_flag(flags
);
14509 // called before mount
14510 void Client::set_uuid(const std::string
& uuid
)
14512 std::lock_guard
l(client_lock
);
14513 assert(initialized
);
14514 assert(!uuid
.empty());
14516 metadata
["uuid"] = uuid
;
14520 // called before mount. 0 means infinite
14521 void Client::set_session_timeout(unsigned timeout
)
14523 std::lock_guard
l(client_lock
);
14524 assert(initialized
);
14526 metadata
["timeout"] = stringify(timeout
);
14529 // called before mount
14530 int Client::start_reclaim(const std::string
& uuid
, unsigned flags
,
14531 const std::string
& fs_name
)
14533 std::lock_guard
l(client_lock
);
14541 auto it
= metadata
.find("uuid");
14542 if (it
!= metadata
.end() && it
->second
== uuid
)
14546 int r
= subscribe_mdsmap(fs_name
);
14548 lderr(cct
) << "mdsmap subscription failed: " << cpp_strerror(r
) << dendl
;
14552 if (metadata
.empty())
14553 populate_metadata("");
14555 while (mdsmap
->get_epoch() == 0)
14556 wait_on_list(waiting_for_mdsmap
);
14559 for (unsigned mds
= 0; mds
< mdsmap
->get_num_in_mds(); ) {
14560 if (!mdsmap
->is_up(mds
)) {
14561 ldout(cct
, 10) << "mds." << mds
<< " not active, waiting for new mdsmap" << dendl
;
14562 wait_on_list(waiting_for_mdsmap
);
14566 MetaSession
*session
;
14567 if (!have_open_session(mds
)) {
14568 session
= _get_or_open_mds_session(mds
);
14569 if (session
->state
== MetaSession::STATE_REJECTED
)
14571 if (session
->state
!= MetaSession::STATE_OPENING
) {
14575 ldout(cct
, 10) << "waiting for session to mds." << mds
<< " to open" << dendl
;
14576 wait_on_context_list(session
->waiting_for_open
);
14580 session
= &mds_sessions
.at(mds
);
14581 if (!session
->mds_features
.test(CEPHFS_FEATURE_RECLAIM_CLIENT
))
14582 return -EOPNOTSUPP
;
14584 if (session
->reclaim_state
== MetaSession::RECLAIM_NULL
||
14585 session
->reclaim_state
== MetaSession::RECLAIMING
) {
14586 session
->reclaim_state
= MetaSession::RECLAIMING
;
14587 auto m
= make_message
<MClientReclaim
>(uuid
, flags
);
14588 session
->con
->send_message2(std::move(m
));
14589 wait_on_list(waiting_for_reclaim
);
14590 } else if (session
->reclaim_state
== MetaSession::RECLAIM_FAIL
) {
14591 return reclaim_errno
? : -ENOTRECOVERABLE
;
14597 // didn't find target session in any mds
14598 if (reclaim_target_addrs
.empty()) {
14599 if (flags
& CEPH_RECLAIM_RESET
)
14601 return -ENOTRECOVERABLE
;
14604 if (flags
& CEPH_RECLAIM_RESET
)
14607 // use blacklist to check if target session was killed
14608 // (config option mds_session_blacklist_on_evict needs to be true)
14610 if (!objecter
->wait_for_map(reclaim_osd_epoch
, &cond
)) {
14611 ldout(cct
, 10) << __func__
<< ": waiting for OSD epoch " << reclaim_osd_epoch
<< dendl
;
14612 client_lock
.unlock();
14614 client_lock
.lock();
14617 bool blacklisted
= objecter
->with_osdmap(
14618 [this](const OSDMap
&osd_map
) -> bool {
14619 return osd_map
.is_blacklisted(reclaim_target_addrs
);
14622 return -ENOTRECOVERABLE
;
14624 metadata
["reclaiming_uuid"] = uuid
;
14628 void Client::finish_reclaim()
14630 auto it
= metadata
.find("reclaiming_uuid");
14631 if (it
== metadata
.end()) {
14632 for (auto &p
: mds_sessions
)
14633 p
.second
.reclaim_state
= MetaSession::RECLAIM_NULL
;
14637 for (auto &p
: mds_sessions
) {
14638 p
.second
.reclaim_state
= MetaSession::RECLAIM_NULL
;
14639 auto m
= make_message
<MClientReclaim
>("", MClientReclaim::FLAG_FINISH
);
14640 p
.second
.con
->send_message2(std::move(m
));
14643 metadata
["uuid"] = it
->second
;
14644 metadata
.erase(it
);
14647 void Client::handle_client_reclaim_reply(const MConstRef
<MClientReclaimReply
>& reply
)
14649 mds_rank_t from
= mds_rank_t(reply
->get_source().num());
14650 ldout(cct
, 10) << __func__
<< " " << *reply
<< " from mds." << from
<< dendl
;
14652 MetaSession
*session
= _get_mds_session(from
, reply
->get_connection().get());
14654 ldout(cct
, 10) << " discarding reclaim reply from sessionless mds." << from
<< dendl
;
14658 if (reply
->get_result() >= 0) {
14659 session
->reclaim_state
= MetaSession::RECLAIM_OK
;
14660 if (reply
->get_epoch() > reclaim_osd_epoch
)
14661 reclaim_osd_epoch
= reply
->get_epoch();
14662 if (!reply
->get_addrs().empty())
14663 reclaim_target_addrs
= reply
->get_addrs();
14665 session
->reclaim_state
= MetaSession::RECLAIM_FAIL
;
14666 reclaim_errno
= reply
->get_result();
14669 signal_cond_list(waiting_for_reclaim
);
14673 * This is included in cap release messages, to cause
14674 * the MDS to wait until this OSD map epoch. It is necessary
14675 * in corner cases where we cancel RADOS ops, so that
14676 * nobody else tries to do IO to the same objects in
14677 * the same epoch as the cancelled ops.
14679 void Client::set_cap_epoch_barrier(epoch_t e
)
14681 ldout(cct
, 5) << __func__
<< " epoch = " << e
<< dendl
;
14682 cap_epoch_barrier
= e
;
14685 const char** Client::get_tracked_conf_keys() const
14687 static const char* keys
[] = {
14688 "client_cache_size",
14689 "client_cache_mid",
14691 "client_deleg_timeout",
14692 "client_deleg_break_on_open",
14698 void Client::handle_conf_change(const ConfigProxy
& conf
,
14699 const std::set
<std::string
> &changed
)
14701 std::lock_guard
lock(client_lock
);
14703 if (changed
.count("client_cache_mid")) {
14704 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
14706 if (changed
.count("client_acl_type")) {
14708 if (cct
->_conf
->client_acl_type
== "posix_acl")
14709 acl_type
= POSIX_ACL
;
14713 void intrusive_ptr_add_ref(Inode
*in
)
14718 void intrusive_ptr_release(Inode
*in
)
14720 in
->client
->put_inode(in
);
14723 mds_rank_t
Client::_get_random_up_mds() const
14725 ceph_assert(ceph_mutex_is_locked_by_me(client_lock
));
14727 std::set
<mds_rank_t
> up
;
14728 mdsmap
->get_up_mds_set(up
);
14731 return MDS_RANK_NONE
;
14732 std::set
<mds_rank_t
>::const_iterator p
= up
.begin();
14733 for (int n
= rand() % up
.size(); n
; n
--)
14739 StandaloneClient::StandaloneClient(Messenger
*m
, MonClient
*mc
)
14740 : Client(m
, mc
, new Objecter(m
->cct
, m
, mc
, NULL
, 0, 0))
14742 monclient
->set_messenger(m
);
14743 objecter
->set_client_incarnation(0);
14746 StandaloneClient::~StandaloneClient()
14749 objecter
= nullptr;
14752 int StandaloneClient::init()
14757 client_lock
.lock();
14758 ceph_assert(!is_initialized());
14760 messenger
->add_dispatcher_tail(objecter
);
14761 messenger
->add_dispatcher_tail(this);
14763 monclient
->set_want_keys(CEPH_ENTITY_TYPE_MDS
| CEPH_ENTITY_TYPE_OSD
);
14764 int r
= monclient
->init();
14766 // need to do cleanup because we're in an intermediate init state
14768 client_lock
.unlock();
14769 objecter
->shutdown();
14770 objectcacher
->stop();
14771 monclient
->shutdown();
14776 client_lock
.unlock();
14782 void StandaloneClient::shutdown()
14784 Client::shutdown();
14785 objecter
->shutdown();
14786 monclient
->shutdown();