// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/utsname.h>

#include <boost/lexical_cast.hpp>
#include <boost/fusion/include/std_pair.hpp>

#if defined(__FreeBSD__)
#define XATTR_CREATE  0x1
#define XATTR_REPLACE 0x2
#else
#include <sys/xattr.h>
#endif

#if defined(__linux__)
#include <linux/falloc.h>
#endif

#include <sys/statvfs.h>
44 #include "common/config.h"
45 #include "common/version.h"
48 #include "messages/MClientSession.h"
49 #include "messages/MClientReconnect.h"
50 #include "messages/MClientRequest.h"
51 #include "messages/MClientRequestForward.h"
52 #include "messages/MClientReply.h"
53 #include "messages/MClientCaps.h"
54 #include "messages/MClientLease.h"
55 #include "messages/MClientSnap.h"
56 #include "messages/MCommandReply.h"
57 #include "messages/MOSDMap.h"
58 #include "messages/MClientQuota.h"
59 #include "messages/MClientCapRelease.h"
60 #include "messages/MMDSMap.h"
61 #include "messages/MFSMap.h"
62 #include "messages/MFSMapUser.h"
64 #include "mon/MonClient.h"
66 #include "mds/flock.h"
67 #include "osd/OSDMap.h"
68 #include "osdc/Filer.h"
70 #include "common/Cond.h"
71 #include "common/Mutex.h"
72 #include "common/perf_counters.h"
73 #include "common/admin_socket.h"
74 #include "common/errno.h"
75 #include "include/str_list.h"
77 #define dout_subsys ceph_subsys_client
79 #include "include/lru.h"
80 #include "include/compat.h"
81 #include "include/stringify.h"
86 #include "Delegation.h"
88 #include "ClientSnapRealm.h"
90 #include "MetaSession.h"
91 #include "MetaRequest.h"
92 #include "ObjecterWriteback.h"
93 #include "posix_acl.h"
95 #include "include/assert.h"
96 #include "include/stat.h"
98 #include "include/cephfs/ceph_statx.h"
100 #if HAVE_GETGROUPLIST
#define dout_prefix *_dout << "client." << whoami << " "

#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout

// FreeBSD fails to define this
#ifndef O_DSYNC
#define O_DSYNC 0x0
#endif

// Darwin fails to define this
#ifndef O_RSYNC
#define O_RSYNC 0x0
#endif

#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
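
// client_flush_set_callback() below is the C-style trampoline handed to
// ObjectCacher when it is constructed (together with an opaque pointer back
// to the owning Client), so that flush completions can be routed into
// Client::flush_set_callback().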
void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
{
  Client *client = static_cast<Client*>(p);
  client->flush_set_callback(oset);
}
Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}

bool Client::CommandHook::call(std::string command, cmdmap_t& cmdmap,
                               std::string format, bufferlist& out)
{
  Formatter *f = Formatter::create(format);
  f->open_object_section("result");
  m_client->client_lock.Lock();
  if (command == "mds_requests")
    m_client->dump_mds_requests(f);
  else if (command == "mds_sessions")
    m_client->dump_mds_sessions(f);
  else if (command == "dump_cache")
    m_client->dump_cache(f);
  else if (command == "kick_stale_sessions")
    m_client->_kick_stale_sessions();
  else if (command == "status")
    m_client->dump_status(f);
  else
    assert(0 == "bad command registered");
  m_client->client_lock.Unlock();
  f->close_section();
  f->flush(out);
  delete f;
  return true;
}
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }
void Client::_reset_faked_inos()
{
  ino_t start = 1024;
  free_faked_inos.clear();
  free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
  last_used_faked_ino = 0;
  _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
}
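
// _assign_faked_ino() hands out faked inos by scanning free_faked_inos
// upward from the last value used, wrapping back to the bottom of the range
// once the top is exhausted; _release_faked_ino() returns a number to the
// pool when its inode is destroyed.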
void Client::_assign_faked_ino(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    last_used_faked_ino = 0;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    ++last_used_faked_ino;
    assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
void Client::_release_faked_ino(Inode *in)
{
  free_faked_inos.insert(in->faked_ino);
  faked_ino_map.erase(in->faked_ino);
}
vinodeno_t Client::_map_faked_ino(ino_t ino)
{
  vinodeno_t vino;
  if (ino == root->faked_ino)
    vino = root->vino();
  else if (faked_ino_map.count(ino))
    vino = faked_ino_map[ino];
  else
    vino = vinodeno_t(0, CEPH_NOSNAP);
  ldout(cct, 10) << "map_faked_ino " << ino << " -> " << vino << dendl;
  return vino;
}
vinodeno_t Client::map_faked_ino(ino_t ino)
{
  Mutex::Locker lock(client_lock);
  return _map_faked_ino(ino);
}
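
// Note the naming convention used above and throughout this file: the
// underscore-prefixed variant (_map_faked_ino) assumes the caller already
// holds client_lock, while the public wrapper takes the lock itself.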
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    m_command_hook(this),
    timer(m->cct, client_lock),
    callback_handle(NULL),
    switch_interrupt_cb(NULL),
    ino_invalidate_cb(NULL),
    dentry_invalidate_cb(NULL),
    can_invalidate_dentries(false),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    objecter_finisher(m->cct),
    messenger(m), monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()), cap_epoch_barrier(0),
    last_tid(0), oldest_tid(0), last_flush_tid(1),
    mounted(false), unmounting(false), blacklisted(false),
    local_osd(-ENXIO), local_osd_epoch(0),
    unsafe_sync_write(0),
    client_lock("Client::client_lock")
{
  num_flushing_caps = 0;

  _dir_vxattrs_name_size = _vxattrs_calcu_name_size(_dir_vxattrs);
  _file_vxattrs_name_size = _vxattrs_calcu_name_size(_file_vxattrs);

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
                                                &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
                                      client_flush_set_callback,    // all commit callback
                                      (void*)this,
                                      cct->_conf->client_oc_size,
                                      cct->_conf->client_oc_max_objects,
                                      cct->_conf->client_oc_max_dirty,
                                      cct->_conf->client_oc_target_dirty,
                                      cct->_conf->client_oc_max_dirty_age,
                                      true));
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  objecter->enable_blacklist_events();
}
Client::~Client()
{
  assert(!client_lock.is_locked());

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  client_lock.Lock();
  tear_down_cache();
  client_lock.Unlock();
}
void Client::tear_down_cache()
{
  // fd's
  for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
       it != fd_map.end();
       ++it) {
    Fh *fh = it->second;
    ldout(cct, 1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << "tear_down_cache forcing close of dir " << dirp
                  << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // empty lru
  assert(lru.lru_get_size() == 0);

  // close root ino
  assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }

  assert(inode_map.empty());
}
inodeno_t Client::get_root_ino()
{
  Mutex::Locker l(client_lock);
  if (use_faked_inos())
    return root->faked_ino;
  else
    return root->ino;
}
Inode *Client::get_root()
{
  Mutex::Locker l(client_lock);
  root->ll_get();
  return root;
}
void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
{
  filepath path;
  in->make_long_path(path);
  ldout(cct, 1) << "dump_inode: "
                << (disconnected ? "DISCONNECTED " : "")
                << "inode " << in->ino
                << " " << path
                << " ref " << in->get_num_ref()
                << *in << dendl;

  if (f) {
    f->open_object_section("inode");
    f->dump_stream("path") << path;
    if (disconnected)
      f->dump_int("disconnected", 1);
    in->dump(f);
    f->close_section();
  }

  did.insert(in);
  if (in->dir) {
    ldout(cct, 1) << "  dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
    for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
         it != in->dir->dentries.end();
         ++it) {
      ldout(cct, 1) << "   " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
      if (f) {
        f->open_object_section("dentry");
        it->second->dump(f);
        f->close_section();
      }
      if (it->second->inode)
        dump_inode(f, it->second->inode.get(), did, false);
    }
  }
}
void Client::dump_cache(Formatter *f)
{
  set<Inode*> did;

  ldout(cct, 1) << "dump_cache" << dendl;

  if (f)
    f->open_array_section("cache");

  if (root)
    dump_inode(f, root, did, true);

  // make a second pass to catch anything disconnected
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       ++it) {
    if (did.count(it->second))
      continue;
    dump_inode(f, it->second, did, true);
  }

  if (f)
    f->close_section();
}
void Client::dump_status(Formatter *f)
{
  assert(client_lock.is_locked_by_me());

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr());
    f->dump_object("inst", inst);
    f->dump_stream("inst_str") << inst;
    f->dump_stream("addr_str") << inst.addr;
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blacklisted", blacklisted);
  }
}
int Client::init()
{
  timer.init();
  objectcacher->start();

  client_lock.Lock();
  assert(!initialized);

  messenger->add_dispatcher_tail(this);
  client_lock.Unlock();

  _finish_init();
  return 0;
}
void Client::_finish_init()
{
  client_lock.Lock();
  // logger
  PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
  plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
  plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
  plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
  logger.reset(plb.create_perf_counters());
  cct->get_perfcounters_collection()->add(logger.get());

  client_lock.Unlock();

  cct->_conf->add_observer(this);

  AdminSocket* admin_socket = cct->get_admin_socket();
  int ret = admin_socket->register_command("mds_requests",
                                           "mds_requests",
                                           &m_command_hook,
                                           "show in-progress mds requests");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("mds_sessions",
                                       "mds_sessions",
                                       &m_command_hook,
                                       "show mds session state");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("dump_cache",
                                       "dump_cache",
                                       &m_command_hook,
                                       "show in-memory metadata cache contents");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("kick_stale_sessions",
                                       "kick_stale_sessions",
                                       &m_command_hook,
                                       "kick sessions that were remote reset");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }
  ret = admin_socket->register_command("status",
                                       "status",
                                       &m_command_hook,
                                       "show overall client status");
  if (ret < 0) {
    lderr(cct) << "error registering admin socket command: "
               << cpp_strerror(-ret) << dendl;
  }

  client_lock.Lock();
  initialized = true;
  client_lock.Unlock();
}
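
// The commands registered above are served by CommandHook::call() and can be
// driven through the admin socket, e.g. (socket path depends on local
// configuration):
//
//   ceph daemon /var/run/ceph/ceph-client.<id>.asok mds_sessions
//   ceph daemon /var/run/ceph/ceph-client.<id>.asok status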
void Client::shutdown()
{
  ldout(cct, 1) << "shutdown" << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  client_lock.Lock();
  _close_sessions();
  client_lock.Unlock();

  cct->_conf->remove_observer(this);

  AdminSocket* admin_socket = cct->get_admin_socket();
  admin_socket->unregister_command("mds_requests");
  admin_socket->unregister_command("mds_sessions");
  admin_socket->unregister_command("dump_cache");
  admin_socket->unregister_command("kick_stale_sessions");
  admin_socket->unregister_command("status");

  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.

  client_lock.Lock();
  assert(initialized);
  initialized = false;
  timer.shutdown();
  client_lock.Unlock();

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
// ===================
// metadata cache stuff

void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    if (!unmounting && lru.lru_get_size() <= max)  break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}
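
// trim_cache_for_reconnect() below drops dentries that the reconnecting MDS
// holds no caps for (neither on the dentry's inode nor on its parent dir);
// anything skipped is re-inserted into the LRU at the midpoint.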
void Client::trim_cache_for_reconnect(MetaSession *s)
{
  mds_rank_t mds = s->mds_num;
  ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds << dendl;

  int trimmed = 0;
  list<Dentry*> skipped;
  while (lru.lru_get_size() > 0) {
    Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
    if (!dn)
      break;

    if ((dn->inode && dn->inode->caps.count(mds)) ||
        dn->dir->parent_inode->caps.count(mds)) {
      trim_dentry(dn);
      trimmed++;
    } else
      skipped.push_back(dn);
  }

  for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
    lru.lru_insert_mid(*p);

  ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds
                 << " trimmed " << trimmed << " dentries" << dendl;

  if (s->caps.size() > 0)
    _invalidate_kernel_dcache();
}
void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
                 << " in dir " << hex << dn->dir->parent_inode->ino
                 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    diri->dir_release_count++;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false);  // drop dir, drop dentry
}
void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
                                    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
                     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
        _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
        in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
                     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
                                    utime_t ctime, utime_t mtime, utime_t atime)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
                 << " ctime " << ctime << " mtime " << mtime << dendl;

  if (time_warp_seq > in->time_warp_seq)
    ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
                   << " is higher than local time_warp_seq "
                   << in->time_warp_seq << dendl;

  int warn = false;
  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
                CEPH_CAP_FILE_WR|
                CEPH_CAP_FILE_BUFFER|
                CEPH_CAP_AUTH_EXCL|
                CEPH_CAP_XATTR_EXCL)) {
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      //the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      //take max times
      if (mtime > in->mtime)
        in->mtime = mtime;
      if (atime > in->atime)
        in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      //ignore mds values as we have a higher seq
      if (time_warp_seq != in->time_warp_seq)
        warn = true;
    } else
      warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else
      warn = true;
  }
  if (warn) {
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
                  << time_warp_seq << " is lower than local time_warp_seq "
                  << in->time_warp_seq
                  << dendl;
  }
}
void Client::_fragmap_remove_non_leaves(Inode *in)
{
  for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (!in->dirfragtree.is_leaf(p->first))
      in->fragmap.erase(p++);
    else
      ++p;
}
void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
{
  for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
    if (p->second == mds)
      in->fragmap.erase(p++);
    else
      ++p;
}
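
// add_update_inode() folds an InodeStat from an MDS reply into the cached
// inode. Each group of fields is only taken from the MDS when the client is
// not itself authoritative for it -- i.e. when we do not hold the matching
// *_EXCL cap -- which is what the new_version/new_issued guards below check.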
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
                                 MetaSession *session,
                                 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << "add_update_inode had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      root = in;
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
                           st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    bufferlist::iterator p = st->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;   // as with readdir returning inodes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.seq,
                   st->cap.mseq, inodeno_t(st->cap.realm), st->cap.flags,
                   request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
        (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
        (issued & CEPH_CAP_FILE_EXCL) == 0 &&
        in->dirstat.nfiles == 0 &&
        in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
        ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
                       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
        in->dir->readdir_cache.clear();
        for (const auto& p : in->dir->dentries) {
          unlink(p.second, true, true);  // keep dir, keep dentry
        }
        if (in->dir->dentries.empty())
          close_dir(in->dir);
      }
    }
  } else {
    in->snap_caps |= st->cap.caps;
  }

  return in;
}
/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
                                    Inode *in, utime_t from, MetaSession *session,
                                    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << "insert_dentry_inode '" << dname << "' vino " << in->vino()
                 << " in dir " << dir->parent_inode->vino() << " dn " << dn
                 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
                     << " with correct vino " << dn->inode->vino()
                     << dendl;
    } else {
      ldout(cct, 12) << " had dentry " << dname
                     << " with WRONG vino " << dn->inode->vino()
                     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
        Inode *old_diri = old_dentry->dir->parent_inode;
        old_diri->dir_ordered_count++;
        clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if it's the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
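
// A dentry lease (CEPH_LOCK_DN in dlease->mask) lets the client trust this
// cached name for duration_ms without holding FILE_SHARED on the directory;
// lease_gen is stamped with the session's cap_gen so that a stale session
// implicitly invalidates the lease when it is checked at lookup time.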
void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
{
  utime_t dttl = from;
  dttl += (float)dlease->duration_ms / 1000.0;

  assert(dn);

  if (dlease->mask & CEPH_LOCK_DN) {
    if (dttl > dn->lease_ttl) {
      ldout(cct, 10) << "got dentry lease on " << dn->name
                     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
      dn->lease_ttl = dttl;
      dn->lease_mds = session->mds_num;
      dn->lease_seq = dlease->seq;
      dn->lease_gen = session->cap_gen;
    }
  }
  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
}
/*
 * update MDS location cache for a single inode
 */
void Client::update_dir_dist(Inode *in, DirStat *dst)
{
  // auth
  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
  if (dst->auth >= 0) {
    in->fragmap[dst->frag] = dst->auth;
  } else {
    in->fragmap.erase(dst->frag);
  }
  if (!in->dirfragtree.is_leaf(dst->frag)) {
    in->dirfragtree.force_to_leaf(cct, dst->frag);
    _fragmap_remove_non_leaves(in);
  }

  // replicated
  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!

  // dist
  /*
  if (!st->dirfrag_dist.empty()) {   // FIXME
    set<int> dist = st->dirfrag_dist.begin()->second;
    if (dist.empty() && !in->dir_contacts.empty())
      ldout(cct, 9) << "lost dist spec for " << in->ino
                    << " " << dist << dendl;
    if (!dist.empty() && in->dir_contacts.empty())
      ldout(cct, 9) << "got dist spec for " << in->ino
                    << " " << dist << dendl;
    in->dir_contacts = dist;
  }
  */
}
void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
{
  if (diri->flags & I_COMPLETE) {
    if (complete) {
      ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
      diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
    } else {
      if (diri->flags & I_DIR_ORDERED) {
        ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
        diri->flags &= ~I_DIR_ORDERED;
      }
    }
    if (diri->dir)
      diri->dir->readdir_cache.clear();
  }
}
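
// insert_readdir_results() below stitches each returned entry into both the
// dentry cache and dirp->buffer. Entry offsets are fragment-relative
// (make_fpos) unless the MDS replied in hash order, in which case the offset
// encodes the dentry name hash instead.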
/*
 * insert results from readdir or lssnap into the metadata cache.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  MClientReply *reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  uint64_t features = con->get_features();

  dir_result_t *dirp = request->dirp;
  assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  bufferlist::iterator p = reply->get_extra_bl().begin();
  if (!p.end()) {
    // snapdir?
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    assert(dir);

    // dirstat
    DirStat dst(p);
    __u32 numdn;
    __u16 flags;
    ::decode(numdn, p);
    ::decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
        last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
        /* mds understands offset_hash */
        last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    if (fg != dst.frag) {
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
        readdir_offset = 2;
        readdir_start.clear();
        dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
                   << ", hash_order=" << hash_order
                   << ", readdir_start " << readdir_start
                   << ", last_hash " << last_hash
                   << ", next_offset " << readdir_offset << dendl;

    if (diri->snapid != CEPH_SNAPDIR &&
        fg.is_leftmost() && readdir_offset == 2 &&
        !(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i = 0; i < numdn; i++) {
      ::decode(dname, p);
      ::decode(dlease, p);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
                                   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
        Dentry *olddn = diri->dir->dentries[dname];
        if (olddn->inode != in) {
          // replace incorrect dentry
          unlink(olddn, true, true);  // keep dir, dentry
          dn = link(dir, dname, in, olddn);
          assert(dn == olddn);
        } else {
          // keep existing dn
          dn = olddn;
          touch_dn(dn);
        }
      } else {
        // new dn
        dn = link(dir, dname, in, NULL);
      }

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
        unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
        if (hash != last_hash)
          readdir_offset = 2;
        last_hash = hash;
        dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
        dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }

      // add to readdir cache
      if (dirp->release_count == diri->dir_release_count &&
          dirp->ordered_count == diri->dir_ordered_count &&
          dirp->start_shared_gen == diri->shared_gen) {
        if (dirp->cache_index == dir->readdir_cache.size()) {
          if (i == 0) {
            assert(!dirp->inode->is_complete_and_ordered());
            dir->readdir_cache.reserve(dirp->cache_index + numdn);
          }
          dir->readdir_cache.push_back(dn);
        } else if (dirp->cache_index < dir->readdir_cache.size()) {
          if (dirp->inode->is_complete_and_ordered())
            assert(dir->readdir_cache[dirp->cache_index] == dn);
          else
            dir->readdir_cache[dirp->cache_index] = dn;
        } else {
          assert(0 == "unexpected readdir buffer idx");
        }
        dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
/*
 * insert a trace from a MDS reply into the cache.
 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  MClientReply *reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  bufferlist::iterator p = reply->get_trace_bl().begin();
  if (request->got_unsafe) {
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    assert(p.end());
    return NULL;
  }

  if (p.end()) {
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      diri->dir_release_count++;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
        // rename
        Dentry *od = request->old_dentry();
        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
        assert(od);
        unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
                 op == CEPH_MDS_OP_UNLINK) {
        // unlink, rmdir
        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
        unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  uint64_t features = con->get_features();
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
                 << " is_target=" << (int)reply->head.is_target
                 << " is_dentry=" << (int)reply->head.is_dentry
                 << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    dirst.decode(p, features);
    dst.decode(p);
    ::decode(dname, p);
    ::decode(dlease, p);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
        wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
        wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
        assert(0 == "MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
                          request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
                            request->perms);
    update_dir_dist(diri, &dst);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
        dn = diri->dir->dentries[dname];
        if (dn->inode) {
          diri->dir_ordered_count++;
          clear_dir_complete_and_ordered(diri, false);
          unlink(dn, true, true);  // keep dir, dentry
        }
      }
      if (dlease.duration_ms > 0) {
        if (!dn) {
          Dir *dir = diri->open_dir();
          dn = link(dir, dname, NULL, NULL);
        }
        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
             op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
        Dentry *dn = diri->dir->dentries[dname];
        if (dn->inode)
          unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
        op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;
  Cap *cap = NULL;

  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << "choose_target_mds resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << "choose_target_mds starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << "choose_target_mds inode dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << req->path[0]
                     << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << "choose_target_mds starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << "choose_target_mds dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
                     << " on " << de->name
                     << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << "choose_target_mds " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
          in = in->snapdir_parent.get();
        else if (!in->dn_set.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed. */
          in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << "choose_target_mds " << *in << " is_hash=" << is_hash
                   << " hash=" << hash << dendl;

    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
      frag_t fg = in->dirfragtree[hash];
      if (in->fragmap.count(fg)) {
        mds = in->fragmap[fg];
        if (phash_diri)
          *phash_diri = in;
      } else if (in->auth_cap) {
        mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
        ldout(cct, 10) << "choose_target_mds from dirfragtree hash" << dendl;
        goto out;
      }
    }

    if (req->auth_is_best())
      cap = in->auth_cap;
    if (!cap && !in->caps.empty())
      cap = in->caps.begin()->second;
    if (!cap)
      goto random_mds;
    mds = cap->session->mds_num;
    ldout(cct, 10) << "choose_target_mds from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
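
// Once a session is open we also pre-open sessions to that rank's export
// targets (below), so that a likely subtree migration to one of them finds a
// session already in place.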
void Client::connect_mds_targets(mds_rank_t mds)
{
  ldout(cct, 10) << "connect_mds_targets for mds." << mds << dendl;
  assert(mds_sessions.count(mds));
  const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
  for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
       q != info.export_targets.end();
       ++q) {
    if (mds_sessions.count(*q) == 0 &&
        mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
      ldout(cct, 10) << "check_mds_sessions opening mds." << mds
                     << " export target mds." << *q << dendl;
      _open_mds_session(*q);
    }
  }
}
void Client::dump_mds_sessions(Formatter *f)
{
  f->dump_int("id", get_nodeid().v);
  entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr());
  f->dump_object("inst", inst);
  f->dump_stream("inst_str") << inst;
  f->dump_stream("addr_str") << inst.addr;
  f->open_array_section("sessions");
  for (map<mds_rank_t,MetaSession*>::const_iterator p = mds_sessions.begin(); p != mds_sessions.end(); ++p) {
    f->open_object_section("session");
    p->second->dump(f);
    f->close_section();
  }
  f->close_section();
  f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
}
void Client::dump_mds_requests(Formatter *f)
{
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    f->open_object_section("request");
    p->second->dump(f);
    f->close_section();
  }
}
int Client::verify_reply_trace(int r,
                               MetaRequest *request, MClientReply *reply,
                               InodeRef *ptarget, bool *pcreated,
                               const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl.claim(reply->get_extra_bl());
  if (extra_bl.length() >= 8) {
    // if the extra bufferlist has a buffer, we assume it's the created inode
    // and that this request to create succeeded in actually creating
    // the inode (won the race with other create requests)
    ::decode(created_ino, extra_bl);
    got_created_ino = true;
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name.  someday, do this by the
      // ino... which we know!  FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
        if (d->dir) {
          ldout(cct, 10) << "make_request got traceless reply, looking up #"
                         << d->dir->parent_inode->ino << "/" << d->name
                         << " got_ino " << got_created_ino
                         << " ino " << created_ino
                         << dendl;
          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
                         &target, perms);
        } else {
          // if the dentry is not linked, just do our best. see #5021.
          assert(0 == "how did this happen?  i want logs!");
        }
      } else {
        Inode *in = request->inode();
        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
                       << in->ino << dendl;
        r = _getattr(in, request->regetattr_mask, perms, true);
        target = in;
      }
      if (r >= 0) {
        // verify ino returned in reply and trace_dist are the same
        if (got_created_ino &&
            created_ino.val != target->ino.val) {
          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
          r = -EINTR;
        }
        if (ptarget)
          ptarget->swap(target);
      }
    }
  }

  return r;
}
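
// Typical use of make_request() (documented below) -- a sketch modeled on the
// getattr path elsewhere in this file; field values are illustrative:
//
//   MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
//   filepath path;
//   in->make_nosnap_relative_path(path);
//   req->set_filepath(path);
//   req->head.args.getattr.mask = mask;
//   int res = make_request(req, perms);   // blocks until reply or abort
//
// make_request() consumes the caller's request reference via put_request().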
/**
 * make a request
 *
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 * a request).
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
int Client::make_request(MetaRequest *request,
                         const UserPerm& perms,
                         InodeRef *ptarget, bool *pcreated,
                         mds_rank_t use_mds,
                         bufferlist *pdirbl)
{
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  while (1) {
    if (request->aborted())
      break;

    if (blacklisted) {
      request->abort(-EBLACKLISTED);
      break;
    }

    // set up wait cond
    Cond caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
        if (hash_diri) {
          ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
          _fragmap_remove_stopped_mds(hash_diri, mds);
        } else {
          ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
          request->resend_mds = _get_random_up_mds();
        }
      } else {
        ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
        wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    MetaSession *session = NULL;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);

      // wait
      if (session->state == MetaSession::STATE_OPENING) {
        ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
        wait_on_context_list(session->waiting_for_open);
        // Abort requests on REJECT from MDS
        if (rejected_by_mds.count(mds)) {
          request->abort(-EPERM);
          break;
        }
        continue;
      }

      if (!have_open_session(mds))
        continue;
    } else {
      session = mds_sessions[mds];
    }

    // send request.
    send_request(request, session);

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    while (!request->reply &&         // reply
           request->resend_mds < 0 && // forward
           !request->kick)
      caller_cond.Wait(client_lock);
    request->caller_cond = NULL;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    assert(request->aborted());
    assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);  // ours
    return r;
  }

  // got it!
  MClientReply *reply = request->reply;
  request->reply = NULL;
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  assert(request->dispatch_cond);
  request->dispatch_cond->Signal();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    pdirbl->claim(reply->get_extra_bl());

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);

  reply->put();
  return r;
}
void Client::unregister_request(MetaRequest *req)
{
  mds_requests.erase(req->tid);
  if (req->tid == oldest_tid) {
    map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
    while (true) {
      if (p == mds_requests.end()) {
        oldest_tid = 0;
        break;
      }
      if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
        oldest_tid = p->first;
        break;
      }
      ++p;
    }
  }
  put_request(req);
}
void Client::put_request(MetaRequest *request)
{
  if (request->_put()) {
    int op = -1;
    if (request->success)
      op = request->get_op();
    InodeRef other_in;
    request->take_other_inode(&other_in);
    delete request;

    if (other_in &&
        (op == CEPH_MDS_OP_RMDIR ||
         op == CEPH_MDS_OP_RENAME ||
         op == CEPH_MDS_OP_RMSNAP)) {
      _try_to_trim_inode(other_in.get(), false);
    }
  }
}
int Client::encode_inode_release(Inode *in, MetaRequest *req,
                                 mds_rank_t mds, int drop,
                                 int unless, int force)
{
  ldout(cct, 20) << "encode_inode_release enter(in:" << *in << ", req:" << req
                 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
                 << ", have:" << ", force:" << force << ")" << dendl;
  int released = 0;
  if (in->caps.count(mds)) {
    Cap *caps = in->caps[mds];
    drop &= ~(in->dirty_caps | get_caps_used(in));
    if ((drop & caps->issued) &&
        !(unless & caps->issued)) {
      ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(caps->issued) << dendl;
      caps->issued &= ~drop;
      caps->implemented &= ~drop;
      released = 1;
      ldout(cct, 25) << "Now have: " << ccap_string(caps->issued) << dendl;
    } else {
      released = force;
    }
    if (released) {
      ceph_mds_request_release rel;
      rel.ino = in->ino;
      rel.cap_id = caps->cap_id;
      rel.seq = caps->seq;
      rel.issue_seq = caps->issue_seq;
      rel.mseq = caps->mseq;
      rel.caps = caps->implemented;
      rel.wanted = caps->wanted;
      rel.dname_len = 0;
      rel.dname_seq = 0;
      req->cap_releases.push_back(MClientRequest::Release(rel, ""));
    }
  }
  ldout(cct, 25) << "encode_inode_release exit(in:" << *in << ") released:"
                 << released << dendl;
  return released;
}
void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
                                   mds_rank_t mds, int drop, int unless)
{
  ldout(cct, 20) << "encode_dentry_release enter(dn:"
                 << dn << ")" << dendl;
  int released = 0;
  if (dn->dir)
    released = encode_inode_release(dn->dir->parent_inode, req,
                                    mds, drop, unless, 1);
  if (released && dn->lease_mds == mds) {
    ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
    MClientRequest::Release& rel = req->cap_releases.back();
    rel.item.dname_len = dn->name.length();
    rel.item.dname_seq = dn->lease_seq;
    rel.dname = dn->name;
  }
  ldout(cct, 25) << "encode_dentry_release exit(dn:"
                 << dn << ")" << dendl;
}
/*
 * This requires the MClientRequest *request member to be set.
 * It will error out horribly without one.
 * Additionally, if you set any *drop member, you'd better have
 * set the corresponding dentry!
 */
void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
{
  ldout(cct, 20) << "encode_cap_releases enter (req: "
                 << req << ", mds: " << mds << ")" << dendl;
  if (req->inode_drop && req->inode())
    encode_inode_release(req->inode(), req,
                         mds, req->inode_drop,
                         req->inode_unless);

  if (req->old_inode_drop && req->old_inode())
    encode_inode_release(req->old_inode(), req,
                         mds, req->old_inode_drop,
                         req->old_inode_unless);
  if (req->other_inode_drop && req->other_inode())
    encode_inode_release(req->other_inode(), req,
                         mds, req->other_inode_drop,
                         req->other_inode_unless);

  if (req->dentry_drop && req->dentry())
    encode_dentry_release(req->dentry(), req,
                          mds, req->dentry_drop,
                          req->dentry_unless);

  if (req->old_dentry_drop && req->old_dentry())
    encode_dentry_release(req->old_dentry(), req,
                          mds, req->old_dentry_drop,
                          req->old_dentry_unless);
  ldout(cct, 25) << "encode_cap_releases exit (req: "
                 << req << ", mds " << mds << dendl;
}
bool Client::have_open_session(mds_rank_t mds)
{
  return
    mds_sessions.count(mds) &&
    (mds_sessions[mds]->state == MetaSession::STATE_OPEN ||
     mds_sessions[mds]->state == MetaSession::STATE_STALE);
}
MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
{
  if (mds_sessions.count(mds) == 0)
    return NULL;
  MetaSession *s = mds_sessions[mds];
  if (s->con != con)
    return NULL;
  return s;
}
MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
{
  if (mds_sessions.count(mds))
    return mds_sessions[mds];
  return _open_mds_session(mds);
}
/**
 * Populate a map of strings with client-identifying metadata,
 * such as the hostname.  Call this once at initialization.
 */
void Client::populate_metadata(const std::string &mount_root)
{
  // Hostname
  struct utsname u;
  int r = uname(&u);
  if (r >= 0) {
    metadata["hostname"] = u.nodename;
    ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
  } else {
    ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
  }

  metadata["pid"] = stringify(getpid());

  // Ceph entity id (the '0' in "client.0")
  metadata["entity_id"] = cct->_conf->name.get_id();

  // Our mount position
  if (!mount_root.empty()) {
    metadata["root"] = mount_root;
  }

  // Ceph version
  metadata["ceph_version"] = pretty_version_to_str();
  metadata["ceph_sha1"] = git_version_to_str();

  // Apply any metadata from the user's configured overrides
  std::vector<std::string> tokens;
  get_str_vec(cct->_conf->client_metadata, ",", tokens);
  for (const auto &i : tokens) {
    auto eqpos = i.find("=");
    // Throw out anything that isn't of the form "<str>=<str>"
    if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
      lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
      continue;
    }
    metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
  }
}
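
// For example, extra client metadata can be supplied as comma-separated
// key=value pairs (illustrative values):
//
//   client_metadata = "rack=r1,role=build-host"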
/**
 * Optionally add or override client metadata fields.
 */
void Client::update_metadata(std::string const &k, std::string const &v)
{
  Mutex::Locker l(client_lock);
  assert(initialized);

  if (metadata.count(k)) {
    ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
                  << "' from '" << metadata[k] << "' to '" << v << "'" << dendl;
  }

  metadata[k] = v;
}
MetaSession *Client::_open_mds_session(mds_rank_t mds)
{
  ldout(cct, 10) << "_open_mds_session mds." << mds << dendl;
  assert(mds_sessions.count(mds) == 0);
  MetaSession *session = new MetaSession;
  session->mds_num = mds;
  session->seq = 0;
  session->inst = mdsmap->get_inst(mds);
  session->con = messenger->get_connection(session->inst);
  session->state = MetaSession::STATE_OPENING;
  session->mds_state = MDSMap::STATE_NULL;
  mds_sessions[mds] = session;

  // Maybe skip sending a request to open if this MDS daemon
  // has previously sent us a REJECT.
  if (rejected_by_mds.count(mds)) {
    if (rejected_by_mds[mds] == session->inst) {
      ldout(cct, 4) << "_open_mds_session mds." << mds << " skipping "
                       "because we were rejected" << dendl;
      return session;
    } else {
      ldout(cct, 4) << "_open_mds_session mds." << mds << " old inst "
                       "rejected us, trying with new inst" << dendl;
      rejected_by_mds.erase(mds);
    }
  }

  MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_OPEN);
  m->client_meta = metadata;
  session->con->send_message(m);
  return session;
}
void Client::_close_mds_session(MetaSession *s)
{
  ldout(cct, 2) << "_close_mds_session mds." << s->mds_num << " seq " << s->seq << dendl;
  s->state = MetaSession::STATE_CLOSING;
  s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
}
void Client::_closed_mds_session(MetaSession *s)
{
  s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  signal_context_list(s->waiting_for_open);
  mount_cond.Signal();
  remove_session_caps(s);
  kick_requests_closed(s);
  mds_sessions.erase(s->mds_num);
  delete s;
}
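
// handle_client_session() below drives the session state machine from MDS
// messages: OPENING -> OPEN on CEPH_SESSION_OPEN, torn down on CLOSE or
// REJECT, and marked for renewal (without teardown) on STALE.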
void Client::handle_client_session(MClientSession *m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << "handle_client_session " << *m << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    m->put();
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    renew_caps(session);
    session->state = MetaSession::STATE_OPEN;
    if (unmounting)
      mount_cond.Signal();
    else
      connect_mds_targets(from);
    signal_context_list(session->waiting_for_open);
    break;

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    if (session->cap_renew_seq == m->get_seq()) {
      session->cap_ttl =
        session->last_cap_renew_request + mdsmap->get_session_timeout();
      wake_inode_waiters(session);
    }
    break;

  case CEPH_SESSION_STALE:
    // invalidate session caps/leases
    session->cap_gen++;
    session->cap_ttl = ceph_clock_now();
    session->cap_ttl -= 1;
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    session->con->send_message(new MClientSession(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    rejected_by_mds[session->mds_num] = session->inst;
    _closed_mds_session(session);
    break;

  default:
    ceph_abort();
  }

  m->put();
}
bool Client::_any_stale_sessions() const
{
  assert(client_lock.is_locked_by_me());

  for (const auto &i : mds_sessions) {
    if (i.second->state == MetaSession::STATE_STALE) {
      return true;
    }
  }

  return false;
}
void Client::_kick_stale_sessions()
{
  ldout(cct, 1) << "kick_stale_sessions" << dendl;

  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
       p != mds_sessions.end(); ) {
    MetaSession *s = p->second;
    ++p;
    if (s->state == MetaSession::STATE_STALE)
      _closed_mds_session(s);
  }
}
void Client::send_request(MetaRequest *request, MetaSession *session,
                          bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_request rebuilding request " << request->get_tid()
                 << " for mds." << mds << dendl;
  MClientRequest *r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't sent the cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    objecter->with_osdmap([r](const OSDMap& o) {
        r->set_osdmap_epoch(o.get_epoch());
      });
  }

  if (request->mds == -1) {
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << "send_request set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  Inode *in = request->inode();
  if (in && in->caps.count(mds))
    request->sent_on_mseq = in->caps[mds]->mseq;

  session->requests.push_back(&request->item);

  ldout(cct, 10) << "send_request " << *r << " to mds." << mds << dendl;
  session->con->send_message(r);
}
MClientRequest* Client::build_client_request(MetaRequest *request)
{
  MClientRequest *req = new MClientRequest(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepaths haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
        de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
        de->dir->parent_inode->make_nosnap_relative_path(request->path);
        request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
                         << " No path, inode, or appropriately-endowed dentry given!"
                         << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
                         << " No path, inode, or dentry given!"
                         << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_data(request->data);
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
void Client::handle_client_request_forward(MClientRequestForward *fwd)
{
  mds_rank_t mds = mds_rank_t(fwd->get_source().num());
  MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
  if (!session) {
    fwd->put();
    return;
  }
  ceph_tid_t tid = fwd->get_tid();

  if (mds_requests.count(tid) == 0) {
    ldout(cct, 10) << "handle_client_request_forward no pending request on tid " << tid << dendl;
    fwd->put();
    return;
  }

  MetaRequest *request = mds_requests[tid];
  assert(request);

  // reset retry counter
  request->retry_attempt = 0;

  // request not forwarded, or dest mds has no session.
  // resend.
  ldout(cct, 10) << "handle_client_request_forward tid " << tid
                 << " fwd " << fwd->get_num_fwd()
                 << " to mds." << fwd->get_dest_mds()
                 << ", resending to " << fwd->get_dest_mds()
                 << dendl;

  request->mds = -1;
  request->item.remove_myself();
  request->num_fwd = fwd->get_num_fwd();
  request->resend_mds = fwd->get_dest_mds();
  request->caller_cond->Signal();

  fwd->put();
}
bool Client::is_dir_operation(MetaRequest *req)
{
  int op = req->get_op();
  if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
      op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
      op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
      op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
    return true;
  return false;
}
void Client::handle_client_reply(MClientReply *reply)
{
  mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
  MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
  if (!session) {
    reply->put();
    return;
  }

  ceph_tid_t tid = reply->get_tid();
  bool is_safe = reply->is_safe();

  if (mds_requests.count(tid) == 0) {
    lderr(cct) << "handle_client_reply no pending request on tid " << tid
               << " safe is:" << is_safe << dendl;
    reply->put();
    return;
  }
  MetaRequest *request = mds_requests.at(tid);

  ldout(cct, 20) << "handle_client_reply got a reply. Safe:" << is_safe
                 << " tid " << tid << dendl;

  if (request->got_unsafe && !is_safe) {
    // duplicate response
    ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
                  << mds_num << " safe:" << is_safe << dendl;
    reply->put();
    return;
  }

  if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
    ldout(cct, 20) << "got ESTALE on tid " << request->tid
                   << " from mds." << request->mds << dendl;
    request->send_to_auth = true;
    request->resend_mds = choose_target_mds(request);
    Inode *in = request->inode();
    if (request->resend_mds >= 0 &&
        request->resend_mds == request->mds &&
        (in == NULL ||
         in->caps.count(request->resend_mds) == 0 ||
         request->sent_on_mseq == in->caps[request->resend_mds]->mseq)) {
      // have to return ESTALE
    } else {
      request->caller_cond->Signal();
      reply->put();
      return;
    }
    ldout(cct, 20) << "have to return ESTALE" << dendl;
  }

  assert(request->reply == NULL);
  request->reply = reply;
  insert_trace(request, session);

  // Handle unsafe reply
  if (!is_safe) {
    request->got_unsafe = true;
    session->unsafe_requests.push_back(&request->unsafe_item);
    if (is_dir_operation(request)) {
      Inode *dir = request->inode();
      assert(dir);
      dir->unsafe_ops.push_back(&request->unsafe_dir_item);
    }
    if (request->target) {
      InodeRef &in = request->target;
      in->unsafe_ops.push_back(&request->unsafe_target_item);
    }
  }

  // Only signal the caller once (on the first reply):
  // Either it's an unsafe reply, or it's a safe reply and no unsafe reply was sent.
  if (!is_safe || !request->got_unsafe) {
    Cond cond;
    request->dispatch_cond = &cond;

    // wake up waiter
    ldout(cct, 20) << "handle_client_reply signalling caller " << (void*)request->caller_cond << dendl;
    request->caller_cond->Signal();

    // wake for kick back
    while (request->dispatch_cond) {
      ldout(cct, 20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << dendl;
      cond.Wait(client_lock);
    }
  }

  if (is_safe) {
    // the filesystem change is committed to disk
    // we're done, clean up
    if (request->got_unsafe) {
      request->unsafe_item.remove_myself();
      request->unsafe_dir_item.remove_myself();
      request->unsafe_target_item.remove_myself();
      signal_cond_list(request->waitfor_safe);
    }
    request->item.remove_myself();
    unregister_request(request);
  }
  if (unmounting)
    mount_cond.Signal();
}
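// Illustrative sketch of the waiting side (assumed shape, not a verbatim copy
// of make_request): the caller blocked on caller_cond wakes, consumes the
// reply, then clears dispatch_cond and signals it to kick this dispatcher out
// of its Wait() loop above, roughly:
//
//   while (!request->reply)
//     request->caller_cond->Wait(client_lock);  // woken by handle_client_reply()
//   ...
//   Cond *dispatch = request->dispatch_cond;
//   request->dispatch_cond = NULL;              // terminates the kickback loop
//   if (dispatch)
//     dispatch->Signal();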
void Client::_handle_full_flag(int64_t pool)
{
  ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
                << "on " << pool << dendl;
  // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
  // to do this rather than blocking, because otherwise when we fill up we
  // potentially lock caps forever on files with dirty pages, and we need
  // to be able to release those caps to the MDS so that it can delete files
  // and free up space.
  epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);

  // For all inodes with layouts in this pool and a pending flush write op
  // (i.e. one of the ones we will cancel), we've got to purge_set their data
  // from ObjectCacher so that it doesn't re-issue the write in response to
  // the ENOSPC error.
  // Fortunately since we're cancelling everything in a given pool, we don't
  // need to know which ops belong to which ObjectSet, we can just blow all
  // the un-flushed cached data away and mark any dirty inodes' async_err
  // field with -ENOSPC as long as we're sure all the ops we cancelled were
  // affecting this pool, and all the objectsets we're purging were also
  // in this pool.
  for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
       i != inode_map.end(); ++i)
  {
    Inode *inode = i->second;
    if (inode->oset.dirty_or_tx
        && (pool == -1 || inode->layout.pool_id == pool)) {
      ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
                    << " has dirty objects, purging and setting ENOSPC" << dendl;
      objectcacher->purge_set(&inode->oset);
      inode->set_async_err(-ENOSPC);
    }
  }

  if (cancelled_epoch != (epoch_t)-1) {
    set_cap_epoch_barrier(cancelled_epoch);
  }
}
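// Usage note: handle_osd_map() below calls _handle_full_flag(-1) when the
// global osdmap full flag is set (the pool == -1 case matches every inode
// regardless of layout), and _handle_full_flag(pool_id) once per pool that is
// individually flagged full.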
void Client::handle_osd_map(MOSDMap *m)
{
  std::set<entity_addr_t> new_blacklists;
  objecter->consume_blacklist_events(&new_blacklists);

  const auto myaddr = messenger->get_myaddr();
  if (!blacklisted && new_blacklists.count(myaddr)) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
        return o.get_epoch();
      });
    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
    blacklisted = true;
    for (std::map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
         p != mds_requests.end(); ) {
      auto req = p->second;
      ++p;
      req->abort(-EBLACKLISTED);
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->Signal();
      }
    }

    // Progress aborts on any requests that were on this waitlist.  Any
    // requests that were on a waiting_for_open session waitlist
    // will get kicked during close session below.
    signal_cond_list(waiting_for_mdsmap);

    // Force-close all sessions: assume this is not abandoning any state
    // on the MDS side because the MDS will have seen the blacklist too.
    while(!mds_sessions.empty()) {
      auto i = mds_sessions.begin();
      auto session = i->second;
      _closed_mds_session(session);
    }

    // Since we know all our OSD ops will fail, cancel them all preemptively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-EBLACKLISTED);

  } else if (blacklisted) {
    // Handle case where we were blacklisted but no longer are
    blacklisted = objecter->with_osdmap([myaddr](const OSDMap &o){
        return o.is_blacklisted(myaddr);});
  }

  // Always subscribe to next osdmap for blacklisted client
  // until this client is not blacklisted.
  if (blacklisted) {
    objecter->maybe_request_map();
  }

  if (objecter->osdmap_full_flag()) {
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // _handle_full_flag
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
        for (const auto& kv : o.get_pools()) {
          if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
            full_pools.push_back(kv.first);
          }
        }
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away.  For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }

  m->put();
}
// ------------------------
// incoming messages


bool Client::ms_dispatch(Message *m)
{
  Mutex::Locker l(client_lock);
  if (!initialized) {
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    m->put();
    return true;
  }

  switch (m->get_type()) {
    // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(static_cast<MMDSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(static_cast<MFSMap*>(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(static_cast<MFSMapUser*>(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(static_cast<MClientSession*>(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(static_cast<MOSDMap*>(m));
    break;

    // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(static_cast<MClientRequestForward*>(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(static_cast<MClientReply*>(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(static_cast<MClientSnap*>(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(static_cast<MClientCaps*>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(static_cast<MClientLease*>(m));
    break;
  case MSG_COMMAND_REPLY:
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(static_cast<MCommandReply*>(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(static_cast<MClientQuota*>(m));
    break;

  default:
    return false;
  }

  // unmounting?
  if (unmounting) {
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
                   << "+" << inode_map.size() << dendl;
    long unsigned size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    if (size > lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.Signal();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
                     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
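// Wiring a new message type follows the same pattern (hypothetical example;
// MClientFoo and handle_foo() do not exist in this file):
//
//   case CEPH_MSG_CLIENT_FOO:
//     handle_foo(static_cast<MClientFoo*>(m));  // handler takes ownership of m
//     break;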
void Client::handle_fs_map(MFSMap *m)
{
  fsmap.reset(new FSMap(m->get_fsmap()));
  m->put();

  signal_cond_list(waiting_for_fsmap);

  monclient->sub_got("fsmap", fsmap->get_epoch());
}
void Client::handle_fs_map_user(MFSMapUser *m)
{
  fsmap_user.reset(new FSMapUser);
  *fsmap_user = m->get_fsmap();
  m->put();

  monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
  signal_cond_list(waiting_for_fsmap);
}
void Client::handle_mds_map(MMDSMap* m)
{
  mds_gid_t old_inc, new_inc;
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch()
                  << " is identical to or older than our "
                  << mdsmap->get_epoch() << dendl;
    m->put();
    return;
  }

  ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch() << dendl;

  std::unique_ptr<MDSMap> oldmap(new MDSMap);
  oldmap.swap(mdsmap);

  mdsmap->decode(m->get_encoded());

  // Cancel any commands for missing or laggy GIDs
  std::list<ceph_tid_t> cancel_ops;
  auto &commands = command_table.get_commands();
  for (const auto &i : commands) {
    auto &op = i.second;
    const mds_gid_t op_mds_gid = op.mds_gid;
    if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
      cancel_ops.push_back(i.first);
      if (op.outs) {
        std::ostringstream ss;
        ss << "MDS " << op_mds_gid << " went away";
        *(op.outs) = ss.str();
      }
      op.con->mark_down();
      if (op.on_finish) {
        op.on_finish->complete(-ETIMEDOUT);
      }
    }
  }

  for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
       i != cancel_ops.end(); ++i) {
    command_table.erase(*i);
  }

  // reset sessions
  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
       p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSession *session = p->second;
    ++p;

    int oldstate = oldmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_inst(mds) != session->inst) {
      old_inc = oldmap->get_incarnation(mds);
      new_inc = mdsmap->get_incarnation(mds);
      if (old_inc != new_inc) {
        ldout(cct, 1) << "mds incarnation changed from "
                      << old_inc << " to " << new_inc << dendl;
        oldstate = MDSMap::STATE_NULL;
      }
      session->con->mark_down();
      session->inst = mdsmap->get_inst(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session);
    } else if (oldstate == newstate)
      continue;  // no change

    session->mds_state = newstate;
    if (old_inc != new_inc && newstate > MDSMap::STATE_RECONNECT) {
      // missed reconnect; close the session so that it can be reopened
      _closed_mds_session(session);
      continue;
    }
    if (newstate == MDSMap::STATE_RECONNECT) {
      session->con = messenger->get_connection(session->inst);
      send_reconnect(session);
    } else if (newstate >= MDSMap::STATE_ACTIVE) {
      if (oldstate < MDSMap::STATE_ACTIVE) {
        // kick new requests
        kick_requests(session);
        kick_flushing_caps(session);
        signal_context_list(session->waiting_for_open);
        kick_maxsize_requests(session);
        wake_inode_waiters(session);
      }
      connect_mds_targets(mds);
    } else if (newstate == MDSMap::STATE_NULL &&
               mds >= mdsmap->get_max_mds()) {
      _closed_mds_session(session);
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  m->put();

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "send_reconnect to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  if (session->release) {
    session->release->put();
    session->release = NULL;
  }

  // reset my cap seq number
  session->seq = 0;
  // connect to the mds' offload targets
  connect_mds_targets(mds);
  // make sure unsafe requests get saved
  resend_unsafe_requests(session);

  MClientReconnect *m = new MClientReconnect;

  // I have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    if (in->caps.count(mds)) {
      ldout(cct, 10) << " caps on " << p->first
                     << " " << ccap_string(in->caps[mds]->issued)
                     << " wants " << ccap_string(in->caps_wanted())
                     << dendl;
      filepath path;
      in->make_long_path(path);
      ldout(cct, 10) << "    path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      Cap *cap = in->caps[mds];
      cap->seq = 0;  // reset seq.
      cap->issue_seq = 0;  // reset seq.
      cap->mseq = 0;  // reset seq.
      cap->issued = cap->implemented;

      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
        snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
                 cap->cap_id,
                 path.get_ino(), path.get_path(),  // ino
                 in->caps_wanted(),  // wanted
                 cap->issued,        // issued
                 in->snaprealm->ino,
                 snap_follows,
                 flockbl);

      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
        ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
        m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
        did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  early_kick_flushing_caps(session);

  session->con->send_message(m);

  mount_cond.Signal();
}
void Client::kick_requests(MetaSession *session)
{
  ldout(cct, 10) << "kick_requests for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    MetaRequest *req = p->second;
    if (req->got_unsafe)
      continue;
    if (req->aborted()) {
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->Signal();
      }
      continue;
    }
    if (req->retry_attempt > 0)
      continue; // new requests only
    if (req->mds == session->mds_num) {
      send_request(p->second, session);
    }
  }
}
void Client::resend_unsafe_requests(MetaSession *session)
{
  for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
       !iter.end();
       ++iter)
    send_request(*iter, session);

  // also re-send old requests when MDS enters reconnect stage. So that MDS can
  // process completed requests in clientreplay stage.
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end();
       ++p) {
    MetaRequest *req = p->second;
    if (req->got_unsafe)
      continue;
    if (req->aborted())
      continue;
    if (req->retry_attempt == 0)
      continue; // old requests only
    if (req->mds == session->mds_num)
      send_request(req, session, true);
  }
}
void Client::wait_unsafe_requests()
{
  list<MetaRequest*> last_unsafe_reqs;
  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
       p != mds_sessions.end();
       ++p) {
    MetaSession *s = p->second;
    if (!s->unsafe_requests.empty()) {
      MetaRequest *req = s->unsafe_requests.back();
      req->get();
      last_unsafe_reqs.push_back(req);
    }
  }

  for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
       p != last_unsafe_reqs.end();
       ++p) {
    MetaRequest *req = *p;
    if (req->unsafe_item.is_on_list())
      wait_on_list(req->waitfor_safe);
    put_request(req);
  }
}
void Client::kick_requests_closed(MetaSession *session)
{
  ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end(); ) {
    MetaRequest *req = p->second;
    ++p;
    if (req->mds == session->mds_num) {
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->Signal();
      }
      req->item.remove_myself();
      if (req->got_unsafe) {
        lderr(cct) << "kick_requests_closed removing unsafe request " << req->get_tid() << dendl;
        req->unsafe_item.remove_myself();
        req->unsafe_dir_item.remove_myself();
        req->unsafe_target_item.remove_myself();
        signal_cond_list(req->waitfor_safe);
        unregister_request(req);
      }
    }
  }
  assert(session->requests.empty());
  assert(session->unsafe_requests.empty());
}
void Client::got_mds_push(MetaSession *s)
{
  s->seq++;
  ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
  if (s->state == MetaSession::STATE_CLOSING) {
    s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
  }
}
void Client::handle_lease(MClientLease *m)
{
  ldout(cct, 10) << "handle_lease " << *m << dendl;

  assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);

  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LOCK_DN) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname << dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1;
  }

 revoke:
  m->get_connection()->send_message(
    new MClientLease(
      CEPH_MDS_LEASE_RELEASE, seq,
      m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname));
  m->put();
}
void Client::put_inode(Inode *in, int n)
{
  ldout(cct, 10) << "put_inode on " << *in << dendl;
  int left = in->_put(n);
  if (left == 0) {
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << "put_inode deleting " << *in << dendl;
    bool unclean = objectcacher->release_set(&in->oset);
    assert(!unclean);
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    if (in == root) {
      root = 0;
      root_ancestor = 0;
      while (!root_parents.empty())
        root_parents.erase(root_parents.begin());
    }

    delete in;
  }
}
void Client::close_dir(Dir *dir)
{
  Inode *in = dir->parent_inode;
  ldout(cct, 15) << "close_dir dir " << dir << " on " << in << dendl;
  assert(dir->is_empty());
  assert(in->dir == dir);
  assert(in->dn_set.size() < 2); // dirs can't be hard-linked
  if (!in->dn_set.empty())
    in->get_first_parent()->put(); // unpin dentry

  delete in->dir;
  in->dir = 0;
  put_inode(in); // unpin inode
}
/**
 * Don't call this with in==NULL, use get_or_create for that
 * leave dn set to default NULL unless you're trying to add
 * a new inode to a pre-created Dentry
 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry;
    dn->name = name;

    // link to dir
    dn->dir = dir;
    dir->dentries[dn->name] = dn;
    lru.lru_insert_mid(dn);    // mid or top?
    if (!in)
      dir->num_null_dentries++;

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << dn << " (new dn)" << dendl;
  } else {
    assert(!dn->inode);
    if (in)
      dir->num_null_dentries--;
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    dn->inode = in;
    if (in->is_dir()) {
      if (in->dir)
        dn->get(); // dir -> dn pin
      if (in->ll_ref)
        dn->get(); // ll_ref -> dn pin
    }

    assert(in->dn_set.count(dn) == 0);

    // only one parent for directories!
    if (in->is_dir() && !in->dn_set.empty()) {
      Dentry *olddn = in->get_first_parent();
      assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      old_diri->dir_release_count++;
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    in->dn_set.insert(dn);

    ldout(cct, 20) << "link  inode " << in << " parents now " << in->dn_set << dendl;
  }

  return dn;
}
void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
{
  InodeRef in;
  in.swap(dn->inode);
  ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
                 << " inode " << dn->inode << dendl;

  // unlink from inode
  if (in) {
    if (in->is_dir()) {
      if (in->dir)
        dn->put(); // dir -> dn pin
      if (in->ll_ref)
        dn->put(); // ll_ref -> dn pin
    }
    dn->inode = 0;
    assert(in->dn_set.count(dn));
    in->dn_set.erase(dn);
    ldout(cct, 20) << "unlink  inode " << in << " parents now " << in->dn_set << dendl;
  }

  if (keepdentry) {
    dn->lease_mds = -1;
    if (in)
      dn->dir->num_null_dentries++;
  } else {
    ldout(cct, 15) << "unlink  removing '" << dn->name << "' dn " << dn << dendl;

    // unlink from dir
    dn->dir->dentries.erase(dn->name);
    if (!in)
      dn->dir->num_null_dentries--;
    if (dn->dir->is_empty() && !keepdir)
      close_dir(dn->dir);
    dn->dir = 0;

    // delete den
    lru.lru_remove(dn);
    dn->put();
  }
}
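// Call-shape examples (both appear elsewhere in this file): link() calls
// unlink(olddn, true, true) to detach a directory's old parent while keeping
// both the Dir and the (now null) Dentry, and _trim_negative_child_dentries()
// calls unlink(dn, true, false) to drop the dentry but preserve the Dir.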
/**
 * For asynchronous flushes, check for errors from the IO and
 * update the inode if necessary
 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  void finish(int r) override {
    assert(client->client_lock.is_locked_by_me());
    if (r != 0) {
      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
                            << " 0x" << std::hex << inode->ino << std::dec
                            << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      inode->set_async_err(r);
    }
  }
};
void Client::get_cap_ref(Inode *in, int cap)
{
  if ((cap & CEPH_CAP_FILE_BUFFER) &&
      in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
    ldout(cct, 5) << "get_cap_ref got first FILE_BUFFER ref on " << *in << dendl;
    in->get();
  }
  if ((cap & CEPH_CAP_FILE_CACHE) &&
      in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
    ldout(cct, 5) << "get_cap_ref got first FILE_CACHE ref on " << *in << dendl;
    in->get();
  }
  in->get_cap_ref(cap);
}
void Client::put_cap_ref(Inode *in, int cap)
{
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      if ((last & CEPH_CAP_FILE_WR) &&
          !in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.writing) {
        ldout(cct, 10) << "put_cap_ref finishing pending cap_snap on " << *in << dendl;
        in->cap_snaps.rbegin()->second.writing = 0;
        finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
        signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
        for (auto &p : in->cap_snaps)
          p.second.dirty_data = 0;
        signal_cond_list(in->waitfor_commit);
        ldout(cct, 5) << "put_cap_ref dropped last FILE_BUFFER ref on " << *in << dendl;
        ++put_nref;
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << "put_cap_ref dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);
  }
}
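// Pairing example (illustrative): buffered I/O takes both refs up front and
// releases them when the ObjectCacher flush completes, e.g.
//
//   get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
//   ... hand dirty data to the ObjectCacher ...
//   put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);  // see _flushed()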
int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
{
  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
                     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
                     << dendl;
      return -EBADF;
    }

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      if (endoff > 0 &&
          (endoff >= (loff_t)in->max_size ||
           endoff > (loff_t)(in->size << 1)) &&
          endoff > (loff_t)in->wanted_max_size) {
        ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
        in->wanted_max_size = endoff;
        check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
        ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
        waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
        if (in->cap_snaps.rbegin()->second.writing) {
          ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
          waitfor_caps = true;
        }
        for (auto &p : in->cap_snaps) {
          if (p.second.dirty_data) {
            waitfor_commit = true;
            break;
          }
        }
        if (waitfor_commit) {
          _flush(in, new C_Client_FlushComplete(this, in));
          ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
        }
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
        int revoking = implemented & ~have;
        ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
                       << " need " << ccap_string(need) << " want " << ccap_string(want)
                       << " revoking " << ccap_string(revoking)
                       << dendl;
        if ((revoking & want) == 0) {
          *phave = need | (have & want);
          in->get_cap_ref(need);
          return 0;
        }
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
        in->auth_cap->session->readonly)
      return -EROFS;

    if (in->flags & I_CAP_DROPPED) {
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
        int ret = _renew_caps(in);
        if (ret < 0)
          return ret;
        continue;
      }
      if ((mds_wanted & file_wanted) ==
          (file_wanted & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR))) {
        in->flags &= ~I_CAP_DROPPED;
      }
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
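// Typical caller shape (illustrative; mirrors the read path in this file):
//
//   int have;
//   int r = get_caps(in, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &have, -1);
//   if (r < 0)
//     return r;  // e.g. -EBADF, -EROFS, or a _renew_caps() error from above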
int Client::get_caps_used(Inode *in)
{
  unsigned used = in->caps_used();
  if (!(used & CEPH_CAP_FILE_CACHE) &&
      !objectcacher->set_is_empty(&in->oset))
    used |= CEPH_CAP_FILE_CACHE;
  return used;
}
void Client::cap_delay_requeue(Inode *in)
{
  ldout(cct, 10) << "cap_delay_requeue on " << *in << dendl;
  in->hold_caps_until = ceph_clock_now();
  in->hold_caps_until += cct->_conf->client_caps_release_delay;
  delayed_list.push_back(&in->delay_cap_item);
}
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
                      bool sync, int used, int want, int retain,
                      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << "send_cap " << *in
                 << " mds." << session->mds_num << " seq " << cap->seq
                 << (sync ? " sync " : " async ")
                 << " used " << ccap_string(used)
                 << " want " << ccap_string(want)
                 << " flush " << ccap_string(flush)
                 << " retain " << ccap_string(retain)
                 << " held "<< ccap_string(held)
                 << " revoking " << ccap_string(revoking)
                 << " dropping " << ccap_string(dropping)
                 << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  if (flush & CEPH_CAP_ANY_FILE_WR)
    follows = in->snaprealm->get_snap_context().seq;

  MClientCaps *m = new MClientCaps(op,
                                   in->ino,
                                   0,
                                   cap->cap_id, cap->seq,
                                   cap->implemented,
                                   want,
                                   flush,
                                   cap->mseq,
                                   cap_epoch_barrier);
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  if (flush & CEPH_CAP_XATTR_EXCL) {
    ::encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;
  if (sync)
    m->flags |= CLIENT_CAPS_SYNC;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    m->set_max_size(in->wanted_max_size);
    in->requested_max_size = in->wanted_max_size;
    ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message(m);
}
static bool is_max_size_approaching(Inode *in)
{
  /* mds will adjust max size according to the reported size */
  if (in->flushing_caps & CEPH_CAP_FILE_WR)
    return false;
  if (in->size >= in->max_size)
    return true;
  /* half of previous max_size increment has been used */
  if (in->max_size > in->reported_size &&
      (in->size << 1) >= in->max_size + in->reported_size)
    return true;
  return false;
}
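// Worked example: with max_size = 8 MB and reported_size = 4 MB, the test
// (size << 1) >= max_size + reported_size fires once size reaches 6 MB, i.e.
// halfway through the 4 MB increment the MDS last granted, leaving time to
// request a larger max_size before writers hit the hard limit.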
/**
 * Examine currently used and wanted versus held caps. Release, flush or ack
 * revoked caps to the MDS as appropriate.
 *
 * @param in the inode to check
 * @param flags flags to apply to cap check
 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  if (in->is_dir() && (in->flags & I_COMPLETE)) {
    // we do this here because we don't want to drop to Fs (and then
    // drop the Fs if we do a create!) if that alone makes us send lookups
    // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
    wanted |= CEPH_CAP_FILE_EXCL;
  }

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  int retain = wanted | used | CEPH_CAP_PIN;
  if (!unmounting) {
    if (wanted)
      retain |= CEPH_CAP_ANY;
    else
      retain |= CEPH_CAP_ANY_SHARED;
  }

  ldout(cct, 10) << "check_caps on " << *in
                 << " wanted " << ccap_string(wanted)
                 << " used " << ccap_string(used)
                 << " issued " << ccap_string(issued)
                 << " revoking " << ccap_string(revoking)
                 << " flags=" << flags
                 << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; // snap caps last forever, can't write

  if (in->caps.empty())
    return;   // guard if at end of func

  if ((revoking & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) &&
      (used & CEPH_CAP_FILE_CACHE) && !(used & CEPH_CAP_FILE_BUFFER)) {
    if (_release(in))
      used &= ~CEPH_CAP_FILE_CACHE;
  }

  if (!in->cap_snaps.empty())
    flush_snaps(in);

  if (flags & CHECK_CAPS_NODELAY)
    in->hold_caps_until = utime_t();
  else
    cap_delay_requeue(in);

  utime_t now = ceph_clock_now();

  map<mds_rank_t, Cap*>::iterator it = in->caps.begin();
  while (it != in->caps.end()) {
    mds_rank_t mds = it->first;
    Cap *cap = it->second;
    ++it;

    MetaSession *session = mds_sessions[mds];
    assert(session);

    cap_used = used;
    if (in->auth_cap && cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap->implemented & ~cap->issued;

    ldout(cct, 10) << " cap mds." << mds
                   << " issued " << ccap_string(cap->issued)
                   << " implemented " << ccap_string(cap->implemented)
                   << " revoking " << ccap_string(revoking) << dendl;

    if (in->wanted_max_size > in->max_size &&
        in->wanted_max_size > in->requested_max_size &&
        cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap->issued & CEPH_CAP_FILE_WR) &&
        cap == in->auth_cap &&
        is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
                     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap->implemented & ~cap->issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap->wanted | cap->issued))
      goto ack;

    if (!revoking && unmounting && (cap_used == 0))
      goto ack;

    if (wanted == cap->wanted &&         // mds knows what we want.
        ((cap->issued & ~retain) == 0) &&// and we don't have anything we wouldn't like
        !in->dirty_caps)                 // and we have no dirty caps
      continue;

    if (now < in->hold_caps_until) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      continue;
    }

  ack:
    // re-send old cap/snapcap flushes first.
    if (session->mds_state >= MDSMap::STATE_RECONNECT &&
        session->mds_state < MDSMap::STATE_ACTIVE &&
        session->early_flushing_caps.count(in) == 0) {
      ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
                     << " to mds." << session->mds_num << dendl;
      session->early_flushing_caps.insert(in);
      if (in->cap_snaps.size())
        flush_snaps(in, true);
      if (in->flushing_caps)
        flush_caps(in, session, flags & CHECK_CAPS_SYNCHRONOUS);
    }

    int flushing;
    ceph_tid_t flush_tid;
    if (in->auth_cap == cap && in->dirty_caps) {
      flushing = mark_caps_flushing(in, &flush_tid);
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    send_cap(in, session, cap, flags & CHECK_CAPS_SYNCHRONOUS, cap_used, wanted,
             retain, flushing, flush_tid);
  }
}
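// Usage note: callers that must not defer pass CHECK_CAPS_NODELAY, e.g.
// add_update_cap() below calls check_caps(in, CHECK_CAPS_NODELAY) when a
// non-auth MDS appears to be revoking newly granted caps; flush_caps_sync()
// additionally sets CHECK_CAPS_SYNCHRONOUS on its final call so the MDS
// flushes its journal.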
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << "queue_cap_snap " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    ldout(cct, 10) << "queue_cap_snap already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||
             (used & CEPH_CAP_FILE_WR) ||
             (dirty & CEPH_CAP_ANY_WR)) {
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    assert(capsnapem.second == true); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();

    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;

    if (used & CEPH_CAP_FILE_WR) {
      ldout(cct, 10) << "queue_cap_snap WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << "queue_cap_snap not dirty|writing on " << *in << dendl;
  }
}
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << "finish_cap_snap " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;

  capsnap.dirty |= in->caps_dirty();

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    ldout(cct, 10) << "finish_cap_snap " << *in << " cap_snap " << &capsnap << " used " << used
                   << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
{
  ldout(cct, 10) << "_flushed_cap_snap seq " << seq << " on " << *in << dendl;
  in->cap_snaps.at(seq).dirty_data = 0;
  flush_snaps(in);
}
void Client::flush_snaps(Inode *in, bool all_again)
{
  ldout(cct, 10) << "flush_snaps on " << *in << " all_again " << all_again << dendl;
  assert(in->cap_snaps.size());

  // pick auth mds
  assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;
  int mseq = in->auth_cap->mseq;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    if (!all_again) {
      // only flush once per session
      if (capsnap.flush_tid > 0)
        continue;
    }

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
                   << " follows " << p.first
                   << " size " << capsnap.size
                   << " mtime " << capsnap.mtime
                   << " dirty_data=" << capsnap.dirty_data
                   << " writing=" << capsnap.writing
                   << " on " << *in << dendl;
    if (capsnap.dirty_data || capsnap.writing)
      continue;

    if (capsnap.flush_tid == 0) {
      capsnap.flush_tid = ++last_flush_tid;
      if (!in->flushing_cap_item.is_on_list())
        session->flushing_caps.push_back(&in->flushing_cap_item);
      session->flushing_caps_tids.insert(capsnap.flush_tid);
    }

    MClientCaps *m = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
                                     cap_epoch_barrier);
    m->caller_uid = user_id;
    m->caller_gid = group_id;

    m->set_client_tid(capsnap.flush_tid);
    m->head.snap_follows = p.first;

    m->head.caps = capsnap.issued;
    m->head.dirty = capsnap.dirty;

    m->head.uid = capsnap.uid;
    m->head.gid = capsnap.gid;
    m->head.mode = capsnap.mode;
    m->btime = capsnap.btime;

    m->size = capsnap.size;

    m->head.xattr_version = capsnap.xattr_version;
    ::encode(capsnap.xattrs, m->xattrbl);

    m->ctime = capsnap.ctime;
    m->btime = capsnap.btime;
    m->mtime = capsnap.mtime;
    m->atime = capsnap.atime;
    m->time_warp_seq = capsnap.time_warp_seq;
    m->change_attr = capsnap.change_attr;

    if (capsnap.dirty & CEPH_CAP_FILE_WR) {
      m->inline_version = in->inline_version;
      m->inline_data = in->inline_data;
    }

    assert(!session->flushing_caps_tids.empty());
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

    session->con->send_message(m);
  }
}
void Client::wait_on_list(list<Cond*>& ls)
{
  Cond cond;
  ls.push_back(&cond);
  cond.Wait(client_lock);
  ls.remove(&cond);
}

void Client::signal_cond_list(list<Cond*>& ls)
{
  for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
    (*it)->Signal();
}

void Client::wait_on_context_list(list<Context*>& ls)
{
  Cond cond;
  bool done = false;
  int r;
  ls.push_back(new C_Cond(&cond, &done, &r));
  while (!done)
    cond.Wait(client_lock);
}

void Client::signal_context_list(list<Context*>& ls)
{
  while (!ls.empty()) {
    ls.front()->complete(0);
    ls.pop_front();
  }
}

void Client::wake_inode_waiters(MetaSession *s)
{
  xlist<Cap*>::iterator iter = s->caps.begin();
  while (!iter.end()){
    signal_cond_list((*iter)->inode->waitfor_caps);
    ++iter;
  }
}
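// Pairing note: wait_on_list()/signal_cond_list() and
// wait_on_context_list()/signal_context_list() are the two wake-up idioms used
// throughout this file; a thread parks on e.g. in->waitfor_caps and is woken
// when a cap grant, revoke completion, or session event signals that list.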
// flush dirty data (from objectcache)

class C_Client_CacheInvalidate : public Context  {
private:
  Client *client;
  vinodeno_t ino;
  int64_t offset, length;
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    assert(!client->client_lock.is_locked_by_me());
    client->_async_invalidate(ino, offset, length);
  }
};
void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
{
  if (unmounting)
    return;
  ldout(cct, 10) << "_async_invalidate " << ino << " " << off << "~" << len << dendl;
  ino_invalidate_cb(callback_handle, ino, off, len);
}
void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {

  if (ino_invalidate_cb)
    // we queue the invalidate, which calls the callback and decrements the ref
    async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
}
void Client::_invalidate_inode_cache(Inode *in)
{
  ldout(cct, 10) << "_invalidate_inode_cache " << *in << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    objectcacher->release_set(&in->oset);
    if (!objectcacher->set_is_empty(&in->oset))
      lderr(cct) << "failed to invalidate cache for " << *in << dendl;
  }

  _schedule_invalidate_callback(in, 0, 0);
}

void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
{
  ldout(cct, 10) << "_invalidate_inode_cache " << *in << " " << off << "~" << len << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    vector<ObjectExtent> ls;
    Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
    objectcacher->discard_writeback(&in->oset, ls, nullptr);
  }

  _schedule_invalidate_callback(in, off, len);
}
bool Client::_release(Inode *in)
{
  ldout(cct, 20) << "_release " << *in << dendl;
  if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
    _invalidate_inode_cache(in);
    return true;
  }
  return false;
}
bool Client::_flush(Inode *in, Context *onfinish)
{
  ldout(cct, 10) << "_flush " << *in << dendl;

  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    onfinish->complete(0);
    return true;
  }

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
    objectcacher->purge_set(&in->oset);
    if (onfinish) {
      onfinish->complete(-ENOSPC);
    }
    return true;
  }

  return objectcacher->flush_set(&in->oset, onfinish);
}
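// Usage example (as in get_caps() above): pass a completion that records any
// flush error back on the inode:
//
//   _flush(in, new C_Client_FlushComplete(this, in));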
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  assert(client_lock.is_locked());
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  Mutex flock("Client::_flush_range flock");
  Cond cond;
  bool safe = false;
  Context *onflush = new C_SafeCond(&flock, &cond, &safe);
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
                                      offset, size, onflush);
  if (!ret) {
    // wait for flush
    client_lock.Unlock();
    flock.Lock();
    while (!safe)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
  }
}
void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
{
  //  Mutex::Locker l(client_lock);
  assert(client_lock.is_locked());   // will be called via dispatch() -> objecter -> ...
  Inode *in = static_cast<Inode *>(oset->parent);
  assert(in);
  _flushed(in);
}

void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
// checks common to add_update_cap, handle_cap_grant
void Client::check_cap_issue(Inode *in, Cap *cap, unsigned issued)
{
  unsigned had = in->caps_issued();

  if ((issued & CEPH_CAP_FILE_CACHE) &&
      !(had & CEPH_CAP_FILE_CACHE))
    in->cache_gen++;

  if ((issued & CEPH_CAP_FILE_SHARED) &&
      !(had & CEPH_CAP_FILE_SHARED)) {
    in->shared_gen++;

    if (in->is_dir())
      clear_dir_complete_and_ordered(in, true);
  }
}
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
                            unsigned issued, unsigned seq, unsigned mseq, inodeno_t realm,
                            int flags, const UserPerm& cap_perms)
{
  Cap *cap = 0;
  mds_rank_t mds = mds_session->mds_num;
  if (in->caps.count(mds)) {
    cap = in->caps[mds];

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was sent before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap->seq) <= 0) {
      assert(cap == in->auth_cap);
      assert(cap->cap_id == cap_id);
      seq = cap->seq;
      mseq = cap->mseq;
      issued |= cap->issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  } else {
    mds_session->num_caps++;
    if (!in->is_any_caps()) {
      assert(in->snaprealm == 0);
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      ldout(cct, 15) << "add_update_cap first one, opened snaprealm " << in->snaprealm << dendl;
    }
    in->caps[mds] = cap = new Cap;

    mds_session->caps.push_back(&cap->cap_item);
    cap->session = mds_session;
    cap->inode = in;
    cap->gen = mds_session->cap_gen;
  }

  check_cap_issue(in, cap, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    if (in->auth_cap != cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
        ldout(cct, 10) << "add_update_cap changing auth cap: "
                       << "add myself to new auth MDS' flushing caps list" << dendl;
        adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = cap;
    }
  }

  unsigned old_caps = cap->issued;
  cap->cap_id = cap_id;
  cap->issued = issued;
  cap->implemented |= issued;
  cap->seq = seq;
  cap->issue_seq = seq;
  cap->mseq = mseq;
  cap->gen = mds_session->cap_gen;
  cap->latest_perms = cap_perms;
  ldout(cct, 10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued)
                 << " from mds." << mds
                 << " on " << *in
                 << dendl;

  if ((issued & ~old_caps) && in->auth_cap == cap) {
    // is a non-auth MDS revoking the newly granted caps?
    for (map<mds_rank_t,Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
      if (it->second == cap)
        continue;
      if (it->second->implemented & ~it->second->issued & issued) {
        check_caps(in, CHECK_CAPS_NODELAY);
        break;
      }
    }
  }

  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
void Client::remove_cap(Cap *cap, bool queue_release)
{
  Inode *in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << "remove_cap mds." << mds << " on " << *in << dendl;

  if (queue_release) {
    session->enqueue_cap_release(
      in->ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in->auth_cap == cap) {
    if (in->flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in->flushing_cap_item.remove_myself();
    }
    in->auth_cap = NULL;
  }
  assert(in->caps.count(mds));
  in->caps.erase(mds);

  cap->cap_item.remove_myself();
  delete cap;
  cap = nullptr;

  if (!in->is_any_caps()) {
    ldout(cct, 15) << "remove_cap last one, closing snaprealm " << in->snaprealm << dendl;
    in->snaprealm_item.remove_myself();
    put_snap_realm(in->snaprealm);
    in->snaprealm = 0;
  }
}
void Client::remove_all_caps(Inode *in)
{
  while (!in->caps.empty())
    remove_cap(in->caps.begin()->second, true);
}
void Client::remove_session_caps(MetaSession *s)
{
  ldout(cct, 10) << "remove_session_caps mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    Inode *in = cap->inode;
    bool dirty_caps = false, cap_snaps = false;
    if (in->auth_cap == cap) {
      cap_snaps = !in->cap_snaps.empty();
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
      in->flags |= I_CAP_DROPPED;
    }
    remove_cap(cap, false);
    signal_cond_list(in->waitfor_caps);
    if (cap_snaps) {
      InodeRef tmp_ref(in);
      in->cap_snaps.clear();
    }
    if (dirty_caps) {
      lderr(cct) << "remove_session_caps still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
        num_flushing_caps--;
        in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      put_inode(in);
    }
  }
  s->flushing_caps_tids.clear();
  sync_cond.Signal();
}
int Client::_do_remount(bool retry_on_error)
{
  uint64_t max_retries = cct->_conf->get_val<uint64_t>("mds_max_retries_on_remount_failure");

  errno = 0;
  int r = remount_cb(callback_handle);
  if (r == 0) {
    retries_on_invalidate = 0;
  } else {
    int e = errno;
    client_t whoami = get_nodeid();
    if (r == -1) {
      lderr(cct) <<
          "failed to remount (to trim kernel dentries): "
          "errno = " << e << " (" << strerror(e) << ")" << dendl;
    } else {
      lderr(cct) <<
          "failed to remount (to trim kernel dentries): "
          "return code = " << r << dendl;
    }
    bool should_abort =
      (cct->_conf->get_val<bool>("client_die_on_failed_remount") ||
       cct->_conf->get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
      !(retry_on_error && (++retries_on_invalidate < max_retries));
    if (should_abort && !unmounting) {
      lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
      ceph_abort();
    }
  }
  return r;
}
class C_Client_Remount : public Context  {
private:
  Client *client;
public:
  explicit C_Client_Remount(Client *c) : client(c) {}
  void finish(int r) override {
    assert(r == 0);
    client->_do_remount(true);
  }
};
void Client::_invalidate_kernel_dcache()
{
  if (unmounting)
    return;
  if (can_invalidate_dentries) {
    if (dentry_invalidate_cb && root->dir) {
      for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
           p != root->dir->dentries.end();
           ++p) {
        if (p->second->inode)
          _schedule_invalidate_dentry_callback(p->second, false);
      }
    }
  } else if (remount_cb) {
    // Hacky:
    // when remounting a file system, linux kernel trims all unused dentries in the fs
    remount_finisher.queue(new C_Client_Remount(this));
  }
}
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;
      assert(!dn->inode);
      if (dn->lru_is_expireable())
        unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
void Client::trim_caps(MetaSession *s, uint64_t max)
{
  mds_rank_t mds = s->mds_num;
  size_t caps_size = s->caps.size();
  ldout(cct, 10) << "trim_caps mds." << mds << " max " << max
                 << " caps " << caps_size << dendl;

  uint64_t trimmed = 0;
  auto p = s->caps.begin();
  std::set<Dentry *> to_trim; /* deferring the trim prevents caps other than
                               * the one we're looking at from getting deleted
                               * during traversal. */
  while ((caps_size - trimmed) > max && !p.end()) {
    Cap *cap = *p;
    InodeRef in(cap->inode);

    // Increment p early because it will be invalidated if cap
    // is deleted inside remove_cap
    ++p;

    if (in->caps.size() > 1 && cap != in->auth_cap) {
      int mine = cap->issued | cap->implemented;
      int oissued = in->auth_cap ? in->auth_cap->issued : 0;
      // disposable non-auth cap
      if (!(get_caps_used(in.get()) & ~oissued & mine)) {
        ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
        cap = (remove_cap(cap, true), nullptr);
        trimmed++;
      }
    } else {
      ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
      _trim_negative_child_dentries(in);
      bool all = true;
      set<Dentry*>::iterator q = in->dn_set.begin();
      while (q != in->dn_set.end()) {
        Dentry *dn = *q++;
        if (dn->lru_is_expireable()) {
          if (can_invalidate_dentries &&
              dn->dir->parent_inode->ino == MDS_INO_ROOT) {
            // Only issue one of these per DN for inodes in root: handle
            // others more efficiently by calling for root-child DNs at
            // the end of this function.
            _schedule_invalidate_dentry_callback(dn, true);
          }
          ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
          to_trim.insert(dn);
        } else {
          ldout(cct, 20) << "  not expirable: " << dn->name << dendl;
          all = false;
        }
      }
      if (all && in->ino != MDS_INO_ROOT) {
        ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
        trimmed++;
      }
    }
  }
  ldout(cct, 20) << " trimming queued dentries: " << dendl;
  for (const auto &dn : to_trim) {
    trim_dentry(dn);
  }
  to_trim.clear();

  caps_size = s->caps.size();
  if (caps_size > max)
    _invalidate_kernel_dcache();
}
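// Design note: dentries are collected into to_trim and trimmed only after the
// cap walk finishes, because trimming a dentry can drop the last reference to
// an inode and delete its caps, invalidating the xlist iterator mid-loop;
// remove_cap() has the same hazard, which is why p is incremented early above.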
void Client::force_session_readonly(MetaSession *s)
{
  s->readonly = true;
  for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
    Inode *in = (*p)->inode;
    if (in->caps_wanted() & CEPH_CAP_FILE_WR)
      signal_cond_list(in->waitfor_caps);
  }
}
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  assert(flushing);

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  if (!in->flushing_caps) {
    ldout(cct, 10) << "mark_caps_flushing " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << "mark_caps_flushing (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  in->flushing_caps |= flushing;
  in->mark_caps_clean();

  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s,  MetaSession *new_s)
{
  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    if (capsnap.flush_tid > 0) {
      old_s->flushing_caps_tids.erase(capsnap.flush_tid);
      new_s->flushing_caps_tids.insert(capsnap.flush_tid);
    }
  }
  for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
       it != in->flushing_cap_tids.end();
       ++it) {
    old_s->flushing_caps_tids.erase(it->first);
    new_s->flushing_caps_tids.insert(it->first);
  }
  new_s->flushing_caps.push_back(&in->flushing_cap_item);
}
/*
 * Flush all caps back to the MDS. Because the callers generally wait on the
 * result of this function (syncfs and umount cases), we set
 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
 */
void Client::flush_caps_sync()
{
  ldout(cct, 10) << __func__ << dendl;
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    delayed_list.pop_front();
    if (p.end() && dirty_list.empty())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }

  // other caps, too
  p = dirty_list.begin();
  while (!p.end()) {
    unsigned flags = CHECK_CAPS_NODELAY;
    Inode *in = *p;

    ++p;
    if (p.end())
      flags |= CHECK_CAPS_SYNCHRONOUS;
    check_caps(in, flags);
  }
}
void Client::flush_caps(Inode *in, MetaSession *session, bool sync)
{
  ldout(cct, 10) << "flush_caps " << in << " mds." << session->mds_num << dendl;
  Cap *cap = in->auth_cap;
  assert(cap->session == session);

  for (map<ceph_tid_t,int>::iterator p = in->flushing_cap_tids.begin();
       p != in->flushing_cap_tids.end();
       ++p) {
    bool req_sync = false;

    /* If this is a synchronous request, then flush the journal on last one */
    if (sync && (p->first == in->flushing_cap_tids.rbegin()->first))
      req_sync = true;

    send_cap(in, session, cap, req_sync,
             (get_caps_used(in) | in->caps_dirty()),
             in->caps_wanted(), (cap->issued | cap->implemented),
             p->second, p->first);
  }
}
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  while (in->flushing_caps) {
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    assert(it != in->flushing_cap_tids.end());
    if (it->first > want)
      break;
    ldout(cct, 10) << "wait_sync_caps on " << *in << " flushing "
                   << ccap_string(it->second) << " want " << want
                   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
void Client::wait_sync_caps(ceph_tid_t want)
{
 retry:
  ldout(cct, 10) << "wait_sync_caps want " << want << " (last is " << last_flush_tid << ", "
                 << num_flushing_caps << " total flushing)" << dendl;
  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
       p != mds_sessions.end();
       ++p) {
    MetaSession *s = p->second;
    if (s->flushing_caps_tids.empty())
      continue;
    ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
    if (oldest_tid <= want) {
      ldout(cct, 10) << " waiting on mds." << p->first << " tid " << oldest_tid
                     << " (want " << want << ")" << dendl;
      sync_cond.Wait(client_lock);
      goto retry;
    }
  }
}
void Client::kick_flushing_caps(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << "kick_flushing_caps mds." << mds << dendl;

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    if (session->early_flushing_caps.count(in))
      continue;
    ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }

  session->early_flushing_caps.clear();
}
void Client::early_kick_flushing_caps(MetaSession *session)
{
  session->early_flushing_caps.clear();

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    assert(in->auth_cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps)
      continue;

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
                   << " to mds." << session->mds_num << dendl;

    session->early_flushing_caps.insert(in);

    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }
}
void Client::kick_maxsize_requests(MetaSession *session)
{
  xlist<Cap*>::iterator iter = session->caps.begin();
  while (!iter.end()) {
    (*iter)->inode->requested_max_size = 0;
    (*iter)->inode->wanted_max_size = 0;
    signal_cond_list((*iter)->inode->waitfor_caps);
    ++iter;
  }
}
void SnapRealm::build_snap_context()
{
  set<snapid_t> snaps;
  snapid_t max_seq = seq;

  // start with prior_parents?
  for (unsigned i = 0; i < prior_parent_snaps.size(); i++)
    snaps.insert(prior_parent_snaps[i]);

  // current parent's snaps
  if (pparent) {
    const SnapContext& psnapc = pparent->get_snap_context();
    for (unsigned i = 0; i < psnapc.snaps.size(); i++)
      if (psnapc.snaps[i] >= parent_since)
        snaps.insert(psnapc.snaps[i]);
    if (psnapc.seq > max_seq)
      max_seq = psnapc.seq;
  }

  // my snaps
  for (unsigned i = 0; i < my_snaps.size(); i++)
    snaps.insert(my_snaps[i]);

  cached_snap_context.seq = max_seq;
  cached_snap_context.snaps.resize(0);
  cached_snap_context.snaps.reserve(snaps.size());
  for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
    cached_snap_context.snaps.push_back(*p);
}
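/*
 * Worked example with illustrative values: suppose this realm was
 * re-parented at parent_since = 30, prior_parent_snaps = {10, 20}, the
 * current parent's context is (seq = 50, snaps = [40, 30, 25]) and
 * my_snaps = {35}.  Parent snaps >= 30 survive (40, 30), 25 is dropped,
 * and the cached context becomes (seq = 50, snaps = [40, 35, 30, 20, 10]):
 * snapids in descending order, which is the layout a SnapContext expects.
 */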
void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
{
  list<SnapRealm*> q;
  q.push_back(realm);

  while (!q.empty()) {
    realm = q.front();
    q.pop_front();

    ldout(cct, 10) << "invalidate_snaprealm_and_children " << *realm << dendl;
    realm->invalidate_cache();

    for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
         p != realm->pchildren.end();
         ++p)
      q.push_back(*p);
  }
}
SnapRealm *Client::get_snap_realm(inodeno_t r)
{
  SnapRealm *realm = snap_realms[r];
  if (!realm)
    snap_realms[r] = realm = new SnapRealm(r);
  ldout(cct, 20) << "get_snap_realm " << r << " " << realm << " " << realm->nref
                 << " -> " << (realm->nref + 1) << dendl;
  realm->nref++;
  return realm;
}

SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
{
  if (snap_realms.count(r) == 0) {
    ldout(cct, 20) << "get_snap_realm_maybe " << r << " fail" << dendl;
    return NULL;
  }
  SnapRealm *realm = snap_realms[r];
  ldout(cct, 20) << "get_snap_realm_maybe " << r << " " << realm << " " << realm->nref
                 << " -> " << (realm->nref + 1) << dendl;
  realm->nref++;
  return realm;
}
void Client::put_snap_realm(SnapRealm *realm)
{
  ldout(cct, 20) << "put_snap_realm " << realm->ino << " " << realm
                 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
  if (--realm->nref == 0) {
    snap_realms.erase(realm->ino);
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    delete realm;
  }
}
bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
{
  if (realm->parent != parent) {
    ldout(cct, 10) << "adjust_realm_parent " << *realm
                   << " " << realm->parent << " -> " << parent << dendl;
    realm->parent = parent;
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    realm->pparent = get_snap_realm(parent);
    realm->pparent->pchildren.insert(realm);
    return true;
  }
  return false;
}
static bool has_new_snaps(const SnapContext& old_snapc,
                          const SnapContext& new_snapc)
{
  return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
}
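/*
 * This works because SnapContext::snaps is kept in descending order, so
 * snaps[0] is the newest snapid.  E.g. old_snapc.seq = 5 against
 * new_snapc.snaps = [7, 3, 1] means a snapshot (7) was created after
 * everything the client has flushed so far, so dirty data must be
 * preserved via queue_cap_snap().
 */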
void Client::update_snap_trace(bufferlist& bl, SnapRealm **realm_ret, bool flush)
{
  SnapRealm *first_realm = NULL;
  ldout(cct, 10) << "update_snap_trace len " << bl.length() << dendl;

  map<SnapRealm*, SnapContext> dirty_realms;

  bufferlist::iterator p = bl.begin();
  while (!p.end()) {
    SnapRealmInfo info;
    ::decode(info, p);
    SnapRealm *realm = get_snap_realm(info.ino());

    bool invalidate = false;

    if (info.seq() > realm->seq) {
      ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq() << " > " << realm->seq
                     << dendl;

      if (flush) {
        // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
        //  flush me + children
        list<SnapRealm*> q;
        q.push_back(realm);
        while (!q.empty()) {
          SnapRealm *realm = q.front();
          q.pop_front();

          for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
               p != realm->pchildren.end();
               ++p)
            q.push_back(*p);

          if (dirty_realms.count(realm) == 0) {
            realm->nref++;
            dirty_realms[realm] = realm->get_snap_context();
          }
        }
      }

      // update
      realm->seq = info.seq();
      realm->created = info.created();
      realm->parent_since = info.parent_since();
      realm->prior_parent_snaps = info.prior_parent_snaps;
      realm->my_snaps = info.my_snaps;
      invalidate = true;
    }

    // _always_ verify parent
    if (adjust_realm_parent(realm, info.parent()))
      invalidate = true;

    if (invalidate) {
      invalidate_snaprealm_and_children(realm);
      ldout(cct, 15) << "update_snap_trace " << *realm << " self|parent updated" << dendl;
      ldout(cct, 15) << "  snapc " << realm->get_snap_context() << dendl;
    } else {
      ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq()
                     << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
    }

    if (!first_realm)
      first_realm = realm;
    else
      put_snap_realm(realm);
  }

  for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
       q != dirty_realms.end();
       ++q) {
    SnapRealm *realm = q->first;
    // if there are new snaps ?
    if (has_new_snaps(q->second, realm->get_snap_context())) {
      ldout(cct, 10) << " flushing caps on " << *realm << dendl;
      xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
      while (!r.end()) {
        Inode *in = *r;
        ++r;
        queue_cap_snap(in, q->second);
      }
    } else {
      ldout(cct, 10) << " no new snap on " << *realm << dendl;
    }
    put_snap_realm(realm);
  }

  if (realm_ret)
    *realm_ret = first_realm;
  else
    put_snap_realm(first_realm);
}
void Client::handle_snap(MClientSnap *m)
{
  ldout(cct, 10) << "handle_snap " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    assert(m->head.split);
    SnapRealmInfo info;
    bufferlist::iterator p = m->bl.begin();
    ::decode(info, p);
    assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (vector<inodeno_t>::iterator p = m->split_inos.begin();
         p != m->split_inos.end();
         ++p) {
      vinodeno_t vino(*p, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
        Inode *in = inode_map[vino];
        if (!in->snaprealm || in->snaprealm == realm)
          continue;
        if (in->snaprealm->created > info.created()) {
          ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
                         << *in->snaprealm << dendl;
          continue;
        }
        ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;

        in->snaprealm_item.remove_myself();
        to_move[in] = in->snaprealm->get_snap_context();
        put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (vector<inodeno_t>::iterator p = m->split_realms.begin();
         p != m->split_realms.end();
         ++p) {
      ldout(cct, 10) << "adjusting snaprealm " << *p << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(*p);
      if (!child)
        continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
        queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }

  m->put();
}
void Client::handle_quota(MClientQuota *m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  got_mds_push(session);

  ldout(cct, 10) << "handle_quota " << *m << " from mds." << mds << dendl;

  vinodeno_t vino(m->ino, CEPH_NOSNAP);
  if (inode_map.count(vino)) {
    Inode *in = inode_map[vino];
    if (in) {
      in->quota = m->quota;
      in->rstat = m->rstat;
    }
  }

  m->put();
}
void Client::handle_caps(MClientCaps *m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    m->put();
    return;
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session);

  m->clear_payload();  // for if/when we send back to MDS

  Inode *in = 0;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino))
    in = inode_map[vino];
  if (!in) {
    if (m->get_op() == CEPH_CAP_OP_IMPORT) {
      ldout(cct, 5) << "handle_caps don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
      session->enqueue_cap_release(m->get_ino(), m->get_cap_id(), m->get_seq(),
                                   m->get_mseq(), cap_epoch_barrier);
    } else {
      ldout(cct, 5) << "handle_caps don't have vino " << vino << ", dropping" << dendl;
    }
    m->put();

    // in case the mds is waiting on e.g. a revocation
    flush_cap_releases();
    return;
  }

  switch (m->get_op()) {
  case CEPH_CAP_OP_EXPORT:
    return handle_cap_export(session, in, m);
  case CEPH_CAP_OP_FLUSHSNAP_ACK:
    return handle_cap_flushsnap_ack(session, in, m);
  case CEPH_CAP_OP_IMPORT:
    handle_cap_import(session, in, m);
  }

  if (in->caps.count(mds) == 0) {
    ldout(cct, 5) << "handle_caps don't have " << *in << " cap on mds." << mds << dendl;
    m->put();
    return;
  }

  Cap *cap = in->caps[mds];

  switch (m->get_op()) {
  case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
  case CEPH_CAP_OP_IMPORT:
  case CEPH_CAP_OP_REVOKE:
  case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, cap, m);
  case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, cap, m);
  default:
    m->put();
  }
}
void Client::handle_cap_import(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_import ino " << m->get_ino() << " mseq " << m->get_mseq()
                << " IMPORT from mds." << mds << dendl;

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  if (m->peer.cap_id && in->caps.count(peer_mds)) {
    cap = in->caps[peer_mds];
    if (cap)
      cap_perms = cap->latest_perms;
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  add_update_cap(in, session, m->get_cap_id(),
                 m->get_caps(), m->get_seq(), m->get_mseq(), m->get_realm(),
                 CEPH_CAP_FLAG_AUTH, cap_perms);

  if (cap && cap->cap_id == m->peer.cap_id) {
    remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session->mds_num == mds) {
    // reflush any/all caps (if we are now the auth_cap)
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }
}
void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << "handle_cap_export ino " << m->get_ino() << " mseq " << m->get_mseq()
                << " EXPORT from mds." << mds << dendl;

  Cap *cap = NULL;
  if (in->caps.count(mds))
    cap = in->caps[mds];

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);

  if (cap && cap->cap_id == m->get_cap_id()) {
    if (m->peer.cap_id) {
      MetaSession *tsession = _get_or_open_mds_session(peer_mds);
      if (in->caps.count(peer_mds)) {
        Cap *tcap = in->caps[peer_mds];
        if (tcap->cap_id == m->peer.cap_id &&
            ceph_seq_cmp(tcap->seq, m->peer.seq) < 0) {
          tcap->cap_id = m->peer.cap_id;
          tcap->seq = m->peer.seq - 1;
          tcap->issue_seq = tcap->seq;
          tcap->issued |= cap->issued;
          tcap->implemented |= cap->issued;
          if (cap == in->auth_cap)
            in->auth_cap = tcap;
          if (in->auth_cap == tcap && in->flushing_cap_item.is_on_list())
            adjust_session_flushing_caps(in, session, tsession);
        }
      } else {
        add_update_cap(in, tsession, m->peer.cap_id, cap->issued,
                       m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
                       cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
                       cap->latest_perms);
      }
    } else {
      if (cap == in->auth_cap)
        in->flags |= I_CAP_DROPPED;
    }

    remove_cap(cap, false);
  }

  m->put();
}
void Client::handle_cap_trunc(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  assert(in->caps[mds]);

  ldout(cct, 10) << "handle_cap_trunc on ino " << *in
                 << " size " << in->size << " -> " << m->get_size()
                 << dendl;

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  update_inode_file_size(in, issued, m->get_size(),
                         m->get_truncate_seq(), m->get_truncate_size());
  m->put();
}
void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  int dirty = m->get_dirty();
  int cleaned = 0;
  int flushed = 0;

  for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
       it != in->flushing_cap_tids.end(); ) {
    if (it->first == flush_ack_tid)
      cleaned = it->second;
    if (it->first <= flush_ack_tid) {
      session->flushing_caps_tids.erase(it->first);
      in->flushing_cap_tids.erase(it++);
      ++flushed;
      continue;
    }
    cleaned &= ~it->second;
    if (!cleaned)
      break;
    ++it;
  }

  ldout(cct, 5) << "handle_cap_flush_ack mds." << session->mds_num
                << " cleaned " << ccap_string(cleaned) << " on " << *in
                << " with " << ccap_string(dirty) << dendl;

  if (flushed) {
    signal_cond_list(in->waitfor_caps);
    if (session->flushing_caps_tids.empty() ||
        *session->flushing_caps_tids.begin() > flush_ack_tid)
      sync_cond.Signal();
  }

  if (!dirty) {
    in->cap_dirtier_uid = -1;
    in->cap_dirtier_gid = -1;
  }

  if (!cleaned) {
    ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
  } else {
    if (in->flushing_caps) {
      ldout(cct, 5) << "  flushing_caps " << ccap_string(in->flushing_caps)
                    << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
      in->flushing_caps &= ~cleaned;
      if (in->flushing_caps == 0) {
        ldout(cct, 10) << " " << *in << " !flushing" << dendl;
        num_flushing_caps--;
        if (in->cap_snaps.empty())
          in->flushing_cap_item.remove_myself();
      }
      if (!in->caps_dirty())
        put_inode(in);
    }
  }

  m->put();
}
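/*
 * Example of the tid bookkeeping above (illustrative values): with
 * flushing_cap_tids = {10: AxFw, 12: Fw} and an ack for tid 12, the exact
 * match supplies `cleaned`, every tid <= 12 is retired (the MDS acks
 * flushes in tid order), and any bits still pending on tids greater than
 * flush_ack_tid are masked back out of `cleaned` before it is applied to
 * in->flushing_caps.
 */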
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  assert(in->caps[mds]);
  snapid_t follows = m->get_snap_follows();

  if (in->cap_snaps.count(follows)) {
    CapSnap &capsnap = in->cap_snaps.at(follows);
    if (m->get_client_tid() != capsnap.flush_tid) {
      ldout(cct, 10) << " tid " << m->get_client_tid() << " != " << capsnap.flush_tid << dendl;
    } else {
      ldout(cct, 5) << "handle_cap_flushedsnap mds." << mds << " flushed snap follows " << follows
                    << " on " << *in << dendl;
      InodeRef tmp_ref;
      if (in->get_num_ref() == 1)
        tmp_ref = in; // make sure inode not get freed while erasing item from in->cap_snaps
      if (in->flushing_caps == 0 && in->cap_snaps.empty())
        in->flushing_cap_item.remove_myself();
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->cap_snaps.erase(follows);
    }
  } else {
    ldout(cct, 5) << "handle_cap_flushedsnap DUP(?) mds." << mds << " flushed snap follows " << follows
                  << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }

  m->put();
}
class C_Client_DentryInvalidate : public Context  {
private:
  Client *client;
  vinodeno_t dirino;
  vinodeno_t ino;
  string name;
public:
  C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
    client(c), name(dn->name) {
    if (client->use_faked_inos()) {
      dirino.ino = dn->dir->parent_inode->faked_ino;
      if (del)
        ino.ino = dn->inode->faked_ino;
    } else {
      dirino = dn->dir->parent_inode->vino();
      if (del)
        ino = dn->inode->vino();
    }
    if (!del)
      ino.ino = inodeno_t();
  }
  void finish(int r) override {
    // _async_dentry_invalidate is responsible for its own locking
    assert(!client->client_lock.is_locked_by_me());
    client->_async_dentry_invalidate(dirino, ino, name);
  }
};
void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
{
  if (unmounting)
    return;
  ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino
                 << " in dir " << dirino << dendl;
  dentry_invalidate_cb(callback_handle, dirino, ino, name);
}

void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
{
  if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
    async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
}
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
         p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
        _try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
        unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;
    }
  }

  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0 && in->ll_ref > 0 && sched_inval) {
    set<Dentry*>::iterator q = in->dn_set.begin();
    while (q != in->dn_set.end()) {
      Dentry *dn = *q++;
      // FIXME: we play lots of unlink/link tricks when handling MDS replies,
      //        so in->dn_set doesn't always reflect the state of kernel's dcache.
      _schedule_invalidate_dentry_callback(dn, true);
      unlink(dn, true, true);
    }
  }
}
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();

  const int old_caps = cap->issued;
  const int new_caps = m->get_caps();
  ldout(cct, 5) << "handle_cap_grant on in " << m->get_ino()
                << " mds." << mds << " seq " << m->get_seq()
                << " caps now " << ccap_string(new_caps)
                << " was " << ccap_string(old_caps) << dendl;
  cap->seq = m->get_seq();
  cap->gen = session->cap_gen;

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();

  if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((new_caps & CEPH_CAP_LINK_SHARED) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = m->head.nlink;
    if (in->nlink == 0 &&
        (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
      deleted_inode = true;
  }
  if (!(issued & CEPH_CAP_XATTR_EXCL) &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    bufferlist::iterator p = m->xattrbl.begin();
    ::decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }

  if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
    in->dirstat.nfiles = m->get_nfiles();
    in->dirstat.nsubdirs = m->get_nsubdirs();
  }

  if (new_caps & CEPH_CAP_ANY_RD) {
    update_inode_file_time(in, issued, m->get_time_warp_seq(),
                           m->get_ctime(), m->get_mtime(), m->get_atime());
  }

  if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
    in->layout = m->get_layout();
    update_inode_file_size(in, issued, m->get_size(),
                           m->get_truncate_seq(), m->get_truncate_size());
  }

  if (m->inline_version > in->inline_version) {
    in->inline_data = m->inline_data;
    in->inline_version = m->inline_version;
  }

  /* always take a newer change attr */
  if (m->get_change_attr() > in->change_attr)
    in->change_attr = m->get_change_attr();

  // max_size
  if (cap == in->auth_cap &&
      (new_caps & CEPH_CAP_ANY_FILE_WR) &&
      (m->get_max_size() != in->max_size)) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if (m->get_op() == CEPH_CAP_OP_IMPORT && m->get_wanted() != wanted)
    check = true;

  check_cap_issue(in, cap, new_caps);

  // update caps
  int revoked = old_caps & ~new_caps;
  if (revoked) {
    ldout(cct, 10) << "  revocation of " << ccap_string(revoked) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // recall delegations if we're losing caps necessary for them
    if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
      in->recall_deleg(false);
    else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
      in->recall_deleg(true);

    if ((used & revoked & CEPH_CAP_FILE_BUFFER) &&
        !_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if (revoked & CEPH_CAP_FILE_CACHE) {
      if (_release(in))
        check = true;
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
    }
  } else if (old_caps == new_caps) {
    ldout(cct, 10) << "  caps unchanged at " << ccap_string(old_caps) << dendl;
  } else {
    ldout(cct, 10) << "  grant, new caps are " << ccap_string(new_caps & ~old_caps) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (map<mds_rank_t, Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
        if (it->second == cap)
          continue;
        if (it->second->implemented & ~it->second->issued & new_caps) {
          check = true;
          break;
        }
      }
    }
  }

  if (check)
    check_caps(in, 0);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);

  m->put();
}
int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
{
  if (perms.uid() == 0)
    return 0;

  if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
    int ret = _posix_acl_permission(in, perms, want);
    if (ret != -EAGAIN)
      return ret;
  }

  // check permissions before doing anything else
  if (!in->check_mode(perms, want))
    return -EACCES;
  return 0;
}
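/*
 * Worked example (illustrative): want = MAY_READ|MAY_WRITE on an inode
 * with mode 0640, uid 1000, gid 100.  A caller with uid 1000 is the owner
 * and needs rw- in the owner bits, which 6 (rw-) satisfies.  A caller with
 * uid 2000 / gid 100 falls through to the group bits (r-- only), so
 * check_mode() fails and -EACCES is returned; when POSIX ACLs are in play,
 * _posix_acl_permission() gets the first word and -EAGAIN from it means
 * "fall back to the mode bits".
 */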
int Client::xattr_permission(Inode *in, const char *name, unsigned want,
                             const UserPerm& perms)
{
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  r = 0;
  if (strncmp(name, "system.", 7) == 0) {
    if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
      r = -EPERM;
  } else {
    r = inode_permission(in, perms, want);
  }
out:
  ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
ostream& operator<<(ostream &out, const UserPerm& perm) {
  out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
  return out;
}
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
                        const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  r = -EPERM;
  if (mask & CEPH_SETATTR_UID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
        (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
              CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
        check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
        check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
        goto out;
      } else {
        r = inode_permission(in, perms, MAY_WRITE);
        if (r < 0)
          goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
int Client::may_open(Inode *in, int flags, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  unsigned want = 0;

  if ((flags & O_ACCMODE) == O_WRONLY)
    want = MAY_WRITE;
  else if ((flags & O_ACCMODE) == O_RDWR)
    want = MAY_READ | MAY_WRITE;
  else if ((flags & O_ACCMODE) == O_RDONLY)
    want = MAY_READ;
  if (flags & O_TRUNC)
    want |= MAY_WRITE;

  int r = 0;
  switch (in->mode & S_IFMT) {
    case S_IFLNK:
      r = -ELOOP;
      goto out;
    case S_IFDIR:
      if (want & MAY_WRITE) {
        r = -EISDIR;
        goto out;
      }
      break;
  }

  r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  r = inode_permission(in, perms, want);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
int Client::may_lookup(Inode *dir, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
  int r = _getattr_for_perm(dir, perms);
  if (r < 0)
    goto out;

  r = inode_permission(dir, perms, MAY_EXEC);
out:
  ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
  return r;
}
int Client::may_create(Inode *dir, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
  int r = _getattr_for_perm(dir, perms);
  if (r < 0)
    goto out;

  r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
out:
  ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
  return r;
}
int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
  int r = _getattr_for_perm(dir, perms);
  if (r < 0)
    goto out;

  r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
  if (r < 0)
    goto out;

  /* 'name == NULL' means rmsnap */
  if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
    InodeRef otherin;
    r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
    if (r < 0)
      goto out;
    if (dir->uid != perms.uid() && otherin->uid != perms.uid())
      r = -EPERM;
  }
out:
  ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
  return r;
}
int Client::may_hardlink(Inode *in, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (perms.uid() == 0 || perms.uid() == in->uid) {
    r = 0;
    goto out;
  }

  r = -EPERM;
  if (!S_ISREG(in->mode))
    goto out;

  if (in->mode & S_ISUID)
    goto out;

  if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
    goto out;

  r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
{
  int mask = CEPH_STAT_CAP_MODE;
  bool force = false;
  if (acl_type != NO_ACL) {
    mask |= CEPH_STAT_CAP_XATTR;
    force = in->xattr_version == 0;
  }
  return _getattr(in, mask, perms, force);
}
vinodeno_t Client::_get_vino(Inode *in)
{
  /* The caller must hold the client lock */
  return vinodeno_t(in->ino, in->snapid);
}

inodeno_t Client::_get_inodeno(Inode *in)
{
  /* The caller must hold the client lock */
  return in->ino;
}
/**
 * Resolve an MDS spec to a list of MDS daemon GIDs.
 *
 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
 * It may be '*' in which case it matches all GIDs.
 *
 * If no error is returned, the `targets` vector will be populated with at least
 * one MDS.
 */
int Client::resolve_mds(
    const std::string &mds_spec,
    std::vector<mds_gid_t> *targets)
{
  assert(fsmap);
  assert(targets != nullptr);

  mds_role_t role;
  std::stringstream ss;
  int role_r = fsmap->parse_role(mds_spec, &role, ss);
  if (role_r == 0) {
    // We got a role, resolve it to a GID
    ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
                   << role << "'" << dendl;
    targets->push_back(
        fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
    return 0;
  }

  std::string strtol_err;
  long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
  if (strtol_err.empty()) {
    // It is a possible GID
    const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
    if (fsmap->gid_exists(mds_gid)) {
      ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
      targets->push_back(mds_gid);
    } else {
      lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
                 << dendl;
      return -ENOENT;
    }
  } else if (mds_spec == "*") {
    // It is a wildcard: use all MDSs
    const auto mds_info = fsmap->get_mds_info();

    if (mds_info.empty()) {
      lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
      return -ENOENT;
    }

    for (const auto i : mds_info) {
      targets->push_back(i.first);
    }
  } else {
    // It did not parse as an integer, it is not a wildcard, it must be a name
    const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
    if (mds_gid == 0) {
      lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
      lderr(cct) << "FSMap: " << *fsmap << dendl;
      return -ENOENT;
    } else {
      ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
                     << "' to GID " << mds_gid << dendl;
      targets->push_back(mds_gid);
    }
  }

  return 0;
}
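/*
 * Usage sketch (illustrative only): resolve_mds() feeds mds_command(), e.g.
 *
 *   std::vector<mds_gid_t> targets;
 *   int r = resolve_mds("*", &targets);  // every daemon in the FSMap
 *   // r == 0 implies targets.size() >= 1
 *
 * Other accepted specs include a numeric GID ("4151"), a rank ("0"), a
 * filesystem:rank pair, or a daemon name.
 */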
/**
 * Authenticate with mon and establish global ID
 */
int Client::authenticate()
{
  assert(client_lock.is_locked_by_me());

  if (monclient->is_authenticated()) {
    return 0;
  }

  client_lock.Unlock();
  int r = monclient->authenticate(cct->_conf->client_mount_timeout);
  client_lock.Lock();
  if (r < 0) {
    return r;
  }

  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses.  We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    client_lock.Unlock();
    r = cond.wait();
    client_lock.Lock();
  } while (r == -EAGAIN);

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap_user);
    assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    assert(fsmap);
    assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
                 << fsmap_latest << dendl;
  return 0;
}
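/*
 * Note: the get_version() wait above drops client_lock so that incoming
 * monitor messages can still be dispatched; -EAGAIN from the mon simply
 * means "ask again", hence the do/while retry loop.
 */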
/**
 * @mds_spec one of ID, rank, GID, "*"
 */
int Client::mds_command(
    const std::string &mds_spec,
    const vector<string>& cmd,
    const bufferlist& inbl,
    bufferlist *outbl,
    string *outs,
    Context *onfinish)
{
  Mutex::Locker lock(client_lock);

  int r = authenticate();
  if (r < 0)
    return r;

  r = fetch_fsmap(false);
  if (r < 0)
    return r;

  // Look up MDS target(s) of the command
  std::vector<mds_gid_t> targets;
  r = resolve_mds(mds_spec, &targets);
  if (r < 0)
    return r;

  // If daemons are laggy, we won't send them commands.  If all
  // are laggy then we fail.
  std::vector<mds_gid_t> non_laggy;
  for (const auto gid : targets) {
    const auto info = fsmap->get_info_gid(gid);
    if (!info.laggy()) {
      non_laggy.push_back(gid);
    }
  }
  if (non_laggy.size() == 0) {
    *outs = "All targeted MDS daemons are laggy";
    return -ENOENT;
  }

  if (metadata.empty()) {
    // We are called on an unmounted client, so metadata
    // won't be initialized yet.
    populate_metadata("");
  }

  // Send commands to targets
  C_GatherBuilder gather(cct, onfinish);
  for (const auto target_gid : non_laggy) {
    const auto info = fsmap->get_info_gid(target_gid);

    // Open a connection to the target MDS
    entity_inst_t inst = info.get_inst();
    ConnectionRef conn = messenger->get_connection(inst);

    // Generate MDSCommandOp state
    auto &op = command_table.start_command();

    op.on_finish = gather.new_sub();
    op.cmd = cmd;
    op.outbl = outbl;
    op.outs = outs;
    op.inbl = inbl;
    op.mds_gid = target_gid;
    op.con = conn;

    ldout(cct, 4) << __func__ << ": new command op to " << target_gid
                  << " tid=" << op.tid << cmd << dendl;

    // Construct and send MCommand
    MCommand *m = op.get_message(monclient->get_fsid());
    conn->send_message(m);
  }
  gather.activate();

  return 0;
}
void Client::handle_command_reply(MCommandReply *m)
{
  ceph_tid_t const tid = m->get_tid();

  ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;

  if (!command_table.exists(tid)) {
    ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
    m->put();
    return;
  }

  auto &op = command_table.get_command(tid);
  if (op.outbl) {
    op.outbl->claim(m->get_data());
  }
  if (op.outs) {
    *op.outs = m->rs;
  }

  if (op.on_finish) {
    op.on_finish->complete(m->r);
  }

  command_table.erase(tid);

  m->put();
}
// -------------------
// MOUNT

int Client::mount(const std::string &mount_root, const UserPerm& perms,
                  bool require_mds)
{
  Mutex::Locker lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  int r = authenticate();
  if (r < 0) {
    lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  std::string want = "mdsmap";
  const auto &mds_ns = cct->_conf->client_mds_namespace;
  if (!mds_ns.empty()) {
    r = fetch_fsmap(true);
    if (r < 0)
      return r;
    fs_cluster_id_t cid = fsmap_user->get_fs_cid(mds_ns);
    if (cid == FS_CLUSTER_ID_NONE)
      return -ENOENT;

    std::ostringstream oss;
    oss << want << "." << cid;
    want = oss.str();
  }
  ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;

  monclient->sub_want(want, 0, 0);
  monclient->renew_subs();

  tick(); // start tick

  if (require_mds) {
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
        // Error out
        ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
        return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
        // Continue to mount
        break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
        // Else, wait.  MDSMonitor will update the map to bring
        // us to a conclusion eventually.
        wait_on_list(waiting_for_mdsmap);
      } else {
        // Unexpected value!
        ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -EACCES && root) {
        ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
        break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  mounted = true;

  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
void Client::_close_sessions()
{
  while (!mds_sessions.empty()) {
    // send session closes!
    for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
         p != mds_sessions.end();
         ++p) {
      if (p->second->state != MetaSession::STATE_CLOSING) {
        _close_mds_session(p->second);
      }
    }

    // wait for sessions to close
    ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
    mount_cond.Wait(client_lock);
  }
}
void Client::flush_mdlog_sync()
{
  if (mds_requests.empty())
    return;
  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
       p != mds_sessions.end();
       ++p) {
    MetaSession *s = p->second;
    flush_mdlog(s);
  }
}
void Client::flush_mdlog(MetaSession *session)
{
  // Only send this to Luminous or newer MDS daemons, older daemons
  // will crash if they see an unknown CEPH_SESSION_* value in this msg.
  const uint64_t features = session->con->get_features();
  if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
    MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
    session->con->send_message(m);
  }
}
void Client::_unmount()
{
  if (unmounting)
    return;

  ldout(cct, 2) << "unmounting" << dendl;
  unmounting = true;

  flush_mdlog_sync(); // flush the mdlog for pending requests, if any
  while (!mds_requests.empty()) {
    ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
    mount_cond.Wait(client_lock);
  }

  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  if (blacklisted) {
    ldout(cct, 0) << " skipping clean shutdown, we are blacklisted" << dendl;

    if (cct->_conf->client_oc) {
      // Purge all cached data so that ObjectCacher doesn't get hung up
      // trying to flush it.  ObjectCacher's behaviour on EBLACKLISTED
      // is to just leave things marked dirty
      // (http://tracker.ceph.com/issues/9105)
      for (const auto &i : inode_map) {
        objectcacher->purge_set(&(i.second->oset));
      }
    }

    mounted = false;
    return;
  }

  while (unsafe_sync_write > 0) {
    ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
    mount_cond.Wait(client_lock);
  }

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
    for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
         p != inode_map.end();
         p = next) {
      next = p;
      ++next;
      Inode *in = p->second;
      if (!in) {
        ldout(cct, 0) << "null inode_map entry ino " << p->first << dendl;
        assert(in);
      }
      if (!in->caps.empty()) {
        InodeRef tmp_ref(in);
        _release(in);
        _flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  flush_caps_sync();
  wait_sync_caps(last_flush_tid);

  // empty lru cache
  trim_cache();

  while (lru.lru_get_size() > 0 ||
         !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
                  << "+" << inode_map.size() << " items"
                  << ", waiting (for caps to release?)"
                  << dendl;
    utime_t until = ceph_clock_now() + utime_t(5, 0);
    int r = mount_cond.WaitUntil(client_lock, until);
    if (r == ETIMEDOUT) {
      dump_cache(NULL);
    }
  }
  assert(lru.lru_get_size() == 0);
  assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  ldout(cct, 2) << "unmounted." << dendl;
}
void Client::unmount()
{
  Mutex::Locker lock(client_lock);
  _unmount();
}
void Client::flush_cap_releases()
{
  // send any cap releases
  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
       p != mds_sessions.end();
       ++p) {
    if (p->second->release && mdsmap->is_clientreplay_or_active_or_stopping(
          p->first)) {
      if (cct->_conf->client_inject_release_failure) {
        ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
        p->second->release->put();
      } else {
        p->second->con->send_message(p->second->release);
      }
      p->second->release = 0;
    }
  }
}
void Client::tick()
{
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    sleep(cct->_conf->client_debug_inject_tick_delay);
    assert(0 == cct->_conf->set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf->apply_changes(NULL);
  }

  ldout(cct, 21) << "tick" << dendl;
  tick_event = timer.add_event_after(
    cct->_conf->client_tick_interval,
    new FunctionContext([this](int) {
        // Called back via Timer, which takes client_lock for us
        assert(client_lock.is_locked_by_me());
        tick();
      }));

  utime_t now = ceph_clock_now();

  if (!mounted && !mds_requests.empty()) {
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->Signal();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
           p != mds_sessions.end();
           ++p)
        signal_context_list(p->second->waiting_for_open);
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}
void Client::renew_caps()
{
  ldout(cct, 10) << "renew_caps()" << dendl;
  last_cap_renew = ceph_clock_now();

  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
       p != mds_sessions.end();
       ++p) {
    ldout(cct, 15) << "renew_caps requesting from mds." << p->first << dendl;
    if (mdsmap->get_state(p->first) >= MDSMap::STATE_REJOIN)
      renew_caps(p->second);
  }
}

void Client::renew_caps(MetaSession *session)
{
  ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
  session->last_cap_renew_request = ceph_clock_now();
  uint64_t seq = ++session->cap_renew_seq;
  session->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
}
// ===============================================================
// high level (POSIXy) interface

int Client::_do_lookup(Inode *dir, const string& name, int mask,
                       InodeRef *target, const UserPerm& perms)
{
  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
    mask |= DEBUG_GETATTR_CAPS;
  req->head.args.getattr.mask = mask;

  ldout(cct, 10) << "_do_lookup on " << path << dendl;

  int r = make_request(req, perms, target);
  ldout(cct, 10) << "_do_lookup res is " << r << dendl;
  return r;
}
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
                    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  if (dname == "..") {
    if (dir->dn_set.empty())
      *target = dir;
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << "_lookup have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
                   << " seq " << dn->lease_seq
                   << dendl;

    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
          dn->lease_ttl > now &&
          mds_sessions.count(dn->lease_mds)) {
        MetaSession *s = mds_sessions[dn->lease_mds];
        if (s->cap_ttl > now &&
            s->cap_gen == dn->lease_gen) {
          // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
          // make trim_caps() behave.
          dir->try_touch_cap(dn->lease_mds);
          goto hit_dn;
        }
        ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
                       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir shared caps?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
        if (dn->cap_shared_gen == dir->shared_gen &&
            (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
          goto hit_dn;
        if (!dn->inode && (dir->flags & I_COMPLETE)) {
          ldout(cct, 10) << "_lookup concluded ENOENT locally for "
                         << *dir << " dn '" << dname << "'" << dendl;
          return -ENOENT;
        }
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
        (dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << "_lookup concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    r = -ENOENT;
  }
  touch_dn(dn);

 done:
  if (r < 0)
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
int Client::get_or_create(Inode *dir, const char* name,
                          Dentry **pdn, bool expect_null)
{
  // lookup
  ldout(cct, 20) << "get_or_create " << *dir << " name " << name << dendl;
  dir->open_dir();
  if (dir->dir->dentries.count(name)) {
    Dentry *dn = dir->dir->dentries[name];

    // is dn lease valid?
    utime_t now = ceph_clock_now();
    if (dn->inode &&
        dn->lease_mds >= 0 &&
        dn->lease_ttl > now &&
        mds_sessions.count(dn->lease_mds)) {
      MetaSession *s = mds_sessions[dn->lease_mds];
      if (s->cap_ttl > now &&
          s->cap_gen == dn->lease_gen) {
        if (expect_null)
          return -EEXIST;
      }
    }
    *pdn = dn;
  } else {
    // otherwise link up a new one
    *pdn = link(dir->dir, name, NULL, NULL);
  }

  // success
  return 0;
}
int Client::path_walk(const filepath& origpath, InodeRef *end,
                      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  assert(cur);

  ldout(cct, 10) << "path_walk " << path << dendl;

  int symlinks = 0;

  unsigned i = 0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << "  (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
        return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
        return -ELOOP;
      }

      if (i < path.depth() - 1) {
        // dir symlink
        // replace consumed components of path with symlink dir target
        filepath resolved(next->symlink.c_str());
        resolved.append(path.postfixpath(i + 1));
        path = resolved;
        i = 0;
        if (next->symlink[0] == '/') {
          cur = root;
        }
        continue;
      } else if (followsym) {
        if (next->symlink[0] == '/') {
          path = next->symlink.c_str();
          i = 0;
          // reset position
          cur = root;
        } else {
          filepath more(next->symlink.c_str());
          // we need to remove the symlink component from off of the path
          // before adding the target that the symlink points to.  remain
          // at the same position in the path.
          path.pop_dentry();
          path.append(more);
        }
        continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
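/*
 * Symlink resolution example (illustrative): while walking "a/l/b" where
 * "l" -> "../x", the remaining path is rewritten to "../x/b" and the walk
 * restarts at i = 0 from the directory containing the link; an absolute
 * target ("/x") additionally resets cur to the root inode.  MAXSYMLINKS
 * bounds the total number of expansions so symlink cycles end in -ELOOP.
 */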
int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "link" << std::endl;
  tout(cct) << relexisting << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath existing(relexisting);

  InodeRef in, dir;
  int r = path_walk(existing, &in, perm, true);
  if (r < 0)
    return r;
  if (std::string(relpath) == "/") {
    r = -EEXIST;
    return r;
  }
  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();

  r = path_walk(path, &dir, perm, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    if (S_ISDIR(in->mode)) {
      r = -EPERM;
      return r;
    }
    r = may_hardlink(in.get(), perm);
    if (r < 0)
      return r;
    r = may_create(dir.get(), perm);
    if (r < 0)
      return r;
  }
  r = _link(in.get(), dir.get(), name.c_str(), perm);
  return r;
}
int Client::unlink(const char *relpath, const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "unlink" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  if (std::string(relpath) == "/")
    return -EISDIR;

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;
  int r = path_walk(path, &dir, perm);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    r = may_delete(dir.get(), name.c_str(), perm);
    if (r < 0)
      return r;
  }
  return _unlink(dir.get(), name.c_str(), perm);
}
int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "rename" << std::endl;
  tout(cct) << relfrom << std::endl;
  tout(cct) << relto << std::endl;

  if (unmounting)
    return -ENOTCONN;

  if (std::string(relfrom) == "/" || std::string(relto) == "/")
    return -EBUSY;

  filepath from(relfrom);
  filepath to(relto);
  string fromname = from.last_dentry();
  from.pop_dentry();
  string toname = to.last_dentry();
  to.pop_dentry();

  InodeRef fromdir, todir;
  int r = path_walk(from, &fromdir, perm);
  if (r < 0)
    goto out;
  r = path_walk(to, &todir, perm);
  if (r < 0)
    goto out;

  if (cct->_conf->client_permissions) {
    int r = may_delete(fromdir.get(), fromname.c_str(), perm);
    if (r < 0)
      return r;
    r = may_delete(todir.get(), toname.c_str(), perm);
    if (r < 0 && r != -ENOENT)
      return r;
  }
  r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
out:
  return r;
}
int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "mkdir" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;
  ldout(cct, 10) << "mkdir: " << relpath << dendl;

  if (unmounting)
    return -ENOTCONN;

  if (std::string(relpath) == "/")
    return -EEXIST;

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;
  int r = path_walk(path, &dir, perm);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    r = may_create(dir.get(), perm);
    if (r < 0)
      return r;
  }
  return _mkdir(dir.get(), name.c_str(), mode, perm);
}
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << "mkdirs" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  cur = cwd;
  for (i = 0; i < path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
        break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  //check that we have work left to do
  if (i == path.depth()) return -EEXIST;
  if (r != -ENOENT) return r;
  ldout(cct, 20) << "mkdirs got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i < path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
        return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    if (-EEXIST == r && i < path.depth() - 1) {
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << "mkdirs: successfully created directory "
                   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
int Client::rmdir(const char *relpath, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "rmdir" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  if (std::string(relpath) == "/")
    return -EBUSY;

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;
  int r = path_walk(path, &dir, perms);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    int r = may_delete(dir.get(), name.c_str(), perms);
    if (r < 0)
      return r;
  }
  return _rmdir(dir.get(), name.c_str(), perms);
}
int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "mknod" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  if (unmounting)
    return -ENOTCONN;

  if (std::string(relpath) == "/")
    return -EEXIST;

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;
  int r = path_walk(path, &dir, perms);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    int r = may_create(dir.get(), perms);
    if (r < 0)
      return r;
  }
  return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
}
int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "symlink" << std::endl;
  tout(cct) << target << std::endl;
  tout(cct) << relpath << std::endl;

  if (std::string(relpath) == "/")
    return -EEXIST;

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;
  int r = path_walk(path, &dir, perms);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    int r = may_create(dir.get(), perms);
    if (r < 0)
      return r;
  }
  return _symlink(dir.get(), name.c_str(), target, perms);
}
int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "readlink" << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;

  return _readlink(in.get(), buf, size);
}

int Client::_readlink(Inode *in, char *buf, size_t size)
{
  if (!in->is_symlink())
    return -EINVAL;

  // copy into buf (at most size bytes)
  int r = in->symlink.length();
  if (r > (int)size)
    r = size;
  memcpy(buf, in->symlink.c_str(), r);
  return r;
}
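
/*
 * Usage sketch (assumption, not part of the original source): like POSIX
 * readlink(2), _readlink() copies at most `size` bytes and does not
 * NUL-terminate, so callers must terminate the buffer themselves:
 *
 *   char target[PATH_MAX];
 *   int n = client->readlink("/mylink", target, sizeof(target) - 1, perms);
 *   if (n >= 0)
 *     target[n] = '\0';
 */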
int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
{
  bool yes = in->caps_issued_mask(mask, true);

  ldout(cct, 10) << "_getattr mask " << ccap_string(mask) << " issued=" << yes << dendl;
  if (yes && !force)
    return 0;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);
  req->head.args.getattr.mask = mask;

  int res = make_request(req, perms);
  ldout(cct, 10) << "_getattr result=" << res << dendl;
  return res;
}
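
/*
 * Note (sketch): callers pass a caps mask describing which attributes they
 * need; if those caps are already issued, the call is satisfied entirely
 * from the cache and no MDS round trip happens, e.g.:
 *
 *   int r = _getattr(in, CEPH_STAT_CAP_SIZE, perms);  // cheap if caps held
 */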
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
                        const UserPerm& perms, InodeRef *inp)
{
  int issued = in->caps_issued();

  ldout(cct, 10) << "_setattr mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if ((mask & CEPH_SETATTR_SIZE) &&
      (unsigned long)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
                              perms)) {
    return -EDQUOT;
  }

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
                   << " != cap dirtier " << in->cap_dirtier_uid << ":"
                   << in->cap_dirtier_gid << ", forcing sync setattr"
                   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure.
     */
    if (!mask)
      mask |= CEPH_SETATTR_CTIME;
    goto force_request;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    if (issued & CEPH_CAP_AUTH_EXCL)
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;

    if (mask & CEPH_SETATTR_UID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
      ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
    }
    if (mask & CEPH_SETATTR_GID) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
      ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
    }

    if (mask & CEPH_SETATTR_MODE) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
      ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
    } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      /* Must squash any setuid/setgid bits with an ownership change */
      in->mode &= ~(S_ISUID|S_ISGID);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    }

    if (mask & CEPH_SETATTR_BTIME) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
      ldout(cct,10) << "changing btime to " << in->btime << dendl;
    }
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
  }

  if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
    if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
      if (mask & CEPH_SETATTR_MTIME)
        in->mtime = utime_t(stx->stx_mtime);
      if (mask & CEPH_SETATTR_ATIME)
        in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
    }
  }
  if (!mask) {
    in->change_attr++;
    return 0;
  }

force_request:
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  if (mask & CEPH_SETATTR_KILL_SGUID) {
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MODE) {
    req->head.args.setattr.mode = stx->stx_mode;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
  }
  if (mask & CEPH_SETATTR_UID) {
    req->head.args.setattr.uid = stx->stx_uid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
  }
  if (mask & CEPH_SETATTR_GID) {
    req->head.args.setattr.gid = stx->stx_gid;
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
  }
  if (mask & CEPH_SETATTR_BTIME) {
    req->head.args.setattr.btime = utime_t(stx->stx_btime);
    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
  }
  if (mask & CEPH_SETATTR_MTIME) {
    req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_ATIME) {
    req->head.args.setattr.atime = utime_t(stx->stx_atime);
    req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  if (mask & CEPH_SETATTR_SIZE) {
    if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
      req->head.args.setattr.size = stx->stx_size;
      ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    } else { //too big!
      put_request(req);
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -EFBIG;
    }
    req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
      CEPH_CAP_FILE_WR;
  }
  req->head.args.setattr.mask = mask;

  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
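
/*
 * Illustration (assumption, not part of the original source): with
 * CEPH_CAP_AUTH_EXCL issued, a chmod is applied purely locally and the dirty
 * cap is flushed later; without it, the same call falls through to the
 * synchronous CEPH_MDS_OP_SETATTR request built above:
 *
 *   struct ceph_statx stx = {};
 *   stx.stx_mode = 0640;
 *   _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, nullptr);
 */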
/* Note that we only care about attrs that setattr cares about */
void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
{
  stx->stx_size = st->st_size;
  stx->stx_mode = st->st_mode;
  stx->stx_uid = st->st_uid;
  stx->stx_gid = st->st_gid;
  stx->stx_mtime = st->st_mtim;
  stx->stx_atime = st->st_atim;
}
int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
                       const UserPerm& perms, InodeRef *inp)
{
  int ret = _do_setattr(in, stx, mask, perms, inp);
  if (ret < 0)
    return ret;
  if (mask & CEPH_SETATTR_MODE)
    ret = _posix_acl_chmod(in, stx->stx_mode, perms);
  return ret;
}
int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
                      const UserPerm& perms)
{
  mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
           CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
           CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
           CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
  if (cct->_conf->client_permissions) {
    int r = may_setattr(in.get(), stx, mask, perms);
    if (r < 0)
      return r;
  }
  return __setattrx(in.get(), stx, mask, perms);
}
int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
                     const UserPerm& perms)
{
  struct ceph_statx stx;

  stat_to_statx(attr, &stx);
  mask &= ~CEPH_SETATTR_BTIME;

  if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
    mask &= ~CEPH_SETATTR_UID;
  }
  if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<gid_t>(-1)) {
    mask &= ~CEPH_SETATTR_GID;
  }

  return _setattrx(in, &stx, mask, perms);
}
int Client::setattr(const char *relpath, struct stat *attr, int mask,
                    const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "setattr" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mask << std::endl;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  return _setattr(in, attr, mask, perms);
}
int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
                     const UserPerm& perms, int flags)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "setattrx" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mask << std::endl;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
  if (r < 0)
    return r;
  return _setattrx(in, stx, mask, perms);
}
int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fsetattr" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mask << std::endl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  return _setattr(f->inode, attr, mask, perms);
}
int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fsetattrx" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mask << std::endl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  return _setattrx(f->inode, stx, mask, perms);
}
int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
                 frag_info_t *dirstat, int mask)
{
  ldout(cct, 3) << "stat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  Mutex::Locker lock(client_lock);
  tout(cct) << "stat" << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, true, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << "stat exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << "stat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
{
  unsigned mask = 0;

  /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
  if (flags & AT_NO_ATTR_SYNC)
    goto out;

  /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
  mask |= CEPH_CAP_PIN;
  if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
    mask |= CEPH_CAP_AUTH_SHARED;
  if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
    mask |= CEPH_CAP_LINK_SHARED;
  if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
    mask |= CEPH_CAP_FILE_SHARED;
  if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
    mask |= CEPH_CAP_XATTR_SHARED;
out:
  return mask;
}
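
/*
 * Example (sketch): asking statx for nlink and size maps onto the caps that
 * protect those fields, so the resulting getattr only syncs what is needed:
 *
 *   unsigned mask = statx_to_mask(0, CEPH_STATX_NLINK | CEPH_STATX_SIZE);
 *   // mask == CEPH_CAP_PIN | CEPH_CAP_LINK_SHARED | CEPH_CAP_FILE_SHARED
 */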
int Client::statx(const char *relpath, struct ceph_statx *stx,
                  const UserPerm& perms,
                  unsigned int want, unsigned int flags)
{
  ldout(cct, 3) << "statx enter (relpath " << relpath << " want " << want << ")" << dendl;
  Mutex::Locker lock(client_lock);
  tout(cct) << "statx" << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);

  int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
  if (r < 0)
    return r;

  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << "statx exit on error!" << dendl;
    return r;
  }

  fill_statx(in, mask, stx);
  ldout(cct, 3) << "statx exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
  return r;
}
int Client::lstat(const char *relpath, struct stat *stbuf,
                  const UserPerm& perms, frag_info_t *dirstat, int mask)
{
  ldout(cct, 3) << "lstat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  Mutex::Locker lock(client_lock);
  tout(cct) << "lstat" << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << "lstat exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << "lstat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << "fill_stat on " << in->ino << " snap/dev" << in->snapid
                 << " mode 0" << oct << in->mode << dec
                 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    switch (in->nlink) {
      case 0:
        st->st_nlink = 0; /* dir is unlinked */
        break;
      case 1:
        st->st_nlink = 1 /* parent dentry */
                       + 1 /* <dir>/. */
                       + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
        break;
      default:
        ceph_abort();
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    st->st_blocks = (in->size + 511) >> 9;
  }
  st->st_blksize = MAX(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
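
/*
 * Note (sketch): st_blocks is reported in 512-byte sectors, hence the
 * round-up shift above: a 1-byte file maps to (1 + 511) >> 9 == 1 block,
 * and a 4096-byte file to (4096 + 511) >> 9 == 8 blocks.
 */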
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << "fill_statx on " << in->ino << " snap/dev" << in->snapid
                 << " mode 0" << oct << in->mode << dec
                 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = MAX(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  if (mask & CEPH_CAP_LINK_SHARED) {
    if (in->is_dir()) {
      switch (in->nlink) {
        case 0:
          stx->stx_nlink = 0; /* dir is unlinked */
          break;
        case 1:
          stx->stx_nlink = 1 /* parent dentry */
                           + 1 /* <dir>/. */
                           + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
          break;
        default:
          ceph_abort();
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  if (mask & CEPH_CAP_FILE_SHARED) {
    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      if (cct->_conf->client_dirsize_rbytes)
        stx->stx_size = in->rstat.rbytes;
      else
        stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
                      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }
}
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "chmod" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
}
int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fchmod" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mode << std::endl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
}
int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "lchmod" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
}
int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
                  const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "chown" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
}
int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fchown" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  int mask = 0;
  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
  return _setattr(f->inode, &attr, mask, perms);
}
int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
                   const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "lchown" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  int mask = 0;
  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
  return _setattr(in, &attr, mask, perms);
}
int Client::utime(const char *relpath, struct utimbuf *buf,
                  const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "utime" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << buf->modtime << std::endl;
  tout(cct) << buf->actime << std::endl;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct stat attr;
  stat_set_mtime_sec(&attr, buf->modtime);
  stat_set_mtime_nsec(&attr, 0);
  stat_set_atime_sec(&attr, buf->actime);
  stat_set_atime_nsec(&attr, 0);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
int Client::lutime(const char *relpath, struct utimbuf *buf,
                   const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "lutime" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << buf->modtime << std::endl;
  tout(cct) << buf->actime << std::endl;

  filepath path(relpath);
  InodeRef in;
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct stat attr;
  stat_set_mtime_sec(&attr, buf->modtime);
  stat_set_mtime_nsec(&attr, 0);
  stat_set_atime_sec(&attr, buf->actime);
  stat_set_atime_nsec(&attr, 0);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
int Client::flock(int fd, int operation, uint64_t owner)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "flock" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << operation << std::endl;
  tout(cct) << owner << std::endl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  return _flock(f, operation, owner);
}
int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "opendir" << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    int r = may_open(in.get(), O_RDONLY, perms);
    if (r < 0)
      return r;
  }
  r = _opendir(in.get(), dirpp, perms);
  /* if ENOTDIR, dirpp will be an uninitialized pointer and it's very dangerous to access its value */
  if (r != -ENOTDIR)
    tout(cct) << (unsigned long)*dirpp << std::endl;
  return r;
}
int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
{
  if (!in->is_dir())
    return -ENOTDIR;
  *dirpp = new dir_result_t(in, perms);
  opened_dirs.insert(*dirpp);
  ldout(cct, 8) << "_opendir(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
  return 0;
}
int Client::closedir(dir_result_t *dir)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "closedir" << std::endl;
  tout(cct) << (unsigned long)dir << std::endl;

  ldout(cct, 3) << "closedir(" << dir << ") = 0" << dendl;
  _closedir(dir);
  return 0;
}
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << "_closedir(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << "_closedir detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
void Client::rewinddir(dir_result_t *dirp)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "rewinddir(" << dirp << ")" << dendl;

  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  _readdir_drop_dirp_buffer(d);
  d->reset();
}
loff_t Client::telldir(dir_result_t *dirp)
{
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << "telldir(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "seekdir(" << dirp << ", " << offset << ")" << dendl;

  if (offset == dirp->offset)
    return;

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    if (offset == 0 ||
        dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
        dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
//struct dirent {
//  ino_t          d_ino;       /* inode number */
//  off_t          d_off;       /* offset to the next dirent */
//  unsigned short d_reclen;    /* length of this record */
//  unsigned char  d_type;      /* type of file */
//  char           d_name[256]; /* filename */
//};
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(DARWIN) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  de->d_reclen = 1;
  de->d_type = IFTODT(type);
  ldout(cct, 10) << "fill_dirent '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
                 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << "_readdir_next_frag advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << "_readdir_next_frag advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);
  }
}
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << "_readdir_rechoose_frag frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << "_readdir_drop_dirp_buffer " << dirp << dendl;
  dirp->buffer.clear();
}
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  assert(dirp);
  assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << "_readdir_get_frag " << dirp << " on " << dirp->inode->ino << " fg " << fg
                 << " offset " << hex << dirp->offset << dec << dendl;

  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  if (dirp->last_name.length()) {
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -EAGAIN) {
    ldout(cct, 10) << "_readdir_get_frag got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << "_readdir_get_frag " << dirp << " got frag " << dirp->buffer_frag
                   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << "_readdir_get_frag got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
*dirp
, add_dirent_cb_t cb
, void *p
,
7751 int caps
, bool getref
)
7753 assert(client_lock
.is_locked());
7754 ldout(cct
, 10) << "_readdir_cache_cb " << dirp
<< " on " << dirp
->inode
->ino
7755 << " last_name " << dirp
->last_name
<< " offset " << hex
<< dirp
->offset
<< dec
7757 Dir
*dir
= dirp
->inode
->dir
;
7760 ldout(cct
, 10) << " dir is empty" << dendl
;
7765 vector
<Dentry
*>::iterator pd
= std::lower_bound(dir
->readdir_cache
.begin(),
7766 dir
->readdir_cache
.end(),
7767 dirp
->offset
, dentry_off_lt());
7771 if (!dirp
->inode
->is_complete_and_ordered())
7773 if (pd
== dir
->readdir_cache
.end())
7776 if (dn
->inode
== NULL
) {
7777 ldout(cct
, 15) << " skipping null '" << dn
->name
<< "'" << dendl
;
7781 if (dn
->cap_shared_gen
!= dir
->parent_inode
->shared_gen
) {
7782 ldout(cct
, 15) << " skipping mismatch shared gen '" << dn
->name
<< "'" << dendl
;
7787 int r
= _getattr(dn
->inode
, caps
, dirp
->perms
);
7791 struct ceph_statx stx
;
7793 fill_statx(dn
->inode
, caps
, &stx
);
7795 uint64_t next_off
= dn
->offset
+ 1;
7797 if (pd
== dir
->readdir_cache
.end())
7798 next_off
= dir_result_t::END
;
7801 fill_dirent(&de
, dn
->name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
7803 in
= dn
->inode
.get();
7807 dn_name
= dn
->name
; // fill in name while we have lock
7809 client_lock
.Unlock();
7810 r
= cb(p
, &de
, &stx
, next_off
, in
); // _next_ offset
7812 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< dn
->offset
<< dec
7813 << " = " << r
<< dendl
;
7818 dirp
->offset
= next_off
;
7820 dirp
->next_offset
= 2;
7822 dirp
->next_offset
= dirp
->offset_low();
7823 dirp
->last_name
= dn_name
; // we successfully returned this one; update!
7824 dirp
->release_count
= 0; // last_name no longer match cache index
7829 ldout(cct
, 10) << "_readdir_cache_cb " << dirp
<< " on " << dirp
->inode
->ino
<< " at end" << dendl
;
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
                         unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  Mutex::Locker lock(client_lock);

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset
                 << dec << " at_end=" << dirp->at_end()
                 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    assert(diri->dn_set.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r = _getattr(diri, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dn_set.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r = _getattr(in, caps, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    client_lock.Unlock();
    r = cb(p, &de, &stx, next_off, inode);
    client_lock.Lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
                 << " snapid " << dirp->inode->snapid << " (complete && ordered) "
                 << dirp->inode->is_complete_and_ordered()
                 << " issued " << ccap_string(dirp->inode->caps_issued())
                 << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -EAGAIN)
      return err;
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
        return r;
      // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
                   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
                                    dirp->offset, dir_result_t::dentry_off_lt());
         it != dirp->buffer.end();
         ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
        r = _getattr(entry.inode, caps, dirp->perms);
        if (r < 0)
          return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
        inode = entry.inode.get();
        _ll_get(inode);
      }

      client_lock.Unlock();
      r = cb(p, &de, &stx, next_off, inode);  // _next_ offset
      client_lock.Lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
                     << " = " << r << dendl;
      if (r < 0)
        return r;

      dirp->offset = next_off;
      if (r > 0)
        return r;
    }

    if (dirp->next_offset > 2) {
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    if (diri->shared_gen == dirp->start_shared_gen &&
        diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
        ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
        if (diri->dir) {
          assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
          diri->dir->readdir_cache.resize(dirp->cache_index);
        }
        diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
        ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
        diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();
  return 0;
}
int Client::readdir_r(dir_result_t *d, struct dirent *de)
{
  return readdirplus_r(d, de, 0, 0, 0, NULL);
}
/*
 * readdirplus_r
 *
 * returns
 *  1 if we got a dirent
 *  0 for end of directory
 * <0 on error
 */

struct single_readdir {
  struct dirent *de;
  struct ceph_statx *stx;
  Inode *inode;
  bool full;
};

static int _readdir_single_dirent_cb(void *p, struct dirent *de,
                                     struct ceph_statx *stx, off_t off,
                                     Inode *in)
{
  single_readdir *c = static_cast<single_readdir *>(p);

  if (c->full)
    return -1;  // already filled this dirent

  *c->de = *de;
  if (c->stx)
    *c->stx = *stx;
  c->inode = in;
  c->full = true;
  return 1;
}
struct dirent *Client::readdir(dir_result_t *d)
{
  int ret;
  static struct dirent de;
  single_readdir sr;
  sr.de = &de;
  sr.stx = NULL;
  sr.inode = NULL;
  sr.full = false;

  // our callback fills the dirent and sets sr.full=true on first
  // call, and returns -1 the second time around.
  ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
  if (ret < -1) {
    errno = -ret;  // this sucks.
    return (dirent *) NULL;
  }
  if (sr.full)
    return &de;
  return (dirent *) NULL;
}
int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
                          struct ceph_statx *stx, unsigned want,
                          unsigned flags, Inode **out)
{
  single_readdir sr;
  sr.de = de;
  sr.stx = stx;
  sr.inode = NULL;
  sr.full = false;

  // our callback fills the dirent and sets sr.full=true on first
  // call, and returns -1 the second time around.
  int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
  if (r < -1)
    return r;
  if (out)
    *out = sr.inode;
  if (sr.full)
    return 1;
  return 0;
}
struct getdents_result {
  char *buf;
  int buflen;
  int pos;
  bool fullent;
};

static int _readdir_getdent_cb(void *p, struct dirent *de,
                               struct ceph_statx *stx, off_t off, Inode *in)
{
  struct getdents_result *c = static_cast<getdents_result *>(p);

  int dlen;
  if (c->fullent)
    dlen = sizeof(*de);
  else
    dlen = strlen(de->d_name) + 1;

  if (c->pos + dlen > c->buflen)
    return -1;  // doesn't fit

  if (c->fullent) {
    memcpy(c->buf + c->pos, de, sizeof(*de));
  } else {
    memcpy(c->buf + c->pos, de->d_name, dlen);
  }
  c->pos += dlen;
  return 0;
}
int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
{
  getdents_result gr;
  gr.buf = buf;
  gr.buflen = buflen;
  gr.fullent = fullent;
  gr.pos = 0;

  int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);

  if (r < 0) { // some error
    if (r == -1) { // buffer ran out of space
      if (gr.pos) { // but we got some entries already!
        return gr.pos;
      } // or we need a larger buffer
      return -ERANGE;
    } else { // actual error, return it
      return r;
    }
  }
  return gr.pos;
}
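
/*
 * Note (sketch): the -1/-ERANGE dance above gives getdents POSIX-like
 * semantics: a buffer that fills mid-stream returns the bytes gathered so
 * far, while a buffer too small for even one entry fails with -ERANGE.
 */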
struct getdir_result {
  list<string> *contents;
  int num;
};

static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
{
  getdir_result *r = static_cast<getdir_result *>(p);

  r->contents->push_back(de->d_name);
  r->num++;
  return 0;
}

int Client::getdir(const char *relpath, list<string>& contents,
                   const UserPerm& perms)
{
  ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
  {
    Mutex::Locker lock(client_lock);
    tout(cct) << "getdir" << std::endl;
    tout(cct) << relpath << std::endl;
  }

  dir_result_t *d;
  int r = opendir(relpath, &d, perms);
  if (r < 0)
    return r;

  getdir_result gr;
  gr.contents = &contents;
  gr.num = 0;
  r = readdir_r_cb(d, _getdir_cb, (void *)&gr);

  closedir(d);

  if (r < 0)
    return r;
  return gr.num;
}
/****** file i/o **********/
int Client::open(const char *relpath, int flags, const UserPerm& perms,
                 mode_t mode, int stripe_unit, int stripe_count,
                 int object_size, const char *data_pool)
{
  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
  Mutex::Locker lock(client_lock);
  tout(cct) << "open" << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  Fh *fh = NULL;

#if defined(__linux__) && defined(O_PATH)
  /* When O_PATH is specified, flags other than O_DIRECTORY
   * and O_NOFOLLOW are ignored. Please refer to the do_entry_open()
   * function in the kernel (fs/open.c). */
  if (flags & O_PATH)
    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
#endif

  filepath path(relpath);
  InodeRef in;
  bool created = false;
  /* O_CREAT with O_EXCL enforces O_NOFOLLOW. */
  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));

  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

#if defined(__linux__) && defined(O_PATH)
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
#else
  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
#endif
    return -ELOOP;

  if (r == -ENOENT && (flags & O_CREAT)) {
    filepath dirpath = path;
    string dname = dirpath.last_dentry();
    dirpath.pop_dentry();
    InodeRef dir;
    r = path_walk(dirpath, &dir, perms, true,
                  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
    if (r < 0)
      goto out;
    if (cct->_conf->client_permissions) {
      r = may_create(dir.get(), perms);
      if (r < 0)
        goto out;
    }
    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
                stripe_count, object_size, data_pool, &created, perms);
  }
  if (r < 0)
    goto out;

  if (!created) {
    // posix says we can only check permissions of existing files
    if (cct->_conf->client_permissions) {
      r = may_open(in.get(), flags, perms);
      if (r < 0)
        goto out;
    }
  }

  if (!fh)
    r = _open(in.get(), flags, mode, &fh, perms);
  if (r >= 0) {
    // allocate an integer file descriptor
    assert(fh);
    r = get_fd();
    assert(fd_map.count(r) == 0);
    fd_map[r] = fh;
  }

 out:
  tout(cct) << r << std::endl;
  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
  return r;
}
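
/*
 * Usage sketch (assumption, not part of the original source): create-or-open
 * with default striping via the short overload below:
 *
 *   int fd = client->open("/data/log.txt", O_WRONLY | O_CREAT | O_APPEND,
 *                         perms, 0644);
 *   if (fd < 0)
 *     std::cerr << "open failed: " << cpp_strerror(fd) << std::endl;
 */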
int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
{
  /* Use default file striping parameters */
  return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
}
int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
                        const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "lookup_hash enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
  filepath path(ino);
  req->set_filepath(path);

  uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
  char f[30];
  sprintf(f, "%u", h);
  filepath path2(dirino);
  path2.push_dentry(string(f));
  req->set_filepath2(path2);

  int r = make_request(req, perms, NULL, NULL,
                       rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << "lookup_hash exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
  return r;
}
/**
 * Load inode into local cache.
 *
 * If the inode pointer is non-NULL, a reference is taken on the
 * resulting Inode object in the same operation, so that the caller
 * can safely assume the inode will still be there after return.
 */
int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  ldout(cct, 8) << "lookup_ino enter(" << ino << ")" << dendl;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
  filepath path(ino);
  req->set_filepath(path);

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  if (r == 0 && inode != NULL) {
    vinodeno_t vino(ino, CEPH_NOSNAP);
    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
    assert(p != inode_map.end());
    *inode = p->second;
    _ll_get(*inode);
  }
  ldout(cct, 8) << "lookup_ino exit(" << ino << ") = " << r << dendl;
  return r;
}
int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
{
  Mutex::Locker lock(client_lock);
  return _lookup_ino(ino, perms, inode);
}
/**
 * Find the parent inode of `ino` and insert it into
 * our cache. Conditionally also set `parent` to a referenced
 * Inode* if caller provides non-NULL value.
 */
int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  ldout(cct, 8) << "lookup_parent enter(" << ino->ino << ")" << dendl;

  if (!ino->dn_set.empty()) {
    // if we exposed the parent here, we'd need to check permissions,
    // but right now we just rely on the MDS doing so in make_request
    ldout(cct, 8) << "lookup_parent dentry already present" << dendl;
    return 0;
  }

  if (ino->is_root()) {
    *parent = NULL;
    ldout(cct, 8) << "ino is root, no parent" << dendl;
    return -EINVAL;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
  filepath path(ino->ino);
  req->set_filepath(path);

  InodeRef target;
  int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
  // Give caller a reference to the parent ino if they provided a pointer.
  if (parent != NULL) {
    if (r == 0) {
      *parent = target.get();
      _ll_get(*parent);
      ldout(cct, 8) << "lookup_parent found parent " << (*parent)->ino << dendl;
    } else {
      *parent = NULL;
    }
  }
  ldout(cct, 8) << "lookup_parent exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  Mutex::Locker lock(client_lock);
  return _lookup_parent(ino, perms, parent);
}
/**
 * Populate the parent dentry for `ino`, provided it is
 * a child of `parent`.
 */
int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  assert(parent->is_dir());
  ldout(cct, 3) << "lookup_name enter(" << ino->ino << ")" << dendl;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));
  req->set_filepath(filepath(ino->ino));
  req->set_inode(ino);

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << "lookup_name exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  return _lookup_name(ino, parent, perms);
}
Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
{
  assert(in);
  Fh *f = new Fh(in);
  f->mode = cmode;
  f->flags = flags;
  f->actor_perms = perms;

  ldout(cct, 10) << "_create_fh " << in->ino << " mode " << cmode << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    in->snap_cap_refs++;
    ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
                  << ccap_string(in->caps_issued()) << dendl;
  }

  const md_config_t *conf = cct->_conf;
  f->readahead.set_trigger_requests(1);
  f->readahead.set_min_readahead_size(conf->client_readahead_min);
  uint64_t max_readahead = Readahead::NO_LIMIT;
  if (conf->client_readahead_max_bytes) {
    max_readahead = MIN(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
  }
  if (conf->client_readahead_max_periods) {
    max_readahead = MIN(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
  }
  f->readahead.set_max_readahead_size(max_readahead);
  vector<uint64_t> alignments;
  alignments.push_back(in->layout.get_period());
  alignments.push_back(in->layout.stripe_unit);
  f->readahead.set_alignments(alignments);

  return f;
}
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;

  if (in->snapid == CEPH_NOSNAP) {
    if (in->put_open_ref(f->mode)) {
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << "_release_fh " << f << " on inode " << *in << " caught async_err = "
                  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << "_release_fh " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
void Client::_put_fh(Fh *f)
{
  int left = f->put();
  if (!left)
    delete f;
}
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
                  const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -EROFS;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it affects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->head.args.open.flags = ceph_flags_sys2wire(flags & ~O_CREAT);
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
        need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
        need |= CEPH_CAP_FILE_RD;

      result = get_caps(in, need, want, &have, -1);
      if (result < 0) {
        ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
                         " . Denying open: " <<
                         cpp_strerror(result) << dendl;
        in->put_open_ref(cmode);
      } else {
        put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
int Client::close(int fd)
{
  ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
  Mutex::Locker lock(client_lock);
  tout(cct) << "close" << std::endl;
  tout(cct) << fd << std::endl;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
  int err = _release_fh(fh);
  fd_map.erase(fd);
  put_fd(fd);
  ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
  return err;
}
// ------------
// read, write

loff_t Client::lseek(int fd, loff_t offset, int whence)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "lseek" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  return _lseek(f, offset, whence);
}

loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
{
  Inode *in = f->inode.get();
  int r;

  switch (whence) {
  case SEEK_SET:
    f->pos = offset;
    break;

  case SEEK_CUR:
    f->pos += offset;
    break;

  case SEEK_END:
    r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
    f->pos = in->size + offset;
    break;

  default:
    ceph_abort();
  }

  ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
  return f->pos;
}
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << "lock_fh_pos " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    Cond cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << "lock_fh_pos BLOCKING on " << f << dendl;
    while (f->pos_locked || f->pos_waiters.front() != &cond)
      cond.Wait(client_lock);
    ldout(cct, 10) << "lock_fh_pos UNBLOCKING on " << f << dendl;
    assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}

void Client::unlock_fh_pos(Fh *f)
{
  ldout(cct, 10) << "unlock_fh_pos " << f << dendl;
  f->pos_locked = false;
}
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
                   OSDMap::file_to_object_locator(in->layout),
                   create_ops,
                   in->snaprealm->get_snap_context(),
                   ceph::real_clock::now(),
                   0,
                   NULL);

  bufferlist inline_version_bl;
  ::encode(in->inline_version, inline_version_bl);

  ObjectOperation uninline_ops;
  uninline_ops.cmpxattr("inline_version",
                        CEPH_OSD_CMPXATTR_OP_GT,
                        CEPH_OSD_CMPXATTR_MODE_U64,
                        inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
                   OSDMap::file_to_object_locator(in->layout),
                   uninline_ops,
                   in->snaprealm->get_snap_context(),
                   ceph::real_clock::now(),
                   0,
                   onfinish);

  return 0;
}
8789 int Client::read(int fd
, char *buf
, loff_t size
, loff_t offset
)
8791 Mutex::Locker
lock(client_lock
);
8792 tout(cct
) << "read" << std::endl
;
8793 tout(cct
) << fd
<< std::endl
;
8794 tout(cct
) << size
<< std::endl
;
8795 tout(cct
) << offset
<< std::endl
;
8800 Fh
*f
= get_filehandle(fd
);
8803 #if defined(__linux__) && defined(O_PATH)
8804 if (f
->flags
& O_PATH
)
8808 int r
= _read(f
, offset
, size
, &bl
);
8809 ldout(cct
, 3) << "read(" << fd
<< ", " << (void*)buf
<< ", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
8811 bl
.copy(0, bl
.length(), buf
);
8817 int Client::preadv(int fd
, const struct iovec
*iov
, int iovcnt
, loff_t offset
)
8821 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, false);
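
/*
 * Usage sketch (not part of the original source): scatter a read into two
 * buffers with preadv(); _preadv_pwritev() further below sums the iov
 * lengths and issues a single _read()/_write():
 *
 *   char hdr[16], body[4080];
 *   struct iovec iov[2] = { { hdr, sizeof(hdr) }, { body, sizeof(body) } };
 *   int n = client->preadv(fd, iov, 2, 0);
 */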
int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  bool movepos = false;
  if (offset < 0) {
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    assert(in->inline_version > 0);
  }

retry:
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &have, -1);
  if (r < 0)
    return r;

  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_CACHE;

  Mutex uninline_flock("Client::_read_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      onuninline = new C_SafeCond(&uninline_flock,
                                  &uninline_cond,
                                  &uninline_done,
                                  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      uint32_t len = in->inline_data.length();

      uint64_t endoff = offset + size;
      if (endoff > in->size)
        endoff = in->size;

      if ((uint64_t)offset < len) {
        if (endoff <= len) {
          bl->substr_of(in->inline_data, offset, endoff - offset);
        } else {
          bl->substr_of(in->inline_data, offset, len - offset);
          bl->append_zero(endoff - len);
        }
      } else if ((uint64_t)offset < endoff) {
        bl->append_zero(endoff - offset);
      }

      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      (conf->client_oc && (have & CEPH_CAP_FILE_CACHE))) {
    // we're doing buffered i/o.  flush rsync-style writes first?
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // eof?  short read.
      put_cap_ref(in, CEPH_CAP_FILE_RD);

      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
        goto done;

      // eof?  retry with a bigger size
      if ((uint64_t)offset < in->size)
        goto retry;
    }
  }

success:
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + bl->length();
    unlock_fh_pos(f);
  }

done:
  // done!

  if (onuninline) {
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  if (have)
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  if (r < 0)
    return r;
  return bl->length();
}
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}

Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}

void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const md_config_t *conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << "_read_async " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
		 << " max_bytes=" << f->readahead.get_max_readahead_size()
		 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  bool done = false;
  Cond cond;
  int rvalue = 0;
  Mutex flock("Client::_read_async flock");
  Context *onfinish = new C_SafeCond(&flock, &cond, &done, &rvalue);
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, onfinish);
  if (r == 0) {
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
    r = rvalue;
  } else {
    // it was cached
    delete onfinish;
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
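/*
 * Illustrative example (hedged commentary, values hypothetical):
 * Readahead::update() decides what to prefetch.  A sequential reader
 * that just consumed off=0~4M with an 8M max window might, for
 * instance, get readahead_extent={4M,8M}; that range is pulled into
 * the object cacher with a NULL bufferlist, and the cap references
 * taken here are dropped by C_Readahead::finish() when the prefetch
 * completes.
 */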
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << "_read_sync " << *in << " " << off << "~" << len << dendl;

  Mutex flock("Client::_read_sync flock");
  Cond cond;
  while (left > 0) {
    int r = 0;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done, &r);
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      onfinish);
    client_lock.Unlock();
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -ENOENT)
      r = 0;
    if (r < 0)
      return r;
    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	bufferptr z(some);
	z.zero();
	bl->append(z);
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return read;
      }

      *checkeof = true;
      return read;
    }
  }
  return read;
}
/*
 * we keep count of uncommitted sync writes on the inode, so that
 * fsync can wait for them to flush, and unmount can wait for them
 * to drain.
 */
void Client::_sync_write_commit(Inode *in)
{
  assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write << dendl;
  if (unsafe_sync_write == 0 && unmounting) {
    ldout(cct, 10) << "sync_write_commit -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.Signal();
  }
}
int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "write" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  int r = _write(fh, offset, size, buf, NULL, 0);
  ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
  return r;
}

int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
{
  if (iovcnt < 0)
    return -EINVAL;
  return _preadv_pwritev(fd, iov, iovcnt, offset, true);
}
int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }
  if (write) {
    int w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fd << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fd << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    int bufoff = 0;
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist
       * does not have enough data to fill in the iov
       */
      if (resid < iov[j].iov_len) {
	bl.copy(bufoff, resid, (char *)iov[j].iov_base);
	break;
      } else {
	bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
      }
      resid -= iov[j].iov_len;
      bufoff += iov[j].iov_len;
    }
    return r;
  }
}
int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
		   const struct iovec *iov, int iovcnt)
{
  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // check quota
  uint64_t endoff = offset + size;
  std::list<InodeRef> quota_roots;
  if (endoff > in->size &&
      is_quota_bytes_exceeded(in, endoff - in->size, f->actor_perms, &quota_roots)) {
    return -EDQUOT;
  }

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      int r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
	unlock_fh_pos(f);
	return r;
      }
    }
    offset = f->pos;
    f->pos = offset+size;
    unlock_fh_pos(f);
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov) {
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
	bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED,
		   CEPH_CAP_FILE_BUFFER, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  if (f->flags & O_DIRECT)
    have &= ~CEPH_CAP_FILE_BUFFER;

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  Mutex uninline_flock("Client::_write_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      onuninline = new C_SafeCond(&uninline_flock,
				  &uninline_cond,
				  &uninline_done,
				  &uninline_ret);
      uninline_data(in, onuninline);
    } else {
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
	in->inline_data.copy(endoff, len - endoff, bl);

      if (offset < len)
	in->inline_data.splice(offset, len - offset);
      else if (offset > len)
	in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc && (have & CEPH_CAP_FILE_BUFFER)) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    Mutex flock("Client::_write flock");
    Cond cond;
    bool done = false;
    Context *onfinish = new C_SafeCond(&flock, &cond, &done);

    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       onfinish);
    client_lock.Unlock();
    flock.Lock();

    while (!done)
      cond.Wait(flock);
    flock.Unlock();
    client_lock.Lock();
    _sync_write_commit(in);
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  totalwritten = size;
  r = (int)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, quota_roots)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (onuninline) {
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
int Client::_flush(Fh *f)
{
  Inode *in = f->inode.get();
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
                  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  return err;
}
int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
{
  struct ceph_statx stx;
  stx.stx_size = length;
  return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
}

int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "ftruncate" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << length << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  struct stat attr;
  attr.st_size = length;
  return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
}
int Client::fsync(int fd, bool syncdataonly)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fsync" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << syncdataonly << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  int r = _fsync(f, syncdataonly);
  if (r == 0) {
    // The IOs in this fsync were okay, but maybe something happened
    // in the background that we should be reporting?
    r = f->take_async_err();
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
		  << ") = 0, async_err = " << r << dendl;
  } else {
    // Assume that an error we encountered during fsync, even reported
    // synchronously, would also have applied the error to the Fh, and we
    // should clear it here to avoid returning the same error again on next
    // call.
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
		  << r << dendl;
    f->take_async_err();
  }
  return r;
}
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool done = false;
  C_SafeCond *object_cacher_completion = NULL;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r);
    tmp_ref = in; // take a reference; C_SafeCond doesn't and _flush won't either
    _flush(in, object_cacher_completion);
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }

  return r;
}
int Client::_fsync(Fh *f, bool syncdataonly)
{
  ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
  return _fsync(f->inode.get(), syncdataonly);
}
int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  int r = _getattr(f->inode, mask, perms);
  if (r < 0)
    return r;
  fill_stat(f->inode, stbuf, NULL);
  ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
  return r;
}

int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
		   unsigned int want, unsigned int flags)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  unsigned mask = statx_to_mask(flags, want);

  int r = 0;
  if (mask && !f->inode->caps_issued_mask(mask, true)) {
    r = _getattr(f->inode, mask, perms);
    if (r < 0) {
      ldout(cct, 3) << "fstatx exit on error!" << dendl;
      return r;
    }
  }

  fill_statx(f->inode, mask, stx);
  ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
  return r;
}
// not written yet, but i want to link!

int Client::chdir(const char *relpath, std::string &new_cwd,
		  const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "chdir" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  if (cwd != in)
    cwd.swap(in);
  ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;

  _getcwd(new_cwd, perms);
  return 0;
}

void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << "getcwd " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    assert(in->dn_set.size() < 2); // dirs can't be hard-linked

    // A cwd or ancester is unlinked
    if (in->dn_set.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();
    if (!dn) {
      // look it up
      ldout(cct, 10) << "getcwd looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over
      path = filepath();
      in = cwd.get();
      continue;
    }
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}

void Client::getcwd(string& dir, const UserPerm& perms)
{
  Mutex::Locker l(client_lock);
  if (!unmounting)
    _getcwd(dir, perms);
}
int Client::statfs(const char *path, struct statvfs *stbuf,
		   const UserPerm& perms)
{
  Mutex::Locker l(client_lock);
  tout(cct) << "statfs" << std::endl;
  unsigned long int total_files_on_fs;

  if (unmounting)
    return -ENOTCONN;

  ceph_statfs stats;
  C_SaferCond cond;

  const vector<int64_t> &data_pools = mdsmap->get_data_pools();
  if (data_pools.size() == 1) {
    objecter->get_fs_stats(stats, data_pools[0], &cond);
  } else {
    objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
  }

  client_lock.Unlock();
  int rval = cond.wait();
  client_lock.Lock();

  total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;

  if (rval < 0) {
    ldout(cct, 1) << "underlying call to statfs returned error: "
                  << cpp_strerror(rval)
                  << dendl;
    return rval;
  }

  memset(stbuf, 0, sizeof(*stbuf));

  /*
   * we're going to set a block size of 4MB so we can represent larger
   * FSes without overflowing. Additionally convert the space
   * measurements from KB to bytes while making them in terms of
   * blocks.  We use 4MB only because it is big enough, and because it
   * actually *is* the (ceph) default block size.
   */
  const int CEPH_BLOCK_SHIFT = 22;
  stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_files = total_files_on_fs;
  stbuf->f_favail = -1;
  stbuf->f_fsid = -1;       // ??
  stbuf->f_flag = 0;        // ??
  stbuf->f_namemax = NAME_MAX;

  // Usually quota_root will == root_ancestor, but if the mount root has no
  // quota but we can see a parent of it that does have a quota, we'll
  // respect that one instead.
  assert(root != nullptr);
  Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);

  // get_quota_root should always give us something
  // because client quotas are always enabled
  assert(quota_root != nullptr);

  if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {

    // Skip the getattr if any sessions are stale, as we don't want to
    // block `df` if this client has e.g. been evicted, or if the MDS cluster
    // is unhealthy.
    if (!_any_stale_sessions()) {
      int r = _getattr(quota_root, 0, perms, true);
      if (r != 0) {
	// Ignore return value: error getting latest inode metadata is not a good
	// reason to break "df".
	lderr(cct) << "Error in getattr on quota root 0x"
		   << std::hex << quota_root->ino << std::dec
		   << " statfs result may be outdated" << dendl;
      }
    }

    // Special case: if there is a size quota set on the Inode acting
    // as the root for this client mount, then report the quota status
    // as the filesystem statistics.
    const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
    const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
    // It is possible for a quota to be exceeded: arithmetic here must
    // handle case where used > total.
    const fsblkcnt_t free = total > used ? total - used : 0;

    stbuf->f_blocks = total;
    stbuf->f_bfree = free;
    stbuf->f_bavail = free;
  } else {
    // General case: report the cluster statistics returned from RADOS. Because
    // multiple pools may be used without one filesystem namespace via
    // layouts, this is the most correct thing we can do.
    stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  }

  return 0;
}
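/*
 * Worked example (illustrative, values hypothetical): with
 * CEPH_BLOCK_SHIFT = 22 the reported block size is 4 MiB.  RADOS
 * returns space in KiB, so converting KiB to 4 MiB blocks shifts by
 * 22 - 10 = 12 bits: stats.kb = 8388608 (8 GiB) yields
 * f_blocks = 8388608 >> 12 = 2048 blocks of 4 MiB.
 */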
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << "_do_filelock ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -EIO;

  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      ceph_filelock filelock;
      bufferlist::iterator p = bl.begin();
      ::decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	lock_state = in->fcntl_locks;
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	lock_state = in->flock_locks;
      } else {
	return -EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      if (!removing) {
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
	  lock_state = fh->fcntl_locks;
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
	  lock_state = fh->flock_locks;
	}
	_update_lock_state(fl, owner, lock_state);
      }
    }
  }
  return ret;
}
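/*
 * Illustrative example (hedged commentary): a blocking fcntl(F_SETLKW)
 * lands here as _do_filelock(in, fh, CEPH_LOCK_FCNTL,
 * CEPH_MDS_OP_SETFILELOCK, sleep=1, ...); F_WRLCK maps to
 * CEPH_LOCK_EXCL, and because sleep is set the request is registered
 * with switch_interrupt_cb so a signal can issue the 'lock intr'
 * request and abort the wait.
 */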
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else
    return -EINVAL;

  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
void Client::_encode_filelocks(Inode *in, bufferlist& bl)
{
  if (!in->fcntl_locks && !in->flock_locks)
    return;

  unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
  ::encode(nr_fcntl_locks, bl);
  if (nr_fcntl_locks) {
    ceph_lock_state_t* lock_state = in->fcntl_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
	p != lock_state->held_locks.end();
	++p)
      ::encode(p->second, bl);
  }

  unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
  ::encode(nr_flock_locks, bl);
  if (nr_flock_locks) {
    ceph_lock_state_t* lock_state = in->flock_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
	p != lock_state->held_locks.end();
	++p)
      ::encode(p->second, bl);
  }

  ldout(cct, 10) << "_encode_filelocks ino " << in->ino << ", " << nr_fcntl_locks
		 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
}
void Client::_release_filelocks(Fh *fh)
{
  if (!fh->fcntl_locks && !fh->flock_locks)
    return;

  Inode *in = fh->inode.get();
  ldout(cct, 10) << "_release_filelocks " << fh << " ino " << in->ino << dendl;

  list<pair<int, ceph_filelock> > to_release;

  if (fh->fcntl_locks) {
    ceph_lock_state_t* lock_state = fh->fcntl_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
	p != lock_state->held_locks.end();
	++p)
      to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
    delete fh->fcntl_locks;
  }
  if (fh->flock_locks) {
    ceph_lock_state_t* lock_state = fh->flock_locks;
    for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
	p != lock_state->held_locks.end();
	++p)
      to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
    delete fh->flock_locks;
  }

  if (to_release.empty())
    return;

  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_whence = SEEK_SET;
  fl.l_type = F_UNLCK;

  for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
       p != to_release.end();
       ++p) {
    fl.l_start = p->second.start;
    fl.l_len = p->second.length;
    fl.l_pid = p->second.pid;
    _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
		 p->second.owner, true);
  }
}
void Client::_update_lock_state(struct flock *fl, uint64_t owner,
				ceph_lock_state_t *lock_state)
{
  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else
    lock_cmd = CEPH_LOCK_UNLOCK;

  ceph_filelock filelock;
  filelock.start = fl->l_start;
  filelock.length = fl->l_len;
  filelock.client = 0;
  // see comment in _do_filelock()
  filelock.owner = owner | (1ULL << 63);
  filelock.pid = fl->l_pid;
  filelock.type = lock_cmd;

  if (filelock.type == CEPH_LOCK_UNLOCK) {
    list<ceph_filelock> activated_locks;
    lock_state->remove_lock(filelock, activated_locks);
  } else {
    bool r = lock_state->add_lock(filelock, false, false, NULL);
    assert(r);
  }
}
int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
{
  Inode *in = fh->inode.get();
  ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
  int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
  return ret;
}

int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
{
  Inode *in = fh->inode.get();
  ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
  int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
  ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
  return ret;
}

int Client::_flock(Fh *fh, int cmd, uint64_t owner)
{
  Inode *in = fh->inode.get();
  ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;

  int sleep = !(cmd & LOCK_NB);
  cmd &= ~LOCK_NB;

  int type;
  switch (cmd) {
  case LOCK_SH:
    type = F_RDLCK;
    break;
  case LOCK_EX:
    type = F_WRLCK;
    break;
  case LOCK_UN:
    type = F_UNLCK;
    break;
  default:
    return -EINVAL;
  }

  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_type = type;
  fl.l_whence = SEEK_SET;

  int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
  ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
  return ret;
}
int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
{
  /* Since the only thing this does is wrap a call to statfs, and
     statfs takes a lock, it doesn't seem we have a need to split it
     out. */
  return statfs(0, stbuf, perms);
}
void Client::ll_register_callbacks(struct client_callback_args *args)
{
  if (!args)
    return;
  Mutex::Locker l(client_lock);
  ldout(cct, 10) << "ll_register_callbacks cb " << args->handle
		 << " invalidate_ino_cb " << args->ino_cb
		 << " invalidate_dentry_cb " << args->dentry_cb
		 << " switch_interrupt_cb " << args->switch_intr_cb
		 << " remount_cb " << args->remount_cb
		 << dendl;
  callback_handle = args->handle;
  if (args->ino_cb) {
    ino_invalidate_cb = args->ino_cb;
    async_ino_invalidator.start();
  }
  if (args->dentry_cb) {
    dentry_invalidate_cb = args->dentry_cb;
    async_dentry_invalidator.start();
  }
  if (args->switch_intr_cb) {
    switch_interrupt_cb = args->switch_intr_cb;
    interrupt_finisher.start();
  }
  if (args->remount_cb) {
    remount_cb = args->remount_cb;
    remount_finisher.start();
  }
  umask_cb = args->umask_cb;
}
int Client::test_dentry_handling(bool can_invalidate)
{
  int r = 0;

  can_invalidate_dentries = can_invalidate;

  if (can_invalidate_dentries) {
    assert(dentry_invalidate_cb);
    ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
    r = 0;
  } else if (remount_cb) {
    ldout(cct, 1) << "using remount_cb" << dendl;
    r = _do_remount(false);
  }
  if (r) {
    bool should_abort = cct->_conf->get_val<bool>("client_die_on_failed_dentry_invalidate");
    if (should_abort) {
      lderr(cct) << "no method to invalidate kernel dentry cache; quitting!" << dendl;
      ceph_abort();
    } else {
      lderr(cct) << "no method to invalidate kernel dentry cache; expect issues!" << dendl;
    }
  }
  return r;
}
int Client::_sync_fs()
{
  ldout(cct, 10) << "_sync_fs" << dendl;

  // flush file data
  Mutex lock("Client::_fsync::lock");
  Cond cond;
  bool flush_done = false;
  if (cct->_conf->client_oc)
    objectcacher->flush_all(new C_SafeCond(&lock, &cond, &flush_done));
  else
    flush_done = true;

  // flush caps
  flush_caps_sync();
  ceph_tid_t flush_tid = last_flush_tid;

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  if (!flush_done) {
    client_lock.Unlock();
    lock.Lock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    while (!flush_done)
      cond.Wait(lock);
    lock.Unlock();
    client_lock.Lock();
  }

  return 0;
}

int Client::sync_fs()
{
  Mutex::Locker l(client_lock);

  if (unmounting)
    return -ENOTCONN;

  return _sync_fs();
}

int64_t Client::drop_caches()
{
  Mutex::Locker l(client_lock);
  return objectcacher->release_all();
}
int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
{
  Mutex::Locker l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
		<< ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  // for now, treat it like a barrier.
  _fsync(f, true);

  return 0;
}

int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
{
  Mutex::Locker l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
		<< ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  _fsync(f, true);
  if (_release(in))
    check_caps(in, 0);
  return 0;
}
// =============================
// snaps

int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
{
  Mutex::Locker l(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perm);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    r = may_create(in.get(), perm);
    if (r < 0)
      return r;
  }
  Inode *snapdir = open_snapdir(in.get());
  return _mkdir(snapdir, name, 0, perm);
}

int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
{
  Mutex::Locker l(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    r = may_delete(in.get(), NULL, perms);
    if (r < 0)
      return r;
  }
  Inode *snapdir = open_snapdir(in.get());
  return _rmdir(snapdir, name, perms);
}
// =============================
// expose caps

int Client::get_caps_issued(int fd)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  return f->inode->caps_issued();
}

int Client::get_caps_issued(const char *path, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath p(path);
  InodeRef in;
  int r = path_walk(p, &in, perms, true);
  if (r < 0)
    return r;
  return in->caps_issued();
}
// =========================================
// low level

Inode *Client::open_snapdir(Inode *diri)
{
  Inode *in;
  vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
  if (!inode_map.count(vino)) {
    in = new Inode(this, vino, &diri->layout);

    in->ino = diri->ino;
    in->snapid = CEPH_SNAPDIR;
    in->mode = diri->mode;
    in->uid = diri->uid;
    in->gid = diri->gid;
    in->mtime = diri->mtime;
    in->ctime = diri->ctime;
    in->btime = diri->btime;
    in->size = diri->size;
    in->change_attr = diri->change_attr;

    in->dirfragtree.clear();
    in->snapdir_parent = diri;
    diri->flags |= I_SNAPDIR_OPEN;
    inode_map[vino] = in;
    if (use_faked_inos())
      _assign_faked_ino(in);
    ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
  } else {
    in = inode_map[vino];
    ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
  }
  return in;
}
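/*
 * Illustrative note (hedged commentary): the snapdir is a synthetic
 * inode keyed as vinodeno_t(diri->ino, CEPH_SNAPDIR), so each
 * directory gets at most one ".snap" inode; it mirrors the parent's
 * mode/uid/gid/timestamps but carries an empty dirfragtree of its own.
 */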
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookup" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  if (!cct->_conf->fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  assert(in);
  fill_stat(in, attr);
  _ll_get(in.get());

 out:
  ldout(cct, 3) << "ll_lookup " << vparent << " " << name
		<< " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  *out = in.get();
  return r;
}
int Client::ll_lookup_inode(
    struct inodeno_t ino,
    const UserPerm& perms,
    Inode **inode)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;

  // Num1: get inode and *inode
  int r = _lookup_ino(ino, perms, inode);
  if (r) {
    return r;
  }
  assert(inode != NULL);
  assert(*inode != NULL);

  // Num2: Request the parent inode, so that we can look up the name
  Inode *parent;
  r = _lookup_parent(*inode, perms, &parent);
  if (r && r != -EINVAL) {
    // Unexpected error
    _ll_forget(*inode, 1);
    return r;
  } else if (r == -EINVAL) {
    // EINVAL indicates node without parents (root), drop out now
    // and don't try to look up the non-existent dentry.
    return 0;
  }
  // FIXME: I don't think this works; lookup_parent() returns 0 if the parent
  // is already in cache
  assert(parent != NULL);

  // Num3: Finally, get the name (dentry) of the requested inode
  r = _lookup_name(*inode, parent, perms);
  if (r) {
    // Unexpected error
    _ll_forget(parent, 1);
    _ll_forget(*inode, 1);
    return r;
  }

  _ll_forget(parent, 1);
  return 0;
}
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  if (unmounting)
    return -ENOTCONN;

  int r = 0;
  if (!cct->_conf->fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r < 0) {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  } else {
    assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());
  }

  ldout(cct, 3) << "ll_lookupx " << vparent << " " << name
		<< " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  *out = in.get();
  return r;
}
int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
		    unsigned int want, unsigned int flags, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  filepath fp(name, 0);
  InodeRef in;
  int rc;
  unsigned mask = statx_to_mask(flags, want);

  ldout(cct, 3) << "ll_walk " << name << dendl;
  tout(cct) << "ll_walk" << std::endl;
  tout(cct) << name << std::endl;

  rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
  if (rc < 0) {
    /* zero out mask, just in case... */
    stx->stx_mask = 0;
    stx->stx_ino = 0;
    *out = NULL;
    return rc;
  } else {
    assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());
    *out = in.get();
    return 0;
  }
}
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    in->get();
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
  }
  in->ll_get();
  ldout(cct, 20) << "_ll_get " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}

int Client::_ll_put(Inode *in, int num)
{
  in->ll_put(num);
  ldout(cct, 20) << "_ll_put " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dn_set.empty()) {
      assert(in->dn_set.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    put_inode(in);
    return 0;
  } else {
    return in->ll_ref;
  }
}
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << "_ll_drop_pins" << dendl;
  std::set<InodeRef> to_be_put; // this set will be destructed item by item on exit
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    next = it;
    ++next;
    if (in->ll_ref) {
      to_be_put.insert(in);
      _ll_put(in, in->ll_ref);
    }
  }
}
bool Client::_ll_forget(Inode *in, int count)
{
  inodeno_t ino = _get_inodeno(in);

  ldout(cct, 8) << "ll_forget " << ino << " " << count << dendl;
  tout(cct) << "ll_forget" << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  if (unmounting)
    return true;

  if (ino == 1) return true;  // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}

bool Client::ll_forget(Inode *in, int count)
{
  Mutex::Locker lock(client_lock);
  return _ll_forget(in, count);
}

bool Client::ll_put(Inode *in)
{
  /* ll_forget already takes the lock */
  return ll_forget(in, 1);
}
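/*
 * Illustrative note (hedged commentary): ll_ref counts references
 * handed out to the low-level (FUSE) layer.  A typical lifetime is
 * ll_lookup() pinning the inode, the kernel holding it across n
 * lookups, then ll_forget(in, n) releasing them; ll_put(in) is simply
 * ll_forget(in, 1).
 */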
snapid_t Client::ll_get_snapid(Inode *in)
{
  Mutex::Locker lock(client_lock);
  return in->snapid;
}

Inode *Client::ll_get_inode(ino_t ino)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return NULL;

  vinodeno_t vino = _map_faked_ino(ino);
  unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
  if (p == inode_map.end())
    return NULL;
  Inode *in = p->second;
  _ll_get(in);
  return in;
}

Inode *Client::ll_get_inode(vinodeno_t vino)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return NULL;

  unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
  if (p == inode_map.end())
    return NULL;
  Inode *in = p->second;
  _ll_get(in);
  return in;
}
int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << "ll_getattr " << vino << dendl;
  tout(cct) << "ll_getattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  if (vino.snapid < CEPH_NOSNAP)
    return 0;
  else
    return _getattr(in, caps, perms);
}

int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);

  if (res == 0)
    fill_stat(in, attr);
  ldout(cct, 3) << "ll_getattr " << _get_vino(in) << " = " << res << dendl;
  return res;
}
int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
			unsigned int flags, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  int res = 0;
  unsigned mask = statx_to_mask(flags, want);

  if (mask && !in->caps_issued_mask(mask, true))
    res = _ll_getattr(in, mask, perms);

  if (res == 0)
    fill_statx(in, mask, stx);
  ldout(cct, 3) << "ll_getattrx " << _get_vino(in) << " = " << res << dendl;
  return res;
}
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << "ll_setattrx " << vino << " mask " << hex << mask << dec
		<< dendl;
  tout(cct) << "ll_setattrx" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}

int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef target(in);
  int res = _ll_setattrx(in, stx, mask, perms, &target);
  if (res == 0) {
    assert(in == target.get());
    fill_statx(in, in->caps_issued(), stx);
  }

  ldout(cct, 3) << "ll_setattrx " << _get_vino(in) << " = " << res << dendl;
  return res;
}

int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
		       const UserPerm& perms)
{
  struct ceph_statx stx;
  stat_to_statx(attr, &stx);

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef target(in);
  int res = _ll_setattrx(in, &stx, mask, perms, &target);
  if (res == 0) {
    assert(in == target.get());
    fill_stat(in, attr);
  }

  ldout(cct, 3) << "ll_setattr " << _get_vino(in) << " = " << res << dendl;
  return res;
}
// ----------
// xattrs

int Client::getxattr(const char *path, const char *name, void *value, size_t size,
		     const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
  if (r < 0)
    return r;
  return _getxattr(in, name, value, size, perms);
}

int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
		      const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
  if (r < 0)
    return r;
  return _getxattr(in, name, value, size, perms);
}

int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
		      const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  return _getxattr(f->inode, name, value, size, perms);
}
int Client::listxattr(const char *path, char *list, size_t size,
		      const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
  if (r < 0)
    return r;
  return Client::_listxattr(in.get(), list, size, perms);
}

int Client::llistxattr(const char *path, char *list, size_t size,
		       const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
  if (r < 0)
    return r;
  return Client::_listxattr(in.get(), list, size, perms);
}

int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  return Client::_listxattr(f->inode.get(), list, size, perms);
}
int Client::removexattr(const char *path, const char *name,
			const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  return _removexattr(in, name, perms);
}

int Client::lremovexattr(const char *path, const char *name,
			 const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  return _removexattr(in, name, perms);
}

int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  return _removexattr(f->inode, name, perms);
}
int Client::setxattr(const char *path, const char *name, const void *value,
		     size_t size, int flags, const UserPerm& perms)
{
  _setxattr_maybe_wait_for_osdmap(name, value, size);

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  return _setxattr(in, name, value, size, flags, perms);
}

int Client::lsetxattr(const char *path, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  _setxattr_maybe_wait_for_osdmap(name, value, size);

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  return _setxattr(in, name, value, size, flags, perms);
}

int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
		      int flags, const UserPerm& perms)
{
  _setxattr_maybe_wait_for_osdmap(name, value, size);

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  return _setxattr(f->inode, name, value, size, flags, perms);
}
int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
		      const UserPerm& perms)
{
  int r;

  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr) {
    r = -ENODATA;

    // Do a force getattr to get the latest quota before returning
    // a value to userspace.
    int flags = 0;
    if (vxattr->flags & VXATTR_RSTAT) {
      flags |= CEPH_STAT_RSTAT;
    }
    r = _getattr(in, flags, perms, true);
    if (r != 0) {
      // Error from getattr!
      goto out;
    }

    // call pointer-to-member function
    char buf[256];
    if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
      r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
    } else {
      r = -ENODATA;
    }

    if (size != 0) {
      if (r > (int)size) {
	r = -ERANGE;
      } else if (r > 0) {
	memcpy(value, buf, r);
      }
    }
    goto out;
  }

  if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
    r = -EOPNOTSUPP;
    goto out;
  }

  r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    string n(name);
    r = -ENODATA;
    if (in->xattrs.count(n)) {
      r = in->xattrs[n].length();
      if (r > 0 && size != 0) {
	if (size >= (unsigned)r)
	  memcpy(value, in->xattrs[n].c_str(), r);
	else
	  r = -ERANGE;
      }
    }
  }
 out:
  ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
  return r;
}

int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
		      const UserPerm& perms)
{
  if (cct->_conf->client_permissions) {
    int r = xattr_permission(in.get(), name, MAY_READ, perms);
    if (r < 0)
      return r;
  }
  return _getxattr(in.get(), name, value, size, perms);
}
int Client::ll_getxattr(Inode *in, const char *name, void *value,
			size_t size, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_getxattr " << vino << " " << name << " size " << size << dendl;
  tout(cct) << "ll_getxattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_READ, perms);
    if (r < 0)
      return r;
  }

  return _getxattr(in, name, value, size, perms);
}
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	 p != in->xattrs.end();
	 ++p)
      r += p->first.length() + 1;

    const VXattr *vxattrs = _get_vxattrs(in);
    r += _vxattrs_name_size(vxattrs);

    if (size != 0) {
      if (size >= (unsigned)r) {
	for (map<string,bufferptr>::iterator p = in->xattrs.begin();
	     p != in->xattrs.end();
	     ++p) {
	  memcpy(name, p->first.c_str(), p->first.length());
	  name += p->first.length();
	  *name = '\0';
	  name++;
	}
	if (vxattrs) {
	  for (int i = 0; !vxattrs[i].name.empty(); i++) {
	    const VXattr& vxattr = vxattrs[i];
	    if (vxattr.hidden)
	      continue;
	    // call pointer-to-member function
	    if(vxattr.exists_cb && !(this->*(vxattr.exists_cb))(in))
	      continue;
	    memcpy(name, vxattr.name.c_str(), vxattr.name.length());
	    name += vxattr.name.length();
	    *name = '\0';
	    name++;
	  }
	}
      } else
	r = -ERANGE;
    }
  }
  ldout(cct, 8) << "_listxattr(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}

int Client::ll_listxattr(Inode *in, char *names, size_t size,
			 const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_listxattr " << vino << " size " << size << dendl;
  tout(cct) << "ll_listxattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << size << std::endl;

  return _listxattr(in, names, size, perms);
}
int Client::_do_setxattr(Inode *in, const char *name, const void *value,
			 size_t size, int flags, const UserPerm& perms)
{
  int xattr_flags = 0;
  if (!value)
    xattr_flags |= CEPH_XATTR_REMOVE;
  if (flags & XATTR_CREATE)
    xattr_flags |= CEPH_XATTR_CREATE;
  if (flags & XATTR_REPLACE)
    xattr_flags |= CEPH_XATTR_REPLACE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_string2(name);
  req->set_inode(in);
  req->head.args.setxattr.flags = xattr_flags;

  bufferlist bl;
  bl.append((const char*)value, size);
  req->set_data(bl);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << "_setxattr(" << in->ino << ", \"" << name << "\") = " <<
    res << dendl;
  return res;
}
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	if (!S_ISDIR(in->mode))
	  return -EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -EINVAL;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -EOPNOTSUPP;
    }
  } else {
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr && vxattr->readonly)
      return -EOPNOTSUPP;
  }

  return _do_setxattr(in, name, value, size, flags, perms);
}

int Client::_setxattr(InodeRef &in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  if (cct->_conf->client_permissions) {
    int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }
  return _setxattr(in.get(), name, value, size, flags, perms);
}
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p;    // create instance of parser
    std::map<string, string> m;             // map to receive results
    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
      return -EINVAL;
    }
    if (begin != end)
      return -EINVAL;
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    int64_t pool;
    try {
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -ENOENT;
      }
    }
  }

  return 0;
}
void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
{
  // For setting pool of layout, MetaRequest need osdmap epoch.
  // There is a race which create a new data pool but client and mds both don't have.
  // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
  if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
      strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
    string rest(strstr(name, "layout"));
    string v((const char*)value, size);
    int r = objecter->with_osdmap([&](const OSDMap& o) {
      return _setxattr_check_data_pool(rest, v, &o);
    });

    if (r == -ENOENT) {
      C_SaferCond ctx;
      objecter->wait_for_latest_osdmap(&ctx);
      ctx.wait();
    }
  }
}
int Client::ll_setxattr(Inode *in, const char *name, const void *value,
			size_t size, int flags, const UserPerm& perms)
{
  _setxattr_maybe_wait_for_osdmap(name, value, size);

  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_setxattr " << vino << " " << name << " size " << size << dendl;
  tout(cct) << "ll_setxattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }
  return _setxattr(in, name, value, size, flags, perms);
}
int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // same xattrs supported by kernel client
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "system.", 7) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5))
    return -EOPNOTSUPP;

  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr && vxattr->readonly)
    return -EOPNOTSUPP;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_filepath2(name);
  req->set_inode(in);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
  return res;
}

int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
{
  if (cct->_conf->client_permissions) {
    int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }
  return _removexattr(in.get(), name, perms);
}

int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
  tout(cct) << "ll_removexattr" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }

  return _removexattr(in, name, perms);
}
11401 bool Client::_vxattrcb_quota_exists(Inode
*in
)
11403 return in
->quota
.is_enable();
11405 size_t Client::_vxattrcb_quota(Inode
*in
, char *val
, size_t size
)
11407 return snprintf(val
, size
,
11408 "max_bytes=%lld max_files=%lld",
11409 (long long int)in
->quota
.max_bytes
,
11410 (long long int)in
->quota
.max_files
);
11412 size_t Client::_vxattrcb_quota_max_bytes(Inode
*in
, char *val
, size_t size
)
11414 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_bytes
);
11416 size_t Client::_vxattrcb_quota_max_files(Inode
*in
, char *val
, size_t size
)
11418 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_files
);
11421 bool Client::_vxattrcb_layout_exists(Inode
*in
)
11423 return in
->layout
!= file_layout_t();
size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
{
  int r = snprintf(val, size,
      "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
      (unsigned long long)in->layout.stripe_unit,
      (unsigned long long)in->layout.stripe_count,
      (unsigned long long)in->layout.object_size);
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
        r += snprintf(val + r, size - r, "%s",
                      o.get_pool_name(in->layout.pool_id).c_str());
      else
        r += snprintf(val + r, size - r, "%" PRIu64,
                      (uint64_t)in->layout.pool_id);
      if (in->layout.pool_ns.length())
        r += snprintf(val + r, size - r, " pool_namespace=%s",
                      in->layout.pool_ns.c_str());
    });
  return r;
}
size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_unit);
}
size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_count);
}
size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->layout.object_size);
}
size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
{
  size_t r;
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
        r = snprintf(val, size, "%s", o.get_pool_name(
                       in->layout.pool_id).c_str());
      else
        r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
    });
  return r;
}
size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
}
size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
}
size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nfiles);
}
size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nsubdirs);
}
size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
}
size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rfiles);
}
size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rsubdirs);
}
size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rbytes);
}
size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
{
  // nanoseconds zero-padded to 9 digits (the original "%ld.09%ld"
  // format string printed a literal "09" instead of padding)
  return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
                  (long)in->rstat.rctime.nsec());
}
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

#define XATTR_NAME_CEPH(_type, _name)                                  \
{                                                                      \
  name: CEPH_XATTR_NAME(_type, _name),                                 \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,             \
  readonly: true,                                                      \
  hidden: false,                                                       \
  exists_cb: NULL,                                                     \
}
#define XATTR_NAME_CEPH2(_type, _name, _flags)                         \
{                                                                      \
  name: CEPH_XATTR_NAME(_type, _name),                                 \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,             \
  readonly: true,                                                      \
  hidden: false,                                                       \
  exists_cb: NULL,                                                     \
  flags: _flags,                                                       \
}
#define XATTR_LAYOUT_FIELD(_type, _name, _field)                       \
{                                                                      \
  name: CEPH_XATTR_NAME2(_type, _name, _field),                        \
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,            \
  readonly: false,                                                     \
  hidden: true,                                                        \
  exists_cb: &Client::_vxattrcb_layout_exists,                         \
}
#define XATTR_QUOTA_FIELD(_type, _name)                                \
{                                                                      \
  name: CEPH_XATTR_NAME(_type, _name),                                 \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,             \
  readonly: false,                                                     \
  hidden: true,                                                        \
  exists_cb: &Client::_vxattrcb_quota_exists,                          \
}
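
/*
 * For illustration, XATTR_LAYOUT_FIELD(dir, layout, pool) expands to
 * an initializer equivalent to:
 *
 *   {
 *     name: "ceph.dir.layout.pool",
 *     getxattr_cb: &Client::_vxattrcb_layout_pool,
 *     readonly: false,
 *     hidden: true,
 *     exists_cb: &Client::_vxattrcb_layout_exists,
 *   }
 *
 * i.e. a hidden, writeable field that only "exists" while the inode
 * carries a non-default file layout.
 */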
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries),
  XATTR_NAME_CEPH(dir, files),
  XATTR_NAME_CEPH(dir, subdirs),
  XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_quota_exists,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  { name: "" } /* Required table terminator */
};
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  { name: "" } /* Required table terminator */
};
const Client::VXattr *Client::_get_vxattrs(Inode *in)
{
  if (in->is_dir())
    return _dir_vxattrs;
  else if (in->is_file())
    return _file_vxattrs;
  return NULL;
}

const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
{
  if (strncmp(name, "ceph.", 5) == 0) {
    const VXattr *vxattr = _get_vxattrs(in);
    if (vxattr) {
      while (!vxattr->name.empty()) {
        if (vxattr->name == name)
          return vxattr;
        vxattr++;
      }
    }
  }
  return NULL;
}

size_t Client::_vxattrs_calcu_name_size(const VXattr *vxattr)
{
  size_t len = 0;
  while (!vxattr->name.empty()) {
    if (!vxattr->hidden)
      len += vxattr->name.length() + 1;
    vxattr++;
  }
  return len;
}
int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_readlink " << vino << dendl;
  tout(cct) << "ll_readlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  set<Dentry*>::iterator dn = in->dn_set.begin();
  while (dn != in->dn_set.end()) {
    touch_dn(*dn);
    ++dn;
  }

  int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
  ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
  return r;
}
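
/*
 * _mknod: build a CEPH_MDS_OP_MKNOD MetaRequest for a new dentry under
 * "dir".  Inherited POSIX ACLs are attached to the request as xattr
 * data, and the cached dentry is dropped (dentry_drop) unless the
 * client holds CEPH_CAP_FILE_EXCL on the directory (dentry_unless).
 */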
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
                   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
                << mode << dec << ", " << rdev << ", uid " << perms.uid()
                << ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
                     dev_t rdev, struct stat *attr, Inode **out,
                     const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknod" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mknod " << vparent << " " << name
                << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
                      dev_t rdev, Inode **out,
                      struct ceph_statx *stx, unsigned want, unsigned flags,
                      const UserPerm& perms)
{
  unsigned caps = statx_to_mask(flags, want);
  Mutex::Locker lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknodx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    fill_statx(in, caps, stx);
    _ll_get(in.get());
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
                << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
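
/*
 * _create: like _mknod, but issues CEPH_MDS_OP_CREATE with the open
 * flags (forced to include O_CREAT on the wire) plus any explicit
 * file layout (stripe_unit/stripe_count/object_size/pool).  When the
 * caller passes fhp, the newly created inode is also opened and a
 * file handle returned, avoiding a second round trip to the MDS.
 */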
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
                    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
                    int object_size, const char *data_pool, bool *created,
                    const UserPerm& perms)
{
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized flags to generate cmode
  int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
  if (cmode < 0)
    return -EINVAL;

  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    if (pool_id > 0xffffffffll)
      return -ERANGE;  // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = ceph_flags_sys2wire(flags | O_CREAT);

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if (fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
                << " layout " << stripe_unit
                << ' ' << stripe_count
                << ' ' << object_size
                << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
                   InodeRef *inp)
{
  ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
                << mode << dec << ", uid " << perm.uid()
                << ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
                                     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
                     struct stat *attr, Inode **out, const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdir" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perm);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perm, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
                << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
                      struct ceph_statx *stx, unsigned want, unsigned flags,
                      const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mkdirx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mkdir(parent, name, mode, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
                << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
int Client::_symlink(Inode *dir, const char *name, const char *target,
                     const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
                << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
                << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->set_string2(target);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::ll_symlink(Inode *parent, const char *name, const char *value,
                       struct stat *attr, Inode **out, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
                << dendl;
  tout(cct) << "ll_symlink" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, &in);
  if (r == 0) {
    fill_stat(in, attr);
    _ll_get(in.get());
  }
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_symlink " << vparent << " " << name
                << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
                        Inode **out, struct ceph_statx *stx, unsigned want,
                        unsigned flags, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
                << dendl;
  tout(cct) << "ll_symlinkx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << value << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _symlink(parent, name, value, perms, &in);
  if (r == 0) {
    fill_statx(in, statx_to_mask(flags, want), stx);
    _ll_get(in.get());
  }
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
                << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
                << " uid " << perm.uid() << " gid " << perm.gid()
                << ")" << dendl;

  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  InodeRef otherin;
  Inode *in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;

  in = otherin.get();
  req->set_other_inode(in);
  in->break_all_delegs();
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
  tout(cct) << "ll_unlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_delete(in, name, perm);
    if (r < 0)
      return r;
  }
  return _unlink(in, name, perm);
}
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
  ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
                << perms.uid() << " gid " << perms.gid() << ")" << dendl;

  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }

  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  InodeRef in;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  if (op == CEPH_MDS_OP_RMDIR)
    req->set_dentry(de);
  else
    de->get();

  res = _lookup(dir, name, 0, &in, perms);
  if (res < 0)
    goto fail;
  if (op == CEPH_MDS_OP_RMDIR) {
    req->set_inode(dir);
    req->set_other_inode(in.get());
  } else {
    unlink(de, true, true);
    de->put();
    req->set_other_inode(in.get());
  }

  res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
  tout(cct) << "ll_rmdir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_delete(in, name, perms);
    if (r < 0)
      return r;
  }

  return _rmdir(in, name, perms);
}
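
/*
 * _rename refuses to cross snapshot boundaries (-EXDEV), maps renames
 * inside a .snap directory to CEPH_MDS_OP_RENAMESNAP, and, when the
 * source and destination directories differ, requires both to resolve
 * to the same quota root so quota accounting stays consistent.
 */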
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
{
  ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
                << todir->ino << " " << toname
                << " uid " << perm.uid() << " gid " << perm.gid() << ")"
                << dendl;

  if (fromdir->snapid != todir->snapid)
    return -EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -EROFS;
  }
  if (fromdir != todir) {
    Inode *fromdir_root =
      fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
    Inode *todir_root =
      todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
    if (fromdir_root != todir_root) {
      return -EXDEV;
    }
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    res = _lookup(fromdir, fromname, 0, &oldin, perm);
    if (res < 0)
      goto fail;

    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    res = _lookup(todir, toname, 0, &otherin, perm);
    if (res == 0) {
      Inode *in = otherin.get();
      req->set_other_inode(in);
      in->break_all_delegs();

      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
                      const char *newname, const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vparent = _get_vino(parent);
  vinodeno_t vnewparent = _get_vino(newparent);

  ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
                << vnewparent << " " << newname << dendl;
  tout(cct) << "ll_rename" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << vnewparent.ino.val << std::endl;
  tout(cct) << newname << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_delete(parent, name, perm);
    if (r < 0)
      return r;
    r = may_delete(newparent, newname, perm);
    if (r < 0 && r != -ENOENT)
      return r;
  }

  return _rename(parent, name, newparent, newname, perm);
}
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
                << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  filepath path(newname, dir->ino);
  req->set_filepath(path);
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
                    const UserPerm& perm)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vino = _get_vino(in);
  vinodeno_t vnewparent = _get_vino(newparent);

  ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
    newname << dendl;
  tout(cct) << "ll_link" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << vnewparent << std::endl;
  tout(cct) << newname << std::endl;

  int r = 0;
  InodeRef target;

  if (!cct->_conf->fuse_default_permissions) {
    if (S_ISDIR(in->mode))
      return -EPERM;

    r = may_hardlink(in, perm);
    if (r < 0)
      return r;

    r = may_create(newparent, perm);
    if (r < 0)
      return r;
  }

  return _link(in, newparent, newname, perm, &target);
}
int Client::ll_num_osds(void)
{
  Mutex::Locker lock(client_lock);
  return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
}

int Client::ll_osdaddr(int osd, uint32_t *addr)
{
  Mutex::Locker lock(client_lock);

  entity_addr_t g;
  bool exists = objecter->with_osdmap([&](const OSDMap& o) {
      if (!o.exists(osd))
        return false;
      g = o.get_addr(osd);
      return true;
    });
  if (!exists)
    return -1;
  uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
  *addr = ntohl(nb_addr);
  return 0;
}
uint32_t Client::ll_stripe_unit(Inode *in)
{
  Mutex::Locker lock(client_lock);
  return in->layout.stripe_unit;
}

uint64_t Client::ll_snap_seq(Inode *in)
{
  Mutex::Locker lock(client_lock);
  return in->snaprealm->seq;
}

int Client::ll_file_layout(Inode *in, file_layout_t *layout)
{
  Mutex::Locker lock(client_lock);
  *layout = in->layout;
  return 0;
}

int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
{
  return ll_file_layout(fh->inode.get(), layout);
}
/* Currently we cannot take advantage of redundancy in reads, since we
   would have to go through all possible placement groups (a
   potentially quite large number determined by a hash), and use CRUSH
   to calculate the appropriate set of OSDs for each placement group,
   then index into that.  An array with one entry per OSD is much more
   tractable and works for demonstration purposes. */
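
/*
 * Worked example of the block -> object mapping used below: with
 * stripe_unit = 1 MiB, stripe_count = 4 and object_size = 4 MiB
 * (so stripes_per_object = 4), blockno 9 yields stripeno = 9 / 4 = 2,
 * stripepos = 9 % 4 = 1, objectsetno = 2 / 4 = 0 and objectno =
 * 0 * 4 + 1 = 1, i.e. the second object of the first object set.
 */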
int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
                              file_layout_t* layout)
{
  Mutex::Locker lock(client_lock);

  inodeno_t ino = in->ino;
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint32_t stripe_count = layout->stripe_count;
  uint64_t stripes_per_object = object_size / su;

  uint64_t stripeno = blockno / stripe_count;    // which horizontal stripe        (Y)
  uint64_t stripepos = blockno % stripe_count;   // which object in the object set (X)
  uint64_t objectsetno = stripeno / stripes_per_object;        // which object set
  uint64_t objectno = objectsetno * stripe_count + stripepos;  // object id

  object_t oid = file_object_t(ino, objectno);
  return objecter->with_osdmap([&](const OSDMap& o) {
      ceph_object_layout olayout =
        o.file_to_object_layout(oid, *layout);
      pg_t pg = (pg_t)olayout.ol_pgid;
      vector<int> osds;
      int primary;
      o.pg_to_acting_osds(pg, &osds, &primary);
      return osds[0];
    });
}
/* Return the offset of the block, internal to the object */

uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
{
  Mutex::Locker lock(client_lock);
  file_layout_t *layout = &(in->layout);
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint64_t stripes_per_object = object_size / su;

  return (blockno % stripes_per_object) * su;
}
int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
                       const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_opendir " << vino << dendl;
  tout(cct) << "ll_opendir" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  if (!cct->_conf->fuse_default_permissions) {
    int r = may_open(in, flags, perms);
    if (r < 0)
      return r;
  }

  int r = _opendir(in, dirpp, perms);
  tout(cct) << (unsigned long)*dirpp << std::endl;

  ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
                << dendl;
  return r;
}
int Client::ll_releasedir(dir_result_t *dirp)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
  tout(cct) << "ll_releasedir" << std::endl;
  tout(cct) << (unsigned long)dirp << std::endl;

  _closedir(dirp);
  return 0;
}

int Client::ll_fsyncdir(dir_result_t *dirp)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
  tout(cct) << "ll_fsyncdir" << std::endl;
  tout(cct) << (unsigned long)dirp << std::endl;

  return _fsync(dirp->inode.get(), false);
}
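
/*
 * ll_open never creates: O_CREAT is asserted away here, and creation
 * is instead routed through ll_create/ll_createx, which combine the
 * lookup, create and open steps in _ll_create below.
 */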
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  assert(!(flags & O_CREAT));

  Mutex::Locker lock(client_lock);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  int r;
  if (!cct->_conf->fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

 out:
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (unsigned long)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
    " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
                       int flags, InodeRef *in, int caps, Fh **fhp,
                       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
                << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    if (!cct->_conf->fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
        goto out;
    }
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
                perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    if (!cct->_conf->fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
        if (*fhp) {
          int release_r = _release_fh(*fhp);
          assert(release_r == 0);  // during create, no async data ops should have happened
        }
        goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
        goto out;
    }
  }

out:
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
int Client::ll_create(Inode *parent, const char *name, mode_t mode,
                      int flags, struct stat *attr, Inode **outp, Fh **fhp,
                      const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);
  InodeRef in;

  int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
                     fhp, perms);
  if (r >= 0) {
    assert(in);

    // passing an Inode in outp requires an additional ref
    if (outp) {
      _ll_get(in.get());
      *outp = in.get();
    }
    fill_stat(in, attr);
  } else {
    attr->st_ino = 0;
  }

  return r;
}
int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
                       int oflags, Inode **outp, Fh **fhp,
                       struct ceph_statx *stx, unsigned want, unsigned lflags,
                       const UserPerm& perms)
{
  unsigned caps = statx_to_mask(lflags, want);
  Mutex::Locker lock(client_lock);
  InodeRef in;

  int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
  if (r >= 0) {
    assert(in);

    // passing an Inode in outp requires an additional ref
    if (outp) {
      _ll_get(in.get());
      *outp = in.get();
    }
    fill_statx(in, caps, stx);
  } else {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  }

  return r;
}
loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "ll_lseek" << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  return _lseek(fh, offset, whence);
}
int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << off << "~" << len << dendl;
  tout(cct) << "ll_read" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;
  tout(cct) << off << std::endl;
  tout(cct) << len << std::endl;

  return _read(fh, off, len, bl);
}
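
/*
 * ll_read_block/ll_write_block below perform raw object I/O through
 * the Objecter, bypassing the page cache and capability machinery;
 * the object name is derived from the inode number and block id via
 * file_object_t, so callers are responsible for any coherency with
 * regular file I/O on the same inode.
 */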
int Client::ll_read_block(Inode *in, uint64_t blockid,
                          char *buf,
                          uint64_t offset,
                          uint64_t length,
                          file_layout_t* layout)
{
  Mutex::Locker lock(client_lock);

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
                 object_locator_t(layout->pool_id),
                 offset,
                 length,
                 vino.snapid,
                 &bl,
                 CEPH_OSD_FLAG_READ,
                 &onfinish);

  client_lock.Unlock();
  int r = onfinish.wait();
  client_lock.Lock();

  if (r >= 0) {
    bl.copy(0, bl.length(), buf);
    r = bl.length();
  }

  return r;
}
/* It appears that the OSD doesn't return success unless the entire
   buffer was written, return the write length on success. */

int Client::ll_write_block(Inode *in, uint64_t blockid,
                           char* buf, uint64_t offset,
                           uint64_t length, file_layout_t* layout,
                           uint64_t snapseq, uint32_t sync)
{
  Mutex flock("Client::ll_write_block flock");
  vinodeno_t vino = ll_get_vino(in);
  Cond cond;
  bool done;
  int r = 0;
  Context *onsafe = nullptr;

  if (length == 0) {
    return -EINVAL;
  }
  if (true || sync) {
    /* if write is stable, the epilogue is waiting on
     * flock */
    onsafe = new C_SafeCond(&flock, &cond, &done, &r);
    done = false;
  } else {
    /* if write is unstable, we just place a barrier for
     * future commits to wait on */
    /*onsafe = new C_Block_Sync(this, vino.ino,
                              barrier_interval(offset, offset + length), &r);
    */
    done = true;
  }
  object_t oid = file_object_t(vino.ino, blockid);
  SnapContext fakesnap;
  bufferptr bp;
  if (length > 0) bp = buffer::copy(buf, length);
  bufferlist bl;
  bl.push_back(bp);

  ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
                << dendl;

  fakesnap.seq = snapseq;

  /* lock just in time */
  client_lock.Lock();
  if (unmounting) {
    client_lock.Unlock();
    delete onsafe;
    return -ENOTCONN;
  }

  objecter->write(oid,
                  object_locator_t(layout->pool_id),
                  offset,
                  length,
                  fakesnap,
                  bl,
                  ceph::real_clock::now(),
                  0,
                  onsafe);

  client_lock.Unlock();
  if (!done /* also !sync */) {
    flock.Lock();
    while (!done)
      cond.Wait(flock);
    flock.Unlock();
  }

  if (r < 0) {
    return r;
  } else {
    return length;
  }
}
int Client::ll_commit_blocks(Inode *in,
                             uint64_t offset,
                             uint64_t length)
{
    Mutex::Locker lock(client_lock);
    /*
    BarrierContext *bctx;
    vinodeno_t vino = _get_vino(in);
    uint64_t ino = vino.ino;

    ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
                  << offset << " to " << length << dendl;

    map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
    if (p != barriers.end()) {
      barrier_interval civ(offset, offset + length);
      p->second->commit_barrier(civ);
    }
    */
    return 0;
}
int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
    "~" << len << dendl;
  tout(cct) << "ll_write" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;
  tout(cct) << off << std::endl;
  tout(cct) << len << std::endl;

  int r = _write(fh, off, len, data, NULL, 0);
  ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
                << dendl;
  return r;
}
int Client::ll_flush(Fh *fh)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_flush" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  return _flush(fh);
}
int Client::ll_fsync(Fh *fh, bool syncdataonly)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_fsync" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  int r = _fsync(fh, syncdataonly);
  if (r) {
    // If we're returning an error, clear it from the FH
    fh->take_async_err();
  }
  return r;
}
int Client::ll_sync_inode(Inode *in, bool syncdataonly)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
  tout(cct) << "ll_sync_inode" << std::endl;
  tout(cct) << (unsigned long)in << std::endl;

  return _fsync(in, syncdataonly);
}
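
/*
 * _fallocate supports only FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE,
 * and punching a hole is only allowed together with KEEP_SIZE.  Holes
 * are punched by invalidating the cached range and zeroing the backing
 * objects via the Filer; plain allocation just extends the file size,
 * since RADOS objects need no explicit space reservation.
 */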
#ifdef FALLOC_FL_PUNCH_HOLE

int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  if (offset < 0 || length <= 0)
    return -EINVAL;

  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  uint64_t size = offset + length;
  std::list<InodeRef> quota_roots;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms, &quota_roots)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  Mutex uninline_flock("Client::_fallocate_uninline_data flock");
  Cond uninline_cond;
  bool uninline_done = false;
  int uninline_ret = 0;
  Context *onuninline = NULL;

  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      bufferlist bl;
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          in->inline_data.copy(0, offset, bl);
        int size = length;
        if (offset + size > len)
          size = len - offset;
        if (size > 0)
          bl.append_zero(size);
        if (offset + size < len)
          in->inline_data.copy(offset + size, len - offset - size, bl);
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline = new C_SafeCond(&uninline_flock,
                                    &uninline_cond,
                                    &uninline_done,
                                    &uninline_ret);
        uninline_data(in, onuninline);
      }

      Mutex flock("Client::_punch_hole flock");
      Cond cond;
      bool done = false;
      Context *onfinish = new C_SafeCond(&flock, &cond, &done);

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
                  in->snaprealm->get_snap_context(),
                  offset, length,
                  ceph::real_clock::now(),
                  0, true, onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      client_lock.Unlock();
      flock.Lock();
      while (!done)
        cond.Wait(flock);
      flock.Unlock();
      client_lock.Lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, quota_roots)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
        check_caps(in, 0);
      }
    }
  }

  if (onuninline) {
    client_lock.Unlock();
    uninline_flock.Lock();
    while (!uninline_done)
      uninline_cond.Wait(uninline_flock);
    uninline_flock.Unlock();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
#else

int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  return -EOPNOTSUPP;
}

#endif
int Client::ll_fallocate(Fh *fh, int mode, loff_t offset, loff_t length)
{
  Mutex::Locker lock(client_lock);
  ldout(cct, 3) << "ll_fallocate " << fh << " " << fh->inode->ino << " " << dendl;
  tout(cct) << "ll_fallocate " << mode << " " << offset << " " << length << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  return _fallocate(fh, mode, offset, length);
}
int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
{
  Mutex::Locker lock(client_lock);
  tout(cct) << "fallocate " << " " << fd << mode << " " << offset << " " << length << std::endl;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  return _fallocate(fh, mode, offset, length);
}
int Client::ll_release(Fh *fh)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "ll_release (fh)" << fh << " " << fh->inode->ino << " " <<
    dendl;
  tout(cct) << "ll_release (fh)" << std::endl;
  tout(cct) << (unsigned long)fh << std::endl;

  if (ll_unclosed_fh_set.count(fh))
    ll_unclosed_fh_set.erase(fh);
  return _release_fh(fh);
}
int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
  tout(cct) << "ll_getlk (fh)" << (unsigned long)fh << std::endl;  // was misspelled "ll_getk"

  return _getlk(fh, fl, owner);
}
int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "ll_setlk (fh) " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << "ll_setlk (fh)" << (unsigned long)fh << std::endl;  // was misspelled "ll_setk"

  return _setlk(fh, fl, owner, sleep);
}
int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
{
  Mutex::Locker lock(client_lock);

  ldout(cct, 3) << "ll_flock (fh) " << fh << " " << fh->inode->ino << dendl;
  tout(cct) << "ll_flock (fh)" << (unsigned long)fh << std::endl;

  return _flock(fh, cmd, owner);
}
int Client::set_deleg_timeout(uint32_t timeout)
{
  Mutex::Locker lock(client_lock);

  /*
   * The whole point is to prevent blacklisting so we must time out the
   * delegation before the session autoclose timeout kicks in.
   */
  if (timeout >= mdsmap->get_session_autoclose())
    return -EINVAL;

  deleg_timeout = timeout;
  return 0;
}
int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
{
  int ret = -EINVAL;

  Mutex::Locker lock(client_lock);

  Inode *inode = fh->inode.get();

  switch(cmd) {
  case CEPH_DELEGATION_NONE:
    inode->unset_deleg(fh);
    ret = 0;
    break;
  default:
    try {
      ret = inode->set_deleg(fh, cmd, cb, priv);
    } catch (std::bad_alloc&) {  // catch by reference, not by value
      ret = -ENOMEM;
    }
    break;
  }
  return ret;
}
class C_Client_RequestInterrupt : public Context {
private:
  Client *client;
  MetaRequest *req;
public:
  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
    req->get();
  }
  void finish(int r) override {
    Mutex::Locker l(client->client_lock);
    assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
    client->_interrupt_filelock(req);
    client->put_request(req);
  }
};

void Client::ll_interrupt(void *d)
{
  MetaRequest *req = static_cast<MetaRequest*>(d);
  ldout(cct, 3) << "ll_interrupt tid " << req->get_tid() << dendl;
  tout(cct) << "ll_interrupt tid " << req->get_tid() << std::endl;
  interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
}
// =========================================
// expose file layouts

int Client::describe_layout(const char *relpath, file_layout_t *lp,
                            const UserPerm& perms)
{
  Mutex::Locker lock(client_lock);

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;

  *lp = in->layout;

  ldout(cct, 3) << "describe_layout(" << relpath << ") = 0" << dendl;
  return 0;
}
int Client::fdescribe_layout(int fd, file_layout_t *lp)
{
  Mutex::Locker lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  *lp = in->layout;

  ldout(cct, 3) << "fdescribe_layout(" << fd << ") = 0" << dendl;
  return 0;
}
int64_t Client::get_default_pool_id()
{
  Mutex::Locker lock(client_lock);
  /* first data pool is the default */
  return mdsmap->get_first_data_pool();
}
int64_t Client::get_pool_id(const char *pool_name)
{
  Mutex::Locker lock(client_lock);
  return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
                               pool_name);
}
string Client::get_pool_name(int64_t pool)
{
  Mutex::Locker lock(client_lock);
  return objecter->with_osdmap([pool](const OSDMap& o) {
      return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
    });
}
int Client::get_pool_replication(int64_t pool)
{
  Mutex::Locker lock(client_lock);
  return objecter->with_osdmap([pool](const OSDMap& o) {
      return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
    });
}
int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
{
  Mutex::Locker lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
  assert(extents.size() == 1);

  objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      o.pg_to_acting_osds(pg, osds);
    });

  if (osds.empty())
    return -EINVAL;

  /*
   * Return the remainder of the extent (stripe unit)
   *
   * If length = 1 is passed to Striper::file_to_extents we get a single
   * extent back, but its length is one so we still need to compute the length
   * to the end of the stripe unit.
   *
   * If length = su then we may get 1 or 2 objects back in the extents vector
   * which would have to be examined. Even then, the offsets are local to the
   * object, so matching up to the file offset is extra work.
   *
   * It seems simpler to stick with length = 1 and manually compute the
   * remainder.
   */
  uint64_t su = in->layout.stripe_unit;
  *len = su - (off % su);

  return 0;
}
int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
{
  Mutex::Locker lock(client_lock);

  return objecter->with_osdmap([&](const OSDMap& o) {
      return o.crush->get_full_location_ordered(id, path);
    });
}
int Client::get_file_stripe_address(int fd, loff_t offset,
                                    vector<entity_addr_t>& address)
{
  Mutex::Locker lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // which object?
  vector<ObjectExtent> extents;
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
                           in->truncate_size, extents);
  assert(extents.size() == 1);

  // now we have the object and its 'layout'
  return objecter->with_osdmap([&](const OSDMap& o) {
      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
      vector<int> osds;
      o.pg_to_acting_osds(pg, osds);
      if (osds.empty())
        return -EINVAL;
      for (unsigned i = 0; i < osds.size(); i++) {
        entity_addr_t addr = o.get_addr(osds[i]);
        address.push_back(addr);
      }
      return 0;
    });
}
int Client::get_osd_addr(int osd, entity_addr_t& addr)
{
  Mutex::Locker lock(client_lock);

  return objecter->with_osdmap([&](const OSDMap& o) {
      if (!o.exists(osd))
        return -ENOENT;

      addr = o.get_addr(osd);
      return 0;
    });
}
int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
                             loff_t length, loff_t offset)
{
  Mutex::Locker lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  Inode *in = f->inode.get();

  // map to a list of extents
  Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);

  ldout(cct, 3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
  return 0;
}
/* find an osd with the same ip.  -ENXIO if none. */
int Client::get_local_osd()
{
  Mutex::Locker lock(client_lock);

  objecter->with_osdmap([this](const OSDMap& o) {
      if (o.get_epoch() != local_osd_epoch) {
        local_osd = o.find_osd_on_ip(messenger->get_myaddr());
        local_osd_epoch = o.get_epoch();
      }
    });
  return local_osd;
}
// ===============================

void Client::ms_handle_connect(Connection *con)
{
  ldout(cct, 10) << "ms_handle_connect on " << con->get_peer_addr() << dendl;
}
bool Client::ms_handle_reset(Connection *con)
{
  ldout(cct, 0) << "ms_handle_reset on " << con->get_peer_addr() << dendl;
  return false;
}
void Client::ms_handle_remote_reset(Connection *con)
{
  ldout(cct, 0) << "ms_handle_remote_reset on " << con->get_peer_addr() << dendl;
  Mutex::Locker l(client_lock);
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSession *s = NULL;
      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
           p != mds_sessions.end();
           ++p) {
        if (mdsmap->get_addr(p->first) == con->get_peer_addr()) {
          mds = p->first;
          s = p->second;
        }
      }
      if (mds >= 0) {
        assert (s != NULL);
        switch (s->state) {
        case MetaSession::STATE_CLOSING:
          ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
          _closed_mds_session(s);
          break;

        case MetaSession::STATE_OPENING:
          {
            ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
            list<Context*> waiters;
            waiters.swap(s->waiting_for_open);
            _closed_mds_session(s);
            MetaSession *news = _get_or_open_mds_session(mds);
            news->waiting_for_open.swap(waiters);
            _kick_stale_sessions();
          }
          break;

        case MetaSession::STATE_OPEN:
          {
            objecter->maybe_request_map(); /* to check if we are blacklisted */
            const md_config_t *conf = cct->_conf;
            if (conf->client_reconnect_stale) {
              ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
              _closed_mds_session(s);
            } else {
              ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
              s->state = MetaSession::STATE_STALE;
            }
          }
          break;

        case MetaSession::STATE_NEW:
        case MetaSession::STATE_CLOSED:
        default:
          break;
        }
      }
    }
    break;
  }
}
bool Client::ms_handle_refused(Connection *con)
{
  ldout(cct, 1) << "ms_handle_refused on " << con->get_peer_addr() << dendl;
  return false;
}
bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
{
  if (dest_type == CEPH_ENTITY_TYPE_MON)
    return true;
  *authorizer = monclient->build_authorizer(dest_type);
  return true;
}
Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
{
  Inode *cur = in;
  utime_t now = ceph_clock_now();

  while (cur) {
    if (cur != in && cur->quota.is_enable())
      break;

    Inode *parent_in = NULL;
    if (!cur->dn_set.empty()) {
      for (auto p = cur->dn_set.begin(); p != cur->dn_set.end(); ++p) {
        Dentry *dn = *p;
        if (dn->lease_mds >= 0 &&
            dn->lease_ttl > now &&
            mds_sessions.count(dn->lease_mds)) {
          parent_in = dn->dir->parent_inode;
        } else {
          Inode *diri = dn->dir->parent_inode;
          if (diri->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
              diri->shared_gen == dn->cap_shared_gen) {
            parent_in = dn->dir->parent_inode;
          }
        }
        if (parent_in)
          break;
      }
    } else if (root_parents.count(cur)) {
      parent_in = root_parents[cur].get();
    }

    if (parent_in) {
      cur = parent_in;
      continue;
    }

    if (cur == root_ancestor)
      break;

    // deleted inode
    if (cur->nlink == 0) {
      cur = root_ancestor;
      break;
    }

    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
    filepath path(cur->ino);
    req->set_filepath(path);
    req->set_inode(cur);

    InodeRef parent_ref;
    int ret = make_request(req, perms, &parent_ref);
    if (ret < 0) {
      ldout(cct, 1) << __func__ << " " << in->vino()
                    << " failed to find parent of " << cur->vino()
                    << " err " << ret << dendl;
      // FIXME: what to do?
      cur = root_ancestor;
      break;
    }

    now = ceph_clock_now();
    if (cur == in)
      cur = parent_ref.get();
    else
      cur = in; // start over
  }

  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << cur->vino() << dendl;
  return cur;
}
/**
 * Traverse quota ancestors of the Inode, return true
 * if any of them passes the passed function
 */
bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
                                   std::function<bool (const Inode &in)> test)
{
  while (true) {
    assert(in != NULL);
    if (test(*in)) {
      return true;
    }

    if (in == root_ancestor) {
      // We're done traversing, drop out
      return false;
    } else {
      // Continue up the tree
      in = get_quota_root(in, perms);
    }
  }

  return false;
}
bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
{
  return check_quota_condition(in, perms,
      [](const Inode &in) {
        return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
      });
}
bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
                                     const UserPerm& perms,
                                     std::list<InodeRef>* quota_roots)
{
  return check_quota_condition(in, perms,
      [&new_bytes, quota_roots](const Inode &in) {
        if (quota_roots)
          quota_roots->emplace_back(const_cast<Inode*>(&in));
        return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
               > in.quota.max_bytes;
      });
}
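
/*
 * "Approaching" means the unreported growth (size - reported_size)
 * exceeds 1/16 of the remaining quota headroom: (space >> 4) < size.
 * Crossing that threshold makes writers flush caps early, so the MDS
 * sees the growth before the quota is actually exhausted.
 */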
bool Client::is_quota_bytes_approaching(Inode *in, std::list<InodeRef>& quota_roots)
{
  assert(in->size >= in->reported_size);
  const uint64_t size = in->size - in->reported_size;

  for (auto& diri : quota_roots) {
    if (diri->quota.max_bytes) {
      if (diri->rstat.rbytes >= diri->quota.max_bytes)
        return true;

      uint64_t space = diri->quota.max_bytes - diri->rstat.rbytes;
      if ((space >> 4) < size)
        return true;
    }
  }
  return false;
}
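
/*
 * check_pool_perm probes OSD access by issuing a stat (read check) and
 * an exclusive create (write check) against the file's first object;
 * -EPERM on either side clears the corresponding POOL_READ/POOL_WRITE
 * bit.  Results are cached per (pool id, namespace) in pool_perms, and
 * concurrent callers wait on waiting_for_pool_perm so only one probe
 * per pool is ever in flight.
 */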
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checks
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for
      // snapshots, the head of the first object may have already been
      // deleted. To avoid creating an orphan object, skip the check for now.
      return 0;
    }

    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
                     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
                     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    client_lock.Unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.Lock();

    bool errored = false;

    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
                     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
                     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because the actual error code might be misleading for
      // the userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
                   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
                   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}

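/*
 * Caller sketch (illustrative): I/O paths ask for the caps they are about
 * to use and propagate the verdict, e.g.
 *
 *   int r = check_pool_perm(in, CEPH_CAP_FILE_RD);
 *   if (r < 0)
 *     return r;  // -EPERM if the OSD caps exclude the pool,
 *                // -EIO if the probe itself errored
 */
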
int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
{
  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
    }
  }
  return -EAGAIN;
}

int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (in->xattrs.count(ACL_EA_ACCESS)) {
      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
      bufferptr acl(access_acl.c_str(), access_acl.length());
      r = posix_acl_access_chmod(acl, mode);
      if (r < 0)
        goto out;
      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
    } else {
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
  return r;
}

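/*
 * Expected call-site sketch (illustrative, not the exact upstream form):
 * the setattr path invokes this after a successful mode change so the
 * cached ACL_EA_ACCESS xattr stays consistent with the new mode bits:
 *
 *   if (mask & CEPH_SETATTR_MODE)
 *     ret = _posix_acl_chmod(in.get(), attr->st_mode, perms);
 */
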
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
                              const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  if (S_ISLNK(*mode))
    return 0;

  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
        goto out;

      if (r > 0) {
        r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
        if (r < 0)
          goto out;
        if (r > 0)
          xattrs[ACL_EA_ACCESS] = acl;
      }

      if (S_ISDIR(*mode))
        xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
        ::encode(xattrs, xattrs_bl);
    } else {
      if (umask_cb)
        *mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}

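/*
 * Consumer sketch (illustrative): the create/mkdir/mknod paths encode the
 * inherited ACL xattrs into xattrs_bl and, when non-empty, attach it to the
 * MetaRequest so the MDS stores the ACL with the new inode, roughly:
 *
 *   bufferlist xattrs_bl;
 *   int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
 *   if (res < 0)
 *     return res;
 *   if (xattrs_bl.length() > 0)
 *     req->set_data(xattrs_bl);
 */
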
void Client::set_filer_flags(int flags)
{
  Mutex::Locker l(client_lock);
  assert(flags == 0 ||
         flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->add_global_op_flags(flags);
}

void Client::clear_filer_flags(int flags)
{
  Mutex::Locker l(client_lock);
  assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->clear_global_op_flag(flags);
}

/**
 * This is included in cap release messages, to cause
 * the MDS to wait until this OSD map epoch. It is necessary
 * in corner cases where we cancel RADOS ops, so that
 * nobody else tries to do IO to the same objects in
 * the same epoch as the cancelled ops.
 */
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  cap_epoch_barrier = e;
}

const char** Client::get_tracked_conf_keys() const
{
  static const char* keys[] = {
    "client_cache_size",
    "client_cache_mid",
    "client_acl_type",
    "client_deleg_timeout",
    "client_deleg_break_on_open",
    NULL
  };
  return keys;
}

void Client::handle_conf_change(const struct md_config_t *conf,
                                const std::set<std::string> &changed)
{
  Mutex::Locker lock(client_lock);

  if (changed.count("client_cache_mid")) {
    lru.lru_set_midpoint(cct->_conf->client_cache_mid);
  }
  if (changed.count("client_acl_type")) {
    acl_type = NO_ACL;
    if (cct->_conf->client_acl_type == "posix_acl")
      acl_type = POSIX_ACL;
  }
}

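/*
 * Wiring sketch (illustrative): these observer hooks only fire once the
 * client registers itself with the config system, which happens during
 * startup and shutdown elsewhere in this file, roughly:
 *
 *   cct->_conf->add_observer(this);     // begin receiving handle_conf_change()
 *   ...
 *   cct->_conf->remove_observer(this);  // stop before destruction
 */
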
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}

void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}

mds_rank_t Client::_get_random_up_mds() const
{
  assert(client_lock.is_locked_by_me());

  std::set<mds_rank_t> up;
  mdsmap->get_up_mds_set(up);

  if (up.empty())
    return MDS_RANK_NONE;
  // pick a uniformly random element by advancing the iterator
  std::set<mds_rank_t>::const_iterator p = up.begin();
  for (int n = rand() % up.size(); n; n--)
    ++p;
  return *p;
}

StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}

StandaloneClient::~StandaloneClient()
{
  delete objecter;
  objecter = nullptr;
}

int StandaloneClient::init()
{
  timer.init();
  objectcacher->start();
  objecter->init();

  client_lock.Lock();
  assert(!initialized);

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    timer.shutdown();
    client_lock.Unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.Unlock();
  _finish_init();

  return 0;
}

void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}