1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
18 #include <sys/types.h>
22 #include <sys/param.h>
25 #include <sys/utsname.h>
28 #include <boost/lexical_cast.hpp>
29 #include <boost/fusion/include/std_pair.hpp>
31 #if defined(__FreeBSD__)
32 #define XATTR_CREATE 0x1
33 #define XATTR_REPLACE 0x2
35 #include <sys/xattr.h>
38 #if defined(__linux__)
39 #include <linux/falloc.h>
42 #include <sys/statvfs.h>
44 #include "common/config.h"
45 #include "common/version.h"
48 #include "messages/MClientSession.h"
49 #include "messages/MClientReconnect.h"
50 #include "messages/MClientRequest.h"
51 #include "messages/MClientRequestForward.h"
52 #include "messages/MClientReply.h"
53 #include "messages/MClientCaps.h"
54 #include "messages/MClientLease.h"
55 #include "messages/MClientSnap.h"
56 #include "messages/MCommandReply.h"
57 #include "messages/MOSDMap.h"
58 #include "messages/MClientQuota.h"
59 #include "messages/MClientCapRelease.h"
60 #include "messages/MMDSMap.h"
61 #include "messages/MFSMap.h"
62 #include "messages/MFSMapUser.h"
64 #include "mon/MonClient.h"
66 #include "mds/flock.h"
67 #include "osd/OSDMap.h"
68 #include "osdc/Filer.h"
70 #include "common/Cond.h"
71 #include "common/Mutex.h"
72 #include "common/perf_counters.h"
73 #include "common/admin_socket.h"
74 #include "common/errno.h"
75 #include "include/str_list.h"
77 #define dout_subsys ceph_subsys_client
79 #include "include/lru.h"
80 #include "include/compat.h"
81 #include "include/stringify.h"
86 #include "Delegation.h"
88 #include "ClientSnapRealm.h"
90 #include "MetaSession.h"
91 #include "MetaRequest.h"
92 #include "ObjecterWriteback.h"
93 #include "posix_acl.h"
95 #include "include/assert.h"
96 #include "include/stat.h"
98 #include "include/cephfs/ceph_statx.h"
100 #if HAVE_GETGROUPLIST
107 #define dout_prefix *_dout << "client." << whoami << " "
109 #define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
111 // FreeBSD fails to define this
115 // Darwin fails to define this
124 #define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
126 void client_flush_set_callback(void *p
, ObjectCacher::ObjectSet
*oset
)
128 Client
*client
= static_cast<Client
*>(p
);
129 client
->flush_set_callback(oset
);
135 Client::CommandHook::CommandHook(Client
*client
) :
140 bool Client::CommandHook::call(std::string command
, cmdmap_t
& cmdmap
,
141 std::string format
, bufferlist
& out
)
143 Formatter
*f
= Formatter::create(format
);
144 f
->open_object_section("result");
145 m_client
->client_lock
.Lock();
146 if (command
== "mds_requests")
147 m_client
->dump_mds_requests(f
);
148 else if (command
== "mds_sessions")
149 m_client
->dump_mds_sessions(f
);
150 else if (command
== "dump_cache")
151 m_client
->dump_cache(f
);
152 else if (command
== "kick_stale_sessions")
153 m_client
->_kick_stale_sessions();
154 else if (command
== "status")
155 m_client
->dump_status(f
);
157 assert(0 == "bad command registered");
158 m_client
->client_lock
.Unlock();
168 dir_result_t::dir_result_t(Inode
*in
, const UserPerm
& perms
)
169 : inode(in
), offset(0), next_offset(2),
170 release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
174 void Client::_reset_faked_inos()
177 free_faked_inos
.clear();
178 free_faked_inos
.insert(start
, (uint32_t)-1 - start
+ 1);
179 last_used_faked_ino
= 0;
180 _use_faked_inos
= sizeof(ino_t
) < 8 || cct
->_conf
->client_use_faked_inos
;
183 void Client::_assign_faked_ino(Inode
*in
)
185 interval_set
<ino_t
>::const_iterator it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
186 if (it
== free_faked_inos
.end() && last_used_faked_ino
> 0) {
187 last_used_faked_ino
= 0;
188 it
= free_faked_inos
.lower_bound(last_used_faked_ino
+ 1);
190 assert(it
!= free_faked_inos
.end());
191 if (last_used_faked_ino
< it
.get_start()) {
192 assert(it
.get_len() > 0);
193 last_used_faked_ino
= it
.get_start();
195 ++last_used_faked_ino
;
196 assert(it
.get_start() + it
.get_len() > last_used_faked_ino
);
198 in
->faked_ino
= last_used_faked_ino
;
199 free_faked_inos
.erase(in
->faked_ino
);
200 faked_ino_map
[in
->faked_ino
] = in
->vino();
203 void Client::_release_faked_ino(Inode
*in
)
205 free_faked_inos
.insert(in
->faked_ino
);
206 faked_ino_map
.erase(in
->faked_ino
);
209 vinodeno_t
Client::_map_faked_ino(ino_t ino
)
214 else if (faked_ino_map
.count(ino
))
215 vino
= faked_ino_map
[ino
];
217 vino
= vinodeno_t(0, CEPH_NOSNAP
);
218 ldout(cct
, 10) << "map_faked_ino " << ino
<< " -> " << vino
<< dendl
;
222 vinodeno_t
Client::map_faked_ino(ino_t ino
)
224 Mutex::Locker
lock(client_lock
);
225 return _map_faked_ino(ino
);
230 Client::Client(Messenger
*m
, MonClient
*mc
, Objecter
*objecter_
)
231 : Dispatcher(m
->cct
),
232 m_command_hook(this),
233 timer(m
->cct
, client_lock
),
234 callback_handle(NULL
),
235 switch_interrupt_cb(NULL
),
237 ino_invalidate_cb(NULL
),
238 dentry_invalidate_cb(NULL
),
240 can_invalidate_dentries(false),
241 async_ino_invalidator(m
->cct
),
242 async_dentry_invalidator(m
->cct
),
243 interrupt_finisher(m
->cct
),
244 remount_finisher(m
->cct
),
245 objecter_finisher(m
->cct
),
247 messenger(m
), monclient(mc
),
249 whoami(mc
->get_global_id()), cap_epoch_barrier(0),
250 last_tid(0), oldest_tid(0), last_flush_tid(1),
252 mounted(false), unmounting(false), blacklisted(false),
253 local_osd(-ENXIO
), local_osd_epoch(0),
254 unsafe_sync_write(0),
255 client_lock("Client::client_lock"),
262 num_flushing_caps
= 0;
264 _dir_vxattrs_name_size
= _vxattrs_calcu_name_size(_dir_vxattrs
);
265 _file_vxattrs_name_size
= _vxattrs_calcu_name_size(_file_vxattrs
);
267 user_id
= cct
->_conf
->client_mount_uid
;
268 group_id
= cct
->_conf
->client_mount_gid
;
271 if (cct
->_conf
->client_acl_type
== "posix_acl")
272 acl_type
= POSIX_ACL
;
274 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
277 free_fd_set
.insert(10, 1<<30);
279 mdsmap
.reset(new MDSMap
);
282 writeback_handler
.reset(new ObjecterWriteback(objecter
, &objecter_finisher
,
284 objectcacher
.reset(new ObjectCacher(cct
, "libcephfs", *writeback_handler
, client_lock
,
285 client_flush_set_callback
, // all commit callback
287 cct
->_conf
->client_oc_size
,
288 cct
->_conf
->client_oc_max_objects
,
289 cct
->_conf
->client_oc_max_dirty
,
290 cct
->_conf
->client_oc_target_dirty
,
291 cct
->_conf
->client_oc_max_dirty_age
,
293 objecter_finisher
.start();
294 filer
.reset(new Filer(objecter
, &objecter_finisher
));
295 objecter
->enable_blacklist_events();
301 assert(!client_lock
.is_locked());
303 // It is necessary to hold client_lock, because any inode destruction
304 // may call into ObjectCacher, which asserts that it's lock (which is
305 // client_lock) is held.
308 client_lock
.Unlock();
311 void Client::tear_down_cache()
314 for (ceph::unordered_map
<int, Fh
*>::iterator it
= fd_map
.begin();
318 ldout(cct
, 1) << "tear_down_cache forcing close of fh " << it
->first
<< " ino " << fh
->inode
->ino
<< dendl
;
323 while (!opened_dirs
.empty()) {
324 dir_result_t
*dirp
= *opened_dirs
.begin();
325 ldout(cct
, 1) << "tear_down_cache forcing close of dir " << dirp
<< " ino " << dirp
->inode
->ino
<< dendl
;
334 assert(lru
.lru_get_size() == 0);
337 assert(inode_map
.size() <= 1 + root_parents
.size());
338 if (root
&& inode_map
.size() == 1 + root_parents
.size()) {
342 while (!root_parents
.empty())
343 root_parents
.erase(root_parents
.begin());
348 assert(inode_map
.empty());
351 inodeno_t
Client::get_root_ino()
353 Mutex::Locker
l(client_lock
);
354 if (use_faked_inos())
355 return root
->faked_ino
;
360 Inode
*Client::get_root()
362 Mutex::Locker
l(client_lock
);
370 void Client::dump_inode(Formatter
*f
, Inode
*in
, set
<Inode
*>& did
, bool disconnected
)
373 in
->make_long_path(path
);
374 ldout(cct
, 1) << "dump_inode: "
375 << (disconnected
? "DISCONNECTED ":"")
376 << "inode " << in
->ino
378 << " ref " << in
->get_num_ref()
382 f
->open_object_section("inode");
383 f
->dump_stream("path") << path
;
385 f
->dump_int("disconnected", 1);
392 ldout(cct
, 1) << " dir " << in
->dir
<< " size " << in
->dir
->dentries
.size() << dendl
;
393 for (ceph::unordered_map
<string
, Dentry
*>::iterator it
= in
->dir
->dentries
.begin();
394 it
!= in
->dir
->dentries
.end();
396 ldout(cct
, 1) << " " << in
->ino
<< " dn " << it
->first
<< " " << it
->second
<< " ref " << it
->second
->ref
<< dendl
;
398 f
->open_object_section("dentry");
402 if (it
->second
->inode
)
403 dump_inode(f
, it
->second
->inode
.get(), did
, false);
408 void Client::dump_cache(Formatter
*f
)
412 ldout(cct
, 1) << "dump_cache" << dendl
;
415 f
->open_array_section("cache");
418 dump_inode(f
, root
, did
, true);
420 // make a second pass to catch anything disconnected
421 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
422 it
!= inode_map
.end();
424 if (did
.count(it
->second
))
426 dump_inode(f
, it
->second
, did
, true);
433 void Client::dump_status(Formatter
*f
)
435 assert(client_lock
.is_locked_by_me());
437 ldout(cct
, 1) << __func__
<< dendl
;
439 const epoch_t osd_epoch
440 = objecter
->with_osdmap(std::mem_fn(&OSDMap::get_epoch
));
443 f
->open_object_section("metadata");
444 for (const auto& kv
: metadata
)
445 f
->dump_string(kv
.first
.c_str(), kv
.second
);
448 f
->dump_int("dentry_count", lru
.lru_get_size());
449 f
->dump_int("dentry_pinned_count", lru
.lru_get_num_pinned());
450 f
->dump_int("id", get_nodeid().v
);
451 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr());
452 f
->dump_object("inst", inst
);
453 f
->dump_stream("inst_str") << inst
;
454 f
->dump_stream("addr_str") << inst
.addr
;
455 f
->dump_int("inode_count", inode_map
.size());
456 f
->dump_int("mds_epoch", mdsmap
->get_epoch());
457 f
->dump_int("osd_epoch", osd_epoch
);
458 f
->dump_int("osd_epoch_barrier", cap_epoch_barrier
);
465 objectcacher
->start();
468 assert(!initialized
);
470 messenger
->add_dispatcher_tail(this);
471 client_lock
.Unlock();
477 void Client::_finish_init()
481 PerfCountersBuilder
plb(cct
, "client", l_c_first
, l_c_last
);
482 plb
.add_time_avg(l_c_reply
, "reply", "Latency of receiving a reply on metadata request");
483 plb
.add_time_avg(l_c_lat
, "lat", "Latency of processing a metadata request");
484 plb
.add_time_avg(l_c_wrlat
, "wrlat", "Latency of a file data write operation");
485 logger
.reset(plb
.create_perf_counters());
486 cct
->get_perfcounters_collection()->add(logger
.get());
488 client_lock
.Unlock();
490 cct
->_conf
->add_observer(this);
492 AdminSocket
* admin_socket
= cct
->get_admin_socket();
493 int ret
= admin_socket
->register_command("mds_requests",
496 "show in-progress mds requests");
498 lderr(cct
) << "error registering admin socket command: "
499 << cpp_strerror(-ret
) << dendl
;
501 ret
= admin_socket
->register_command("mds_sessions",
504 "show mds session state");
506 lderr(cct
) << "error registering admin socket command: "
507 << cpp_strerror(-ret
) << dendl
;
509 ret
= admin_socket
->register_command("dump_cache",
512 "show in-memory metadata cache contents");
514 lderr(cct
) << "error registering admin socket command: "
515 << cpp_strerror(-ret
) << dendl
;
517 ret
= admin_socket
->register_command("kick_stale_sessions",
518 "kick_stale_sessions",
520 "kick sessions that were remote reset");
522 lderr(cct
) << "error registering admin socket command: "
523 << cpp_strerror(-ret
) << dendl
;
525 ret
= admin_socket
->register_command("status",
528 "show overall client status");
530 lderr(cct
) << "error registering admin socket command: "
531 << cpp_strerror(-ret
) << dendl
;
536 client_lock
.Unlock();
539 void Client::shutdown()
541 ldout(cct
, 1) << "shutdown" << dendl
;
543 // If we were not mounted, but were being used for sending
544 // MDS commands, we may have sessions that need closing.
547 client_lock
.Unlock();
549 cct
->_conf
->remove_observer(this);
551 AdminSocket
* admin_socket
= cct
->get_admin_socket();
552 admin_socket
->unregister_command("mds_requests");
553 admin_socket
->unregister_command("mds_sessions");
554 admin_socket
->unregister_command("dump_cache");
555 admin_socket
->unregister_command("kick_stale_sessions");
556 admin_socket
->unregister_command("status");
558 if (ino_invalidate_cb
) {
559 ldout(cct
, 10) << "shutdown stopping cache invalidator finisher" << dendl
;
560 async_ino_invalidator
.wait_for_empty();
561 async_ino_invalidator
.stop();
564 if (dentry_invalidate_cb
) {
565 ldout(cct
, 10) << "shutdown stopping dentry invalidator finisher" << dendl
;
566 async_dentry_invalidator
.wait_for_empty();
567 async_dentry_invalidator
.stop();
570 if (switch_interrupt_cb
) {
571 ldout(cct
, 10) << "shutdown stopping interrupt finisher" << dendl
;
572 interrupt_finisher
.wait_for_empty();
573 interrupt_finisher
.stop();
577 ldout(cct
, 10) << "shutdown stopping remount finisher" << dendl
;
578 remount_finisher
.wait_for_empty();
579 remount_finisher
.stop();
582 objectcacher
->stop(); // outside of client_lock! this does a join.
588 client_lock
.Unlock();
590 objecter_finisher
.wait_for_empty();
591 objecter_finisher
.stop();
594 cct
->get_perfcounters_collection()->remove(logger
.get());
600 // ===================
601 // metadata cache stuff
603 void Client::trim_cache(bool trim_kernel_dcache
)
605 uint64_t max
= cct
->_conf
->client_cache_size
;
606 ldout(cct
, 20) << "trim_cache size " << lru
.lru_get_size() << " max " << max
<< dendl
;
608 while (lru
.lru_get_size() != last
) {
609 last
= lru
.lru_get_size();
611 if (!unmounting
&& lru
.lru_get_size() <= max
) break;
614 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_get_next_expire());
621 if (trim_kernel_dcache
&& lru
.lru_get_size() > max
)
622 _invalidate_kernel_dcache();
625 if (lru
.lru_get_size() == 0 && root
&& root
->get_num_ref() == 0 && inode_map
.size() == 1 + root_parents
.size()) {
626 ldout(cct
, 15) << "trim_cache trimmed root " << root
<< dendl
;
630 while (!root_parents
.empty())
631 root_parents
.erase(root_parents
.begin());
637 void Client::trim_cache_for_reconnect(MetaSession
*s
)
639 mds_rank_t mds
= s
->mds_num
;
640 ldout(cct
, 20) << "trim_cache_for_reconnect mds." << mds
<< dendl
;
643 list
<Dentry
*> skipped
;
644 while (lru
.lru_get_size() > 0) {
645 Dentry
*dn
= static_cast<Dentry
*>(lru
.lru_expire());
649 if ((dn
->inode
&& dn
->inode
->caps
.count(mds
)) ||
650 dn
->dir
->parent_inode
->caps
.count(mds
)) {
654 skipped
.push_back(dn
);
657 for(list
<Dentry
*>::iterator p
= skipped
.begin(); p
!= skipped
.end(); ++p
)
658 lru
.lru_insert_mid(*p
);
660 ldout(cct
, 20) << "trim_cache_for_reconnect mds." << mds
661 << " trimmed " << trimmed
<< " dentries" << dendl
;
663 if (s
->caps
.size() > 0)
664 _invalidate_kernel_dcache();
667 void Client::trim_dentry(Dentry
*dn
)
669 ldout(cct
, 15) << "trim_dentry unlinking dn " << dn
->name
670 << " in dir " << hex
<< dn
->dir
->parent_inode
->ino
673 Inode
*diri
= dn
->dir
->parent_inode
;
674 diri
->dir_release_count
++;
675 clear_dir_complete_and_ordered(diri
, true);
677 unlink(dn
, false, false); // drop dir, drop dentry
681 void Client::update_inode_file_size(Inode
*in
, int issued
, uint64_t size
,
682 uint64_t truncate_seq
, uint64_t truncate_size
)
684 uint64_t prior_size
= in
->size
;
686 if (truncate_seq
> in
->truncate_seq
||
687 (truncate_seq
== in
->truncate_seq
&& size
> in
->size
)) {
688 ldout(cct
, 10) << "size " << in
->size
<< " -> " << size
<< dendl
;
690 in
->reported_size
= size
;
691 if (truncate_seq
!= in
->truncate_seq
) {
692 ldout(cct
, 10) << "truncate_seq " << in
->truncate_seq
<< " -> "
693 << truncate_seq
<< dendl
;
694 in
->truncate_seq
= truncate_seq
;
695 in
->oset
.truncate_seq
= truncate_seq
;
697 // truncate cached file data
698 if (prior_size
> size
) {
699 _invalidate_inode_cache(in
, truncate_size
, prior_size
- truncate_size
);
703 // truncate inline data
704 if (in
->inline_version
< CEPH_INLINE_NONE
) {
705 uint32_t len
= in
->inline_data
.length();
707 in
->inline_data
.splice(size
, len
- size
);
710 if (truncate_seq
>= in
->truncate_seq
&&
711 in
->truncate_size
!= truncate_size
) {
713 ldout(cct
, 10) << "truncate_size " << in
->truncate_size
<< " -> "
714 << truncate_size
<< dendl
;
715 in
->truncate_size
= truncate_size
;
716 in
->oset
.truncate_size
= truncate_size
;
718 ldout(cct
, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl
;
723 void Client::update_inode_file_time(Inode
*in
, int issued
, uint64_t time_warp_seq
,
724 utime_t ctime
, utime_t mtime
, utime_t atime
)
726 ldout(cct
, 10) << __func__
<< " " << *in
<< " " << ccap_string(issued
)
727 << " ctime " << ctime
<< " mtime " << mtime
<< dendl
;
729 if (time_warp_seq
> in
->time_warp_seq
)
730 ldout(cct
, 10) << " mds time_warp_seq " << time_warp_seq
731 << " is higher than local time_warp_seq "
732 << in
->time_warp_seq
<< dendl
;
735 // be careful with size, mtime, atime
736 if (issued
& (CEPH_CAP_FILE_EXCL
|
738 CEPH_CAP_FILE_BUFFER
|
740 CEPH_CAP_XATTR_EXCL
)) {
741 ldout(cct
, 30) << "Yay have enough caps to look at our times" << dendl
;
742 if (ctime
> in
->ctime
)
744 if (time_warp_seq
> in
->time_warp_seq
) {
745 //the mds updated times, so take those!
748 in
->time_warp_seq
= time_warp_seq
;
749 } else if (time_warp_seq
== in
->time_warp_seq
) {
751 if (mtime
> in
->mtime
)
753 if (atime
> in
->atime
)
755 } else if (issued
& CEPH_CAP_FILE_EXCL
) {
756 //ignore mds values as we have a higher seq
759 ldout(cct
, 30) << "Don't have enough caps, just taking mds' time values" << dendl
;
760 if (time_warp_seq
>= in
->time_warp_seq
) {
764 in
->time_warp_seq
= time_warp_seq
;
768 ldout(cct
, 0) << "WARNING: " << *in
<< " mds time_warp_seq "
769 << time_warp_seq
<< " is lower than local time_warp_seq "
775 void Client::_fragmap_remove_non_leaves(Inode
*in
)
777 for (map
<frag_t
,int>::iterator p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
778 if (!in
->dirfragtree
.is_leaf(p
->first
))
779 in
->fragmap
.erase(p
++);
784 void Client::_fragmap_remove_stopped_mds(Inode
*in
, mds_rank_t mds
)
786 for (auto p
= in
->fragmap
.begin(); p
!= in
->fragmap
.end(); )
787 if (p
->second
== mds
)
788 in
->fragmap
.erase(p
++);
793 Inode
* Client::add_update_inode(InodeStat
*st
, utime_t from
,
794 MetaSession
*session
,
795 const UserPerm
& request_perms
)
798 bool was_new
= false;
799 if (inode_map
.count(st
->vino
)) {
800 in
= inode_map
[st
->vino
];
801 ldout(cct
, 12) << "add_update_inode had " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
803 in
= new Inode(this, st
->vino
, &st
->layout
);
804 inode_map
[st
->vino
] = in
;
806 if (use_faked_inos())
807 _assign_faked_ino(in
);
813 } else if (!mounted
) {
814 root_parents
[root_ancestor
] = in
;
819 in
->ino
= st
->vino
.ino
;
820 in
->snapid
= st
->vino
.snapid
;
821 in
->mode
= st
->mode
& S_IFMT
;
826 if (in
->is_symlink())
827 in
->symlink
= st
->symlink
;
829 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
830 bool new_version
= false;
831 if (in
->version
== 0 ||
832 ((st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
) &&
833 (in
->version
& ~1) < st
->version
))
837 in
->caps_issued(&issued
);
838 issued
|= in
->caps_dirty();
839 int new_issued
= ~issued
& (int)st
->cap
.caps
;
841 if ((new_version
|| (new_issued
& CEPH_CAP_AUTH_SHARED
)) &&
842 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
846 in
->btime
= st
->btime
;
849 if ((new_version
|| (new_issued
& CEPH_CAP_LINK_SHARED
)) &&
850 !(issued
& CEPH_CAP_LINK_EXCL
)) {
851 in
->nlink
= st
->nlink
;
854 if (new_version
|| (new_issued
& CEPH_CAP_ANY_RD
)) {
855 update_inode_file_time(in
, issued
, st
->time_warp_seq
,
856 st
->ctime
, st
->mtime
, st
->atime
);
860 (new_issued
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
))) {
861 in
->layout
= st
->layout
;
862 update_inode_file_size(in
, issued
, st
->size
, st
->truncate_seq
, st
->truncate_size
);
866 if (new_version
|| (new_issued
& CEPH_CAP_FILE_SHARED
)) {
867 in
->dirstat
= st
->dirstat
;
869 // dir_layout/rstat/quota are not tracked by capability, update them only if
870 // the inode stat is from auth mds
871 if (new_version
|| (st
->cap
.flags
& CEPH_CAP_FLAG_AUTH
)) {
872 in
->dir_layout
= st
->dir_layout
;
873 ldout(cct
, 20) << " dir hash is " << (int)in
->dir_layout
.dl_dir_hash
<< dendl
;
874 in
->rstat
= st
->rstat
;
875 in
->quota
= st
->quota
;
877 // move me if/when version reflects fragtree changes.
878 if (in
->dirfragtree
!= st
->dirfragtree
) {
879 in
->dirfragtree
= st
->dirfragtree
;
880 _fragmap_remove_non_leaves(in
);
884 if ((in
->xattr_version
== 0 || !(issued
& CEPH_CAP_XATTR_EXCL
)) &&
885 st
->xattrbl
.length() &&
886 st
->xattr_version
> in
->xattr_version
) {
887 bufferlist::iterator p
= st
->xattrbl
.begin();
888 ::decode(in
->xattrs
, p
);
889 in
->xattr_version
= st
->xattr_version
;
892 if (st
->inline_version
> in
->inline_version
) {
893 in
->inline_data
= st
->inline_data
;
894 in
->inline_version
= st
->inline_version
;
897 /* always take a newer change attr */
898 if (st
->change_attr
> in
->change_attr
)
899 in
->change_attr
= st
->change_attr
;
901 if (st
->version
> in
->version
)
902 in
->version
= st
->version
;
905 ldout(cct
, 12) << __func__
<< " adding " << *in
<< " caps " << ccap_string(st
->cap
.caps
) << dendl
;
908 return in
; // as with readdir returning indoes in different snaprealms (no caps!)
910 if (in
->snapid
== CEPH_NOSNAP
) {
911 add_update_cap(in
, session
, st
->cap
.cap_id
, st
->cap
.caps
, st
->cap
.seq
,
912 st
->cap
.mseq
, inodeno_t(st
->cap
.realm
), st
->cap
.flags
,
914 if (in
->auth_cap
&& in
->auth_cap
->session
== session
) {
915 in
->max_size
= st
->max_size
;
916 in
->rstat
= st
->rstat
;
919 // setting I_COMPLETE needs to happen after adding the cap
921 (st
->cap
.caps
& CEPH_CAP_FILE_SHARED
) &&
922 (issued
& CEPH_CAP_FILE_EXCL
) == 0 &&
923 in
->dirstat
.nfiles
== 0 &&
924 in
->dirstat
.nsubdirs
== 0) {
925 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in
<< dendl
;
926 in
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
928 ldout(cct
, 10) << " dir is open on empty dir " << in
->ino
<< " with "
929 << in
->dir
->dentries
.size() << " entries, marking all dentries null" << dendl
;
930 in
->dir
->readdir_cache
.clear();
931 for (const auto& p
: in
->dir
->dentries
) {
932 unlink(p
.second
, true, true); // keep dir, keep dentry
934 if (in
->dir
->dentries
.empty())
939 in
->snap_caps
|= st
->cap
.caps
;
947 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
949 Dentry
*Client::insert_dentry_inode(Dir
*dir
, const string
& dname
, LeaseStat
*dlease
,
950 Inode
*in
, utime_t from
, MetaSession
*session
,
954 if (dir
->dentries
.count(dname
))
955 dn
= dir
->dentries
[dname
];
957 ldout(cct
, 12) << "insert_dentry_inode '" << dname
<< "' vino " << in
->vino()
958 << " in dir " << dir
->parent_inode
->vino() << " dn " << dn
961 if (dn
&& dn
->inode
) {
962 if (dn
->inode
->vino() == in
->vino()) {
964 ldout(cct
, 12) << " had dentry " << dname
965 << " with correct vino " << dn
->inode
->vino()
968 ldout(cct
, 12) << " had dentry " << dname
969 << " with WRONG vino " << dn
->inode
->vino()
971 unlink(dn
, true, true); // keep dir, keep dentry
975 if (!dn
|| !dn
->inode
) {
976 InodeRef
tmp_ref(in
);
978 if (old_dentry
->dir
!= dir
) {
979 Inode
*old_diri
= old_dentry
->dir
->parent_inode
;
980 old_diri
->dir_ordered_count
++;
981 clear_dir_complete_and_ordered(old_diri
, false);
983 unlink(old_dentry
, dir
== old_dentry
->dir
, false); // drop dentry, keep dir open if its the same dir
985 Inode
*diri
= dir
->parent_inode
;
986 diri
->dir_ordered_count
++;
987 clear_dir_complete_and_ordered(diri
, false);
988 dn
= link(dir
, dname
, in
, dn
);
991 update_dentry_lease(dn
, dlease
, from
, session
);
995 void Client::update_dentry_lease(Dentry
*dn
, LeaseStat
*dlease
, utime_t from
, MetaSession
*session
)
998 dttl
+= (float)dlease
->duration_ms
/ 1000.0;
1002 if (dlease
->mask
& CEPH_LOCK_DN
) {
1003 if (dttl
> dn
->lease_ttl
) {
1004 ldout(cct
, 10) << "got dentry lease on " << dn
->name
1005 << " dur " << dlease
->duration_ms
<< "ms ttl " << dttl
<< dendl
;
1006 dn
->lease_ttl
= dttl
;
1007 dn
->lease_mds
= session
->mds_num
;
1008 dn
->lease_seq
= dlease
->seq
;
1009 dn
->lease_gen
= session
->cap_gen
;
1012 dn
->cap_shared_gen
= dn
->dir
->parent_inode
->shared_gen
;
1017 * update MDS location cache for a single inode
1019 void Client::update_dir_dist(Inode
*in
, DirStat
*dst
)
1022 ldout(cct
, 20) << "got dirfrag map for " << in
->ino
<< " frag " << dst
->frag
<< " to mds " << dst
->auth
<< dendl
;
1023 if (dst
->auth
>= 0) {
1024 in
->fragmap
[dst
->frag
] = dst
->auth
;
1026 in
->fragmap
.erase(dst
->frag
);
1028 if (!in
->dirfragtree
.is_leaf(dst
->frag
)) {
1029 in
->dirfragtree
.force_to_leaf(cct
, dst
->frag
);
1030 _fragmap_remove_non_leaves(in
);
1034 in
->dir_replicated
= !dst
->dist
.empty(); // FIXME that's just one frag!
1038 if (!st->dirfrag_dist.empty()) { // FIXME
1039 set<int> dist = st->dirfrag_dist.begin()->second;
1040 if (dist.empty() && !in->dir_contacts.empty())
1041 ldout(cct, 9) << "lost dist spec for " << in->ino
1042 << " " << dist << dendl;
1043 if (!dist.empty() && in->dir_contacts.empty())
1044 ldout(cct, 9) << "got dist spec for " << in->ino
1045 << " " << dist << dendl;
1046 in->dir_contacts = dist;
1051 void Client::clear_dir_complete_and_ordered(Inode
*diri
, bool complete
)
1053 if (diri
->flags
& I_COMPLETE
) {
1055 ldout(cct
, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
1056 diri
->flags
&= ~(I_COMPLETE
| I_DIR_ORDERED
);
1058 if (diri
->flags
& I_DIR_ORDERED
) {
1059 ldout(cct
, 10) << " clearing I_DIR_ORDERED on " << *diri
<< dendl
;
1060 diri
->flags
&= ~I_DIR_ORDERED
;
1064 diri
->dir
->readdir_cache
.clear();
1069 * insert results from readdir or lssnap into the metadata cache.
1071 void Client::insert_readdir_results(MetaRequest
*request
, MetaSession
*session
, Inode
*diri
) {
1073 MClientReply
*reply
= request
->reply
;
1074 ConnectionRef con
= request
->reply
->get_connection();
1075 uint64_t features
= con
->get_features();
1077 dir_result_t
*dirp
= request
->dirp
;
1080 // the extra buffer list is only set for readdir and lssnap replies
1081 bufferlist::iterator p
= reply
->get_extra_bl().begin();
1084 if (request
->head
.op
== CEPH_MDS_OP_LSSNAP
) {
1086 diri
= open_snapdir(diri
);
1089 // only open dir if we're actually adding stuff to it!
1090 Dir
*dir
= diri
->open_dir();
1100 bool end
= ((unsigned)flags
& CEPH_READDIR_FRAG_END
);
1101 bool hash_order
= ((unsigned)flags
& CEPH_READDIR_HASH_ORDER
);
1103 frag_t fg
= (unsigned)request
->head
.args
.readdir
.frag
;
1104 unsigned readdir_offset
= dirp
->next_offset
;
1105 string readdir_start
= dirp
->last_name
;
1106 assert(!readdir_start
.empty() || readdir_offset
== 2);
1108 unsigned last_hash
= 0;
1110 if (!readdir_start
.empty()) {
1111 last_hash
= ceph_frag_value(diri
->hash_dentry_name(readdir_start
));
1112 } else if (flags
& CEPH_READDIR_OFFSET_HASH
) {
1113 /* mds understands offset_hash */
1114 last_hash
= (unsigned)request
->head
.args
.readdir
.offset_hash
;
1118 if (fg
!= dst
.frag
) {
1119 ldout(cct
, 10) << "insert_trace got new frag " << fg
<< " -> " << dst
.frag
<< dendl
;
1123 readdir_start
.clear();
1124 dirp
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
, false);
1128 ldout(cct
, 10) << __func__
<< " " << numdn
<< " readdir items, end=" << end
1129 << ", hash_order=" << hash_order
1130 << ", readdir_start " << readdir_start
1131 << ", last_hash " << last_hash
1132 << ", next_offset " << readdir_offset
<< dendl
;
1134 if (diri
->snapid
!= CEPH_SNAPDIR
&&
1135 fg
.is_leftmost() && readdir_offset
== 2 &&
1136 !(hash_order
&& last_hash
)) {
1137 dirp
->release_count
= diri
->dir_release_count
;
1138 dirp
->ordered_count
= diri
->dir_ordered_count
;
1139 dirp
->start_shared_gen
= diri
->shared_gen
;
1140 dirp
->cache_index
= 0;
1143 dirp
->buffer_frag
= fg
;
1145 _readdir_drop_dirp_buffer(dirp
);
1146 dirp
->buffer
.reserve(numdn
);
1150 for (unsigned i
=0; i
<numdn
; i
++) {
1152 ::decode(dlease
, p
);
1153 InodeStat
ist(p
, features
);
1155 ldout(cct
, 15) << "" << i
<< ": '" << dname
<< "'" << dendl
;
1157 Inode
*in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1160 if (diri
->dir
->dentries
.count(dname
)) {
1161 Dentry
*olddn
= diri
->dir
->dentries
[dname
];
1162 if (olddn
->inode
!= in
) {
1163 // replace incorrect dentry
1164 unlink(olddn
, true, true); // keep dir, dentry
1165 dn
= link(dir
, dname
, in
, olddn
);
1166 assert(dn
== olddn
);
1174 dn
= link(dir
, dname
, in
, NULL
);
1177 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
1179 unsigned hash
= ceph_frag_value(diri
->hash_dentry_name(dname
));
1180 if (hash
!= last_hash
)
1183 dn
->offset
= dir_result_t::make_fpos(hash
, readdir_offset
++, true);
1185 dn
->offset
= dir_result_t::make_fpos(fg
, readdir_offset
++, false);
1187 // add to readdir cache
1188 if (dirp
->release_count
== diri
->dir_release_count
&&
1189 dirp
->ordered_count
== diri
->dir_ordered_count
&&
1190 dirp
->start_shared_gen
== diri
->shared_gen
) {
1191 if (dirp
->cache_index
== dir
->readdir_cache
.size()) {
1193 assert(!dirp
->inode
->is_complete_and_ordered());
1194 dir
->readdir_cache
.reserve(dirp
->cache_index
+ numdn
);
1196 dir
->readdir_cache
.push_back(dn
);
1197 } else if (dirp
->cache_index
< dir
->readdir_cache
.size()) {
1198 if (dirp
->inode
->is_complete_and_ordered())
1199 assert(dir
->readdir_cache
[dirp
->cache_index
] == dn
);
1201 dir
->readdir_cache
[dirp
->cache_index
] = dn
;
1203 assert(0 == "unexpected readdir buffer idx");
1205 dirp
->cache_index
++;
1207 // add to cached result list
1208 dirp
->buffer
.push_back(dir_result_t::dentry(dn
->offset
, dname
, in
));
1209 ldout(cct
, 15) << __func__
<< " " << hex
<< dn
->offset
<< dec
<< ": '" << dname
<< "' -> " << in
->ino
<< dendl
;
1213 dirp
->last_name
= dname
;
1215 dirp
->next_offset
= 2;
1217 dirp
->next_offset
= readdir_offset
;
1219 if (dir
->is_empty())
1226 * insert a trace from a MDS reply into the cache.
1228 Inode
* Client::insert_trace(MetaRequest
*request
, MetaSession
*session
)
1230 MClientReply
*reply
= request
->reply
;
1231 int op
= request
->get_op();
1233 ldout(cct
, 10) << "insert_trace from " << request
->sent_stamp
<< " mds." << session
->mds_num
1234 << " is_target=" << (int)reply
->head
.is_target
1235 << " is_dentry=" << (int)reply
->head
.is_dentry
1238 bufferlist::iterator p
= reply
->get_trace_bl().begin();
1239 if (request
->got_unsafe
) {
1240 ldout(cct
, 10) << "insert_trace -- already got unsafe; ignoring" << dendl
;
1246 ldout(cct
, 10) << "insert_trace -- no trace" << dendl
;
1248 Dentry
*d
= request
->dentry();
1250 Inode
*diri
= d
->dir
->parent_inode
;
1251 diri
->dir_release_count
++;
1252 clear_dir_complete_and_ordered(diri
, true);
1255 if (d
&& reply
->get_result() == 0) {
1256 if (op
== CEPH_MDS_OP_RENAME
) {
1258 Dentry
*od
= request
->old_dentry();
1259 ldout(cct
, 10) << " unlinking rename src dn " << od
<< " for traceless reply" << dendl
;
1261 unlink(od
, true, true); // keep dir, dentry
1262 } else if (op
== CEPH_MDS_OP_RMDIR
||
1263 op
== CEPH_MDS_OP_UNLINK
) {
1265 ldout(cct
, 10) << " unlinking unlink/rmdir dn " << d
<< " for traceless reply" << dendl
;
1266 unlink(d
, true, true); // keep dir, dentry
1272 ConnectionRef con
= request
->reply
->get_connection();
1273 uint64_t features
= con
->get_features();
1274 ldout(cct
, 10) << " features 0x" << hex
<< features
<< dec
<< dendl
;
1277 SnapRealm
*realm
= NULL
;
1278 if (reply
->snapbl
.length())
1279 update_snap_trace(reply
->snapbl
, &realm
);
1281 ldout(cct
, 10) << " hrm "
1282 << " is_target=" << (int)reply
->head
.is_target
1283 << " is_dentry=" << (int)reply
->head
.is_dentry
1292 if (reply
->head
.is_dentry
) {
1293 dirst
.decode(p
, features
);
1296 ::decode(dlease
, p
);
1300 if (reply
->head
.is_target
) {
1301 ist
.decode(p
, features
);
1302 if (cct
->_conf
->client_debug_getattr_caps
) {
1303 unsigned wanted
= 0;
1304 if (op
== CEPH_MDS_OP_GETATTR
|| op
== CEPH_MDS_OP_LOOKUP
)
1305 wanted
= request
->head
.args
.getattr
.mask
;
1306 else if (op
== CEPH_MDS_OP_OPEN
|| op
== CEPH_MDS_OP_CREATE
)
1307 wanted
= request
->head
.args
.open
.mask
;
1309 if ((wanted
& CEPH_CAP_XATTR_SHARED
) &&
1310 !(ist
.xattr_version
> 0 && ist
.xattrbl
.length() > 0))
1311 assert(0 == "MDS reply does not contain xattrs");
1314 in
= add_update_inode(&ist
, request
->sent_stamp
, session
,
1319 if (reply
->head
.is_dentry
) {
1320 diri
= add_update_inode(&dirst
, request
->sent_stamp
, session
,
1322 update_dir_dist(diri
, &dst
); // dir stat info is attached to ..
1325 Dir
*dir
= diri
->open_dir();
1326 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
,
1327 (op
== CEPH_MDS_OP_RENAME
) ? request
->old_dentry() : NULL
);
1330 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1331 dn
= diri
->dir
->dentries
[dname
];
1333 diri
->dir_ordered_count
++;
1334 clear_dir_complete_and_ordered(diri
, false);
1335 unlink(dn
, true, true); // keep dir, dentry
1338 if (dlease
.duration_ms
> 0) {
1340 Dir
*dir
= diri
->open_dir();
1341 dn
= link(dir
, dname
, NULL
, NULL
);
1343 update_dentry_lease(dn
, &dlease
, request
->sent_stamp
, session
);
1346 } else if (op
== CEPH_MDS_OP_LOOKUPSNAP
||
1347 op
== CEPH_MDS_OP_MKSNAP
) {
1348 ldout(cct
, 10) << " faking snap lookup weirdness" << dendl
;
1349 // fake it for snap lookup
1350 vinodeno_t vino
= ist
.vino
;
1351 vino
.snapid
= CEPH_SNAPDIR
;
1352 assert(inode_map
.count(vino
));
1353 diri
= inode_map
[vino
];
1355 string dname
= request
->path
.last_dentry();
1358 dlease
.duration_ms
= 0;
1361 Dir
*dir
= diri
->open_dir();
1362 insert_dentry_inode(dir
, dname
, &dlease
, in
, request
->sent_stamp
, session
);
1364 if (diri
->dir
&& diri
->dir
->dentries
.count(dname
)) {
1365 Dentry
*dn
= diri
->dir
->dentries
[dname
];
1367 unlink(dn
, true, true); // keep dir, dentry
1373 if (op
== CEPH_MDS_OP_READDIR
||
1374 op
== CEPH_MDS_OP_LSSNAP
) {
1375 insert_readdir_results(request
, session
, in
);
1376 } else if (op
== CEPH_MDS_OP_LOOKUPNAME
) {
1377 // hack: return parent inode instead
1381 if (request
->dentry() == NULL
&& in
!= request
->inode()) {
1382 // pin the target inode if its parent dentry is not pinned
1383 request
->set_other_inode(in
);
1388 put_snap_realm(realm
);
1390 request
->target
= in
;
1396 mds_rank_t
Client::choose_target_mds(MetaRequest
*req
, Inode
** phash_diri
)
1398 mds_rank_t mds
= MDS_RANK_NONE
;
1400 bool is_hash
= false;
1406 if (req
->resend_mds
>= 0) {
1407 mds
= req
->resend_mds
;
1408 req
->resend_mds
= -1;
1409 ldout(cct
, 10) << "choose_target_mds resend_mds specified as mds." << mds
<< dendl
;
1413 if (cct
->_conf
->client_use_random_mds
)
1419 ldout(cct
, 20) << "choose_target_mds starting with req->inode " << *in
<< dendl
;
1420 if (req
->path
.depth()) {
1421 hash
= in
->hash_dentry_name(req
->path
[0]);
1422 ldout(cct
, 20) << "choose_target_mds inode dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1423 << " on " << req
->path
[0]
1424 << " => " << hash
<< dendl
;
1429 in
= de
->inode
.get();
1430 ldout(cct
, 20) << "choose_target_mds starting with req->dentry inode " << *in
<< dendl
;
1432 in
= de
->dir
->parent_inode
;
1433 hash
= in
->hash_dentry_name(de
->name
);
1434 ldout(cct
, 20) << "choose_target_mds dentry dir hash is " << (int)in
->dir_layout
.dl_dir_hash
1435 << " on " << de
->name
1436 << " => " << hash
<< dendl
;
1441 if (in
->snapid
!= CEPH_NOSNAP
) {
1442 ldout(cct
, 10) << "choose_target_mds " << *in
<< " is snapped, using nonsnap parent" << dendl
;
1443 while (in
->snapid
!= CEPH_NOSNAP
) {
1444 if (in
->snapid
== CEPH_SNAPDIR
)
1445 in
= in
->snapdir_parent
.get();
1446 else if (!in
->dn_set
.empty())
1447 /* In most cases there will only be one dentry, so getting it
1448 * will be the correct action. If there are multiple hard links,
1449 * I think the MDS should be able to redirect as needed*/
1450 in
= in
->get_first_parent()->dir
->parent_inode
;
1452 ldout(cct
, 10) << "got unlinked inode, can't look at parent" << dendl
;
1459 ldout(cct
, 20) << "choose_target_mds " << *in
<< " is_hash=" << is_hash
1460 << " hash=" << hash
<< dendl
;
1462 if (is_hash
&& S_ISDIR(in
->mode
) && !in
->fragmap
.empty()) {
1463 frag_t fg
= in
->dirfragtree
[hash
];
1464 if (in
->fragmap
.count(fg
)) {
1465 mds
= in
->fragmap
[fg
];
1468 ldout(cct
, 10) << "choose_target_mds from dirfragtree hash" << dendl
;
1473 if (req
->auth_is_best())
1475 if (!cap
&& !in
->caps
.empty())
1476 cap
= in
->caps
.begin()->second
;
1479 mds
= cap
->session
->mds_num
;
1480 ldout(cct
, 10) << "choose_target_mds from caps on inode " << *in
<< dendl
;
1487 mds
= _get_random_up_mds();
1488 ldout(cct
, 10) << "did not get mds through better means, so chose random mds " << mds
<< dendl
;
1492 ldout(cct
, 20) << "mds is " << mds
<< dendl
;
1497 void Client::connect_mds_targets(mds_rank_t mds
)
1499 ldout(cct
, 10) << "connect_mds_targets for mds." << mds
<< dendl
;
1500 assert(mds_sessions
.count(mds
));
1501 const MDSMap::mds_info_t
& info
= mdsmap
->get_mds_info(mds
);
1502 for (set
<mds_rank_t
>::const_iterator q
= info
.export_targets
.begin();
1503 q
!= info
.export_targets
.end();
1505 if (mds_sessions
.count(*q
) == 0 &&
1506 mdsmap
->is_clientreplay_or_active_or_stopping(*q
)) {
1507 ldout(cct
, 10) << "check_mds_sessions opening mds." << mds
1508 << " export target mds." << *q
<< dendl
;
1509 _open_mds_session(*q
);
1514 void Client::dump_mds_sessions(Formatter
*f
)
1516 f
->dump_int("id", get_nodeid().v
);
1517 entity_inst_t
inst(messenger
->get_myname(), messenger
->get_myaddr());
1518 f
->dump_object("inst", inst
);
1519 f
->dump_stream("inst_str") << inst
;
1520 f
->dump_stream("addr_str") << inst
.addr
;
1521 f
->open_array_section("sessions");
1522 for (map
<mds_rank_t
,MetaSession
*>::const_iterator p
= mds_sessions
.begin(); p
!= mds_sessions
.end(); ++p
) {
1523 f
->open_object_section("session");
1528 f
->dump_int("mdsmap_epoch", mdsmap
->get_epoch());
1530 void Client::dump_mds_requests(Formatter
*f
)
1532 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
1533 p
!= mds_requests
.end();
1535 f
->open_object_section("request");
1541 int Client::verify_reply_trace(int r
,
1542 MetaRequest
*request
, MClientReply
*reply
,
1543 InodeRef
*ptarget
, bool *pcreated
,
1544 const UserPerm
& perms
)
1546 // check whether this request actually did the create, and set created flag
1547 bufferlist extra_bl
;
1548 inodeno_t created_ino
;
1549 bool got_created_ino
= false;
1550 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
;
1552 extra_bl
.claim(reply
->get_extra_bl());
1553 if (extra_bl
.length() >= 8) {
1554 // if the extra bufferlist has a buffer, we assume its the created inode
1555 // and that this request to create succeeded in actually creating
1556 // the inode (won the race with other create requests)
1557 ::decode(created_ino
, extra_bl
);
1558 got_created_ino
= true;
1559 ldout(cct
, 10) << "make_request created ino " << created_ino
<< dendl
;
1563 *pcreated
= got_created_ino
;
1565 if (request
->target
) {
1566 *ptarget
= request
->target
;
1567 ldout(cct
, 20) << "make_request target is " << *ptarget
->get() << dendl
;
1569 if (got_created_ino
&& (p
= inode_map
.find(vinodeno_t(created_ino
, CEPH_NOSNAP
))) != inode_map
.end()) {
1570 (*ptarget
) = p
->second
;
1571 ldout(cct
, 20) << "make_request created, target is " << *ptarget
->get() << dendl
;
1573 // we got a traceless reply, and need to look up what we just
1574 // created. for now, do this by name. someday, do this by the
1575 // ino... which we know! FIXME.
1577 Dentry
*d
= request
->dentry();
1580 ldout(cct
, 10) << "make_request got traceless reply, looking up #"
1581 << d
->dir
->parent_inode
->ino
<< "/" << d
->name
1582 << " got_ino " << got_created_ino
1583 << " ino " << created_ino
1585 r
= _do_lookup(d
->dir
->parent_inode
, d
->name
, request
->regetattr_mask
,
1588 // if the dentry is not linked, just do our best. see #5021.
1589 assert(0 == "how did this happen? i want logs!");
1592 Inode
*in
= request
->inode();
1593 ldout(cct
, 10) << "make_request got traceless reply, forcing getattr on #"
1594 << in
->ino
<< dendl
;
1595 r
= _getattr(in
, request
->regetattr_mask
, perms
, true);
1599 // verify ino returned in reply and trace_dist are the same
1600 if (got_created_ino
&&
1601 created_ino
.val
!= target
->ino
.val
) {
1602 ldout(cct
, 5) << "create got ino " << created_ino
<< " but then failed on lookup; EINTR?" << dendl
;
1606 ptarget
->swap(target
);
1618 * Blocking helper to make an MDS request.
1620 * If the ptarget flag is set, behavior changes slightly: the caller
1621 * expects to get a pointer to the inode we are creating or operating
1622 * on. As a result, we will follow up any traceless mutation reply
1623 * with a getattr or lookup to transparently handle a traceless reply
1624 * from the MDS (as when the MDS restarts and the client has to replay
1627 * @param request the MetaRequest to execute
1628 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1629 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1630 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1631 * @param use_mds [optional] prefer a specific mds (-1 for default)
1632 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1634 int Client::make_request(MetaRequest
*request
,
1635 const UserPerm
& perms
,
1636 InodeRef
*ptarget
, bool *pcreated
,
1642 // assign a unique tid
1643 ceph_tid_t tid
= ++last_tid
;
1644 request
->set_tid(tid
);
1647 request
->op_stamp
= ceph_clock_now();
1650 mds_requests
[tid
] = request
->get();
1651 if (oldest_tid
== 0 && request
->get_op() != CEPH_MDS_OP_SETFILELOCK
)
1654 request
->set_caller_perms(perms
);
1656 if (cct
->_conf
->client_inject_fixed_oldest_tid
) {
1657 ldout(cct
, 20) << __func__
<< " injecting fixed oldest_client_tid(1)" << dendl
;
1658 request
->set_oldest_client_tid(1);
1660 request
->set_oldest_client_tid(oldest_tid
);
1665 request
->resend_mds
= use_mds
;
1668 if (request
->aborted())
1672 request
->abort(-EBLACKLISTED
);
1678 request
->caller_cond
= &caller_cond
;
1681 Inode
*hash_diri
= NULL
;
1682 mds_rank_t mds
= choose_target_mds(request
, &hash_diri
);
1683 int mds_state
= (mds
== MDS_RANK_NONE
) ? MDSMap::STATE_NULL
: mdsmap
->get_state(mds
);
1684 if (mds_state
!= MDSMap::STATE_ACTIVE
&& mds_state
!= MDSMap::STATE_STOPPING
) {
1685 if (mds_state
== MDSMap::STATE_NULL
&& mds
>= mdsmap
->get_max_mds()) {
1687 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, remove it from fragmap" << dendl
;
1688 _fragmap_remove_stopped_mds(hash_diri
, mds
);
1690 ldout(cct
, 10) << " target mds." << mds
<< " has stopped, trying a random mds" << dendl
;
1691 request
->resend_mds
= _get_random_up_mds();
1694 ldout(cct
, 10) << " target mds." << mds
<< " not active, waiting for new mdsmap" << dendl
;
1695 wait_on_list(waiting_for_mdsmap
);
1701 MetaSession
*session
= NULL
;
1702 if (!have_open_session(mds
)) {
1703 session
= _get_or_open_mds_session(mds
);
1706 if (session
->state
== MetaSession::STATE_OPENING
) {
1707 ldout(cct
, 10) << "waiting for session to mds." << mds
<< " to open" << dendl
;
1708 wait_on_context_list(session
->waiting_for_open
);
1709 // Abort requests on REJECT from MDS
1710 if (rejected_by_mds
.count(mds
)) {
1711 request
->abort(-EPERM
);
1717 if (!have_open_session(mds
))
1720 session
= mds_sessions
[mds
];
1724 send_request(request
, session
);
1727 ldout(cct
, 20) << "awaiting reply|forward|kick on " << &caller_cond
<< dendl
;
1728 request
->kick
= false;
1729 while (!request
->reply
&& // reply
1730 request
->resend_mds
< 0 && // forward
1732 caller_cond
.Wait(client_lock
);
1733 request
->caller_cond
= NULL
;
1735 // did we get a reply?
1740 if (!request
->reply
) {
1741 assert(request
->aborted());
1742 assert(!request
->got_unsafe
);
1743 r
= request
->get_abort_code();
1744 request
->item
.remove_myself();
1745 unregister_request(request
);
1746 put_request(request
); // ours
1751 MClientReply
*reply
= request
->reply
;
1752 request
->reply
= NULL
;
1753 r
= reply
->get_result();
1755 request
->success
= true;
1757 // kick dispatcher (we've got it!)
1758 assert(request
->dispatch_cond
);
1759 request
->dispatch_cond
->Signal();
1760 ldout(cct
, 20) << "sendrecv kickback on tid " << tid
<< " " << request
->dispatch_cond
<< dendl
;
1761 request
->dispatch_cond
= 0;
1763 if (r
>= 0 && ptarget
)
1764 r
= verify_reply_trace(r
, request
, reply
, ptarget
, pcreated
, perms
);
1767 pdirbl
->claim(reply
->get_extra_bl());
1770 utime_t lat
= ceph_clock_now();
1771 lat
-= request
->sent_stamp
;
1772 ldout(cct
, 20) << "lat " << lat
<< dendl
;
1773 logger
->tinc(l_c_lat
, lat
);
1774 logger
->tinc(l_c_reply
, lat
);
1776 put_request(request
);
1782 void Client::unregister_request(MetaRequest
*req
)
1784 mds_requests
.erase(req
->tid
);
1785 if (req
->tid
== oldest_tid
) {
1786 map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.upper_bound(oldest_tid
);
1788 if (p
== mds_requests
.end()) {
1792 if (p
->second
->get_op() != CEPH_MDS_OP_SETFILELOCK
) {
1793 oldest_tid
= p
->first
;
1802 void Client::put_request(MetaRequest
*request
)
1804 if (request
->_put()) {
1806 if (request
->success
)
1807 op
= request
->get_op();
1809 request
->take_other_inode(&other_in
);
1813 (op
== CEPH_MDS_OP_RMDIR
||
1814 op
== CEPH_MDS_OP_RENAME
||
1815 op
== CEPH_MDS_OP_RMSNAP
)) {
1816 _try_to_trim_inode(other_in
.get(), false);
1821 int Client::encode_inode_release(Inode
*in
, MetaRequest
*req
,
1822 mds_rank_t mds
, int drop
,
1823 int unless
, int force
)
1825 ldout(cct
, 20) << "encode_inode_release enter(in:" << *in
<< ", req:" << req
1826 << " mds:" << mds
<< ", drop:" << drop
<< ", unless:" << unless
1827 << ", have:" << ", force:" << force
<< ")" << dendl
;
1829 if (in
->caps
.count(mds
)) {
1830 Cap
*caps
= in
->caps
[mds
];
1831 drop
&= ~(in
->dirty_caps
| get_caps_used(in
));
1832 if ((drop
& caps
->issued
) &&
1833 !(unless
& caps
->issued
)) {
1834 ldout(cct
, 25) << "Dropping caps. Initial " << ccap_string(caps
->issued
) << dendl
;
1835 caps
->issued
&= ~drop
;
1836 caps
->implemented
&= ~drop
;
1838 ldout(cct
, 25) << "Now have: " << ccap_string(caps
->issued
) << dendl
;
1843 ceph_mds_request_release rel
;
1845 rel
.cap_id
= caps
->cap_id
;
1846 rel
.seq
= caps
->seq
;
1847 rel
.issue_seq
= caps
->issue_seq
;
1848 rel
.mseq
= caps
->mseq
;
1849 rel
.caps
= caps
->implemented
;
1850 rel
.wanted
= caps
->wanted
;
1853 req
->cap_releases
.push_back(MClientRequest::Release(rel
,""));
1856 ldout(cct
, 25) << "encode_inode_release exit(in:" << *in
<< ") released:"
1857 << released
<< dendl
;
void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
			     mds_rank_t mds, int drop, int unless)
{
  // Release caps on the dentry's parent directory and, when that
  // succeeded and we hold a lease on this dentry from `mds`, piggyback
  // the dentry lease release onto the same record.
  ldout(cct, 20) << "encode_dentry_release enter(dn:"
	   << dn << ")" << dendl;
  int parent_released = 0;
  if (dn->dir)
    parent_released = encode_inode_release(dn->dir->parent_inode, req,
					   mds, drop, unless, 1);
  if (parent_released && dn->lease_mds == mds) {
    ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
    // The inode release we just appended is the record to extend.
    MClientRequest::Release& rel = req->cap_releases.back();
    rel.item.dname_len = dn->name.length();
    rel.item.dname_seq = dn->lease_seq;
    rel.dname = dn->name;
  }
  ldout(cct, 25) << "encode_dentry_release exit(dn:"
	   << dn << ")" << dendl;
}
1883 * This requires the MClientRequest *request member to be set.
1884 * It will error out horribly without one.
1885 * Additionally, if you set any *drop member, you'd better have
1886 * set the corresponding dentry!
1888 void Client::encode_cap_releases(MetaRequest
*req
, mds_rank_t mds
)
1890 ldout(cct
, 20) << "encode_cap_releases enter (req: "
1891 << req
<< ", mds: " << mds
<< ")" << dendl
;
1892 if (req
->inode_drop
&& req
->inode())
1893 encode_inode_release(req
->inode(), req
,
1894 mds
, req
->inode_drop
,
1897 if (req
->old_inode_drop
&& req
->old_inode())
1898 encode_inode_release(req
->old_inode(), req
,
1899 mds
, req
->old_inode_drop
,
1900 req
->old_inode_unless
);
1901 if (req
->other_inode_drop
&& req
->other_inode())
1902 encode_inode_release(req
->other_inode(), req
,
1903 mds
, req
->other_inode_drop
,
1904 req
->other_inode_unless
);
1906 if (req
->dentry_drop
&& req
->dentry())
1907 encode_dentry_release(req
->dentry(), req
,
1908 mds
, req
->dentry_drop
,
1909 req
->dentry_unless
);
1911 if (req
->old_dentry_drop
&& req
->old_dentry())
1912 encode_dentry_release(req
->old_dentry(), req
,
1913 mds
, req
->old_dentry_drop
,
1914 req
->old_dentry_unless
);
1915 ldout(cct
, 25) << "encode_cap_releases exit (req: "
1916 << req
<< ", mds " << mds
<<dendl
;
1919 bool Client::have_open_session(mds_rank_t mds
)
1922 mds_sessions
.count(mds
) &&
1923 (mds_sessions
[mds
]->state
== MetaSession::STATE_OPEN
||
1924 mds_sessions
[mds
]->state
== MetaSession::STATE_STALE
);
1927 MetaSession
*Client::_get_mds_session(mds_rank_t mds
, Connection
*con
)
1929 if (mds_sessions
.count(mds
) == 0)
1931 MetaSession
*s
= mds_sessions
[mds
];
1937 MetaSession
*Client::_get_or_open_mds_session(mds_rank_t mds
)
1939 if (mds_sessions
.count(mds
))
1940 return mds_sessions
[mds
];
1941 return _open_mds_session(mds
);
1945 * Populate a map of strings with client-identifying metadata,
1946 * such as the hostname. Call this once at initialization.
1948 void Client::populate_metadata(const std::string
&mount_root
)
1954 metadata
["hostname"] = u
.nodename
;
1955 ldout(cct
, 20) << __func__
<< " read hostname '" << u
.nodename
<< "'" << dendl
;
1957 ldout(cct
, 1) << __func__
<< " failed to read hostname (" << cpp_strerror(r
) << ")" << dendl
;
1960 metadata
["pid"] = stringify(getpid());
1962 // Ceph entity id (the '0' in "client.0")
1963 metadata
["entity_id"] = cct
->_conf
->name
.get_id();
1965 // Our mount position
1966 if (!mount_root
.empty()) {
1967 metadata
["root"] = mount_root
;
1971 metadata
["ceph_version"] = pretty_version_to_str();
1972 metadata
["ceph_sha1"] = git_version_to_str();
1974 // Apply any metadata from the user's configured overrides
1975 std::vector
<std::string
> tokens
;
1976 get_str_vec(cct
->_conf
->client_metadata
, ",", tokens
);
1977 for (const auto &i
: tokens
) {
1978 auto eqpos
= i
.find("=");
1979 // Throw out anything that isn't of the form "<str>=<str>"
1980 if (eqpos
== 0 || eqpos
== std::string::npos
|| eqpos
== i
.size()) {
1981 lderr(cct
) << "Invalid metadata keyval pair: '" << i
<< "'" << dendl
;
1984 metadata
[i
.substr(0, eqpos
)] = i
.substr(eqpos
+ 1);
1989 * Optionally add or override client metadata fields.
1991 void Client::update_metadata(std::string
const &k
, std::string
const &v
)
1993 Mutex::Locker
l(client_lock
);
1994 assert(initialized
);
1996 if (metadata
.count(k
)) {
1997 ldout(cct
, 1) << __func__
<< " warning, overriding metadata field '" << k
1998 << "' from '" << metadata
[k
] << "' to '" << v
<< "'" << dendl
;
2004 MetaSession
*Client::_open_mds_session(mds_rank_t mds
)
2006 ldout(cct
, 10) << "_open_mds_session mds." << mds
<< dendl
;
2007 assert(mds_sessions
.count(mds
) == 0);
2008 MetaSession
*session
= new MetaSession
;
2009 session
->mds_num
= mds
;
2011 session
->inst
= mdsmap
->get_inst(mds
);
2012 session
->con
= messenger
->get_connection(session
->inst
);
2013 session
->state
= MetaSession::STATE_OPENING
;
2014 session
->mds_state
= MDSMap::STATE_NULL
;
2015 mds_sessions
[mds
] = session
;
2017 // Maybe skip sending a request to open if this MDS daemon
2018 // has previously sent us a REJECT.
2019 if (rejected_by_mds
.count(mds
)) {
2020 if (rejected_by_mds
[mds
] == session
->inst
) {
2021 ldout(cct
, 4) << "_open_mds_session mds." << mds
<< " skipping "
2022 "because we were rejected" << dendl
;
2025 ldout(cct
, 4) << "_open_mds_session mds." << mds
<< " old inst "
2026 "rejected us, trying with new inst" << dendl
;
2027 rejected_by_mds
.erase(mds
);
2031 MClientSession
*m
= new MClientSession(CEPH_SESSION_REQUEST_OPEN
);
2032 m
->client_meta
= metadata
;
2033 session
->con
->send_message(m
);
2037 void Client::_close_mds_session(MetaSession
*s
)
2039 ldout(cct
, 2) << "_close_mds_session mds." << s
->mds_num
<< " seq " << s
->seq
<< dendl
;
2040 s
->state
= MetaSession::STATE_CLOSING
;
2041 s
->con
->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
2044 void Client::_closed_mds_session(MetaSession
*s
)
2046 s
->state
= MetaSession::STATE_CLOSED
;
2047 s
->con
->mark_down();
2048 signal_context_list(s
->waiting_for_open
);
2049 mount_cond
.Signal();
2050 remove_session_caps(s
);
2051 kick_requests_closed(s
);
2052 mds_sessions
.erase(s
->mds_num
);
2056 void Client::handle_client_session(MClientSession
*m
)
2058 mds_rank_t from
= mds_rank_t(m
->get_source().num());
2059 ldout(cct
, 10) << "handle_client_session " << *m
<< " from mds." << from
<< dendl
;
2061 MetaSession
*session
= _get_mds_session(from
, m
->get_connection().get());
2063 ldout(cct
, 10) << " discarding session message from sessionless mds " << m
->get_source_inst() << dendl
;
2068 switch (m
->get_op()) {
2069 case CEPH_SESSION_OPEN
:
2070 renew_caps(session
);
2071 session
->state
= MetaSession::STATE_OPEN
;
2073 mount_cond
.Signal();
2075 connect_mds_targets(from
);
2076 signal_context_list(session
->waiting_for_open
);
2079 case CEPH_SESSION_CLOSE
:
2080 _closed_mds_session(session
);
2083 case CEPH_SESSION_RENEWCAPS
:
2084 if (session
->cap_renew_seq
== m
->get_seq()) {
2086 session
->last_cap_renew_request
+ mdsmap
->get_session_timeout();
2087 wake_inode_waiters(session
);
2091 case CEPH_SESSION_STALE
:
2092 // invalidate session caps/leases
2094 session
->cap_ttl
= ceph_clock_now();
2095 session
->cap_ttl
-= 1;
2096 renew_caps(session
);
2099 case CEPH_SESSION_RECALL_STATE
:
2100 trim_caps(session
, m
->get_max_caps());
2103 case CEPH_SESSION_FLUSHMSG
:
2104 session
->con
->send_message(new MClientSession(CEPH_SESSION_FLUSHMSG_ACK
, m
->get_seq()));
2107 case CEPH_SESSION_FORCE_RO
:
2108 force_session_readonly(session
);
2111 case CEPH_SESSION_REJECT
:
2112 rejected_by_mds
[session
->mds_num
] = session
->inst
;
2113 _closed_mds_session(session
);
2124 bool Client::_any_stale_sessions() const
2126 assert(client_lock
.is_locked_by_me());
2128 for (const auto &i
: mds_sessions
) {
2129 if (i
.second
->state
== MetaSession::STATE_STALE
) {
2137 void Client::_kick_stale_sessions()
2139 ldout(cct
, 1) << "kick_stale_sessions" << dendl
;
2141 for (map
<mds_rank_t
,MetaSession
*>::iterator p
= mds_sessions
.begin();
2142 p
!= mds_sessions
.end(); ) {
2143 MetaSession
*s
= p
->second
;
2145 if (s
->state
== MetaSession::STATE_STALE
)
2146 _closed_mds_session(s
);
2150 void Client::send_request(MetaRequest
*request
, MetaSession
*session
,
2151 bool drop_cap_releases
)
2154 mds_rank_t mds
= session
->mds_num
;
2155 ldout(cct
, 10) << "send_request rebuilding request " << request
->get_tid()
2156 << " for mds." << mds
<< dendl
;
2157 MClientRequest
*r
= build_client_request(request
);
2158 if (request
->dentry()) {
2159 r
->set_dentry_wanted();
2161 if (request
->got_unsafe
) {
2162 r
->set_replayed_op();
2163 if (request
->target
)
2164 r
->head
.ino
= request
->target
->ino
;
2166 encode_cap_releases(request
, mds
);
2167 if (drop_cap_releases
) // we haven't send cap reconnect yet, drop cap releases
2168 request
->cap_releases
.clear();
2170 r
->releases
.swap(request
->cap_releases
);
2172 r
->set_mdsmap_epoch(mdsmap
->get_epoch());
2173 if (r
->head
.op
== CEPH_MDS_OP_SETXATTR
) {
2174 objecter
->with_osdmap([r
](const OSDMap
& o
) {
2175 r
->set_osdmap_epoch(o
.get_epoch());
2179 if (request
->mds
== -1) {
2180 request
->sent_stamp
= ceph_clock_now();
2181 ldout(cct
, 20) << "send_request set sent_stamp to " << request
->sent_stamp
<< dendl
;
2185 Inode
*in
= request
->inode();
2186 if (in
&& in
->caps
.count(mds
))
2187 request
->sent_on_mseq
= in
->caps
[mds
]->mseq
;
2189 session
->requests
.push_back(&request
->item
);
2191 ldout(cct
, 10) << "send_request " << *r
<< " to mds." << mds
<< dendl
;
2192 session
->con
->send_message(r
);
2195 MClientRequest
* Client::build_client_request(MetaRequest
*request
)
2197 MClientRequest
*req
= new MClientRequest(request
->get_op());
2198 req
->set_tid(request
->tid
);
2199 req
->set_stamp(request
->op_stamp
);
2200 memcpy(&req
->head
, &request
->head
, sizeof(ceph_mds_request_head
));
2202 // if the filepath's haven't been set, set them!
2203 if (request
->path
.empty()) {
2204 Inode
*in
= request
->inode();
2205 Dentry
*de
= request
->dentry();
2207 in
->make_nosnap_relative_path(request
->path
);
2210 de
->inode
->make_nosnap_relative_path(request
->path
);
2212 de
->dir
->parent_inode
->make_nosnap_relative_path(request
->path
);
2213 request
->path
.push_dentry(de
->name
);
2215 else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2216 << " No path, inode, or appropriately-endowed dentry given!"
2218 } else ldout(cct
, 1) << "Warning -- unable to construct a filepath!"
2219 << " No path, inode, or dentry given!"
2222 req
->set_filepath(request
->get_filepath());
2223 req
->set_filepath2(request
->get_filepath2());
2224 req
->set_data(request
->data
);
2225 req
->set_retry_attempt(request
->retry_attempt
++);
2226 req
->head
.num_fwd
= request
->num_fwd
;
2228 int gid_count
= request
->perms
.get_gids(&_gids
);
2229 req
->set_gid_list(gid_count
, _gids
);
2235 void Client::handle_client_request_forward(MClientRequestForward
*fwd
)
2237 mds_rank_t mds
= mds_rank_t(fwd
->get_source().num());
2238 MetaSession
*session
= _get_mds_session(mds
, fwd
->get_connection().get());
2243 ceph_tid_t tid
= fwd
->get_tid();
2245 if (mds_requests
.count(tid
) == 0) {
2246 ldout(cct
, 10) << "handle_client_request_forward no pending request on tid " << tid
<< dendl
;
2251 MetaRequest
*request
= mds_requests
[tid
];
2254 // reset retry counter
2255 request
->retry_attempt
= 0;
2257 // request not forwarded, or dest mds has no session.
2259 ldout(cct
, 10) << "handle_client_request tid " << tid
2260 << " fwd " << fwd
->get_num_fwd()
2261 << " to mds." << fwd
->get_dest_mds()
2262 << ", resending to " << fwd
->get_dest_mds()
2266 request
->item
.remove_myself();
2267 request
->num_fwd
= fwd
->get_num_fwd();
2268 request
->resend_mds
= fwd
->get_dest_mds();
2269 request
->caller_cond
->Signal();
2274 bool Client::is_dir_operation(MetaRequest
*req
)
2276 int op
= req
->get_op();
2277 if (op
== CEPH_MDS_OP_MKNOD
|| op
== CEPH_MDS_OP_LINK
||
2278 op
== CEPH_MDS_OP_UNLINK
|| op
== CEPH_MDS_OP_RENAME
||
2279 op
== CEPH_MDS_OP_MKDIR
|| op
== CEPH_MDS_OP_RMDIR
||
2280 op
== CEPH_MDS_OP_SYMLINK
|| op
== CEPH_MDS_OP_CREATE
)
2285 void Client::handle_client_reply(MClientReply
*reply
)
2287 mds_rank_t mds_num
= mds_rank_t(reply
->get_source().num());
2288 MetaSession
*session
= _get_mds_session(mds_num
, reply
->get_connection().get());
2294 ceph_tid_t tid
= reply
->get_tid();
2295 bool is_safe
= reply
->is_safe();
2297 if (mds_requests
.count(tid
) == 0) {
2298 lderr(cct
) << "handle_client_reply no pending request on tid " << tid
2299 << " safe is:" << is_safe
<< dendl
;
2303 MetaRequest
*request
= mds_requests
.at(tid
);
2305 ldout(cct
, 20) << "handle_client_reply got a reply. Safe:" << is_safe
2306 << " tid " << tid
<< dendl
;
2308 if (request
->got_unsafe
&& !is_safe
) {
2309 //duplicate response
2310 ldout(cct
, 0) << "got a duplicate reply on tid " << tid
<< " from mds "
2311 << mds_num
<< " safe:" << is_safe
<< dendl
;
2316 if (-ESTALE
== reply
->get_result()) { // see if we can get to proper MDS
2317 ldout(cct
, 20) << "got ESTALE on tid " << request
->tid
2318 << " from mds." << request
->mds
<< dendl
;
2319 request
->send_to_auth
= true;
2320 request
->resend_mds
= choose_target_mds(request
);
2321 Inode
*in
= request
->inode();
2322 if (request
->resend_mds
>= 0 &&
2323 request
->resend_mds
== request
->mds
&&
2325 in
->caps
.count(request
->resend_mds
) == 0 ||
2326 request
->sent_on_mseq
== in
->caps
[request
->resend_mds
]->mseq
)) {
2327 // have to return ESTALE
2329 request
->caller_cond
->Signal();
2333 ldout(cct
, 20) << "have to return ESTALE" << dendl
;
2336 assert(request
->reply
== NULL
);
2337 request
->reply
= reply
;
2338 insert_trace(request
, session
);
2340 // Handle unsafe reply
2342 request
->got_unsafe
= true;
2343 session
->unsafe_requests
.push_back(&request
->unsafe_item
);
2344 if (is_dir_operation(request
)) {
2345 Inode
*dir
= request
->inode();
2347 dir
->unsafe_ops
.push_back(&request
->unsafe_dir_item
);
2349 if (request
->target
) {
2350 InodeRef
&in
= request
->target
;
2351 in
->unsafe_ops
.push_back(&request
->unsafe_target_item
);
2355 // Only signal the caller once (on the first reply):
2356 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2357 if (!is_safe
|| !request
->got_unsafe
) {
2359 request
->dispatch_cond
= &cond
;
2362 ldout(cct
, 20) << "handle_client_reply signalling caller " << (void*)request
->caller_cond
<< dendl
;
2363 request
->caller_cond
->Signal();
2365 // wake for kick back
2366 while (request
->dispatch_cond
) {
2367 ldout(cct
, 20) << "handle_client_reply awaiting kickback on tid " << tid
<< " " << &cond
<< dendl
;
2368 cond
.Wait(client_lock
);
2373 // the filesystem change is committed to disk
2374 // we're done, clean up
2375 if (request
->got_unsafe
) {
2376 request
->unsafe_item
.remove_myself();
2377 request
->unsafe_dir_item
.remove_myself();
2378 request
->unsafe_target_item
.remove_myself();
2379 signal_cond_list(request
->waitfor_safe
);
2381 request
->item
.remove_myself();
2382 unregister_request(request
);
2385 mount_cond
.Signal();
2388 void Client::_handle_full_flag(int64_t pool
)
2390 ldout(cct
, 1) << __func__
<< ": FULL: cancelling outstanding operations "
2391 << "on " << pool
<< dendl
;
2392 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2393 // to do this rather than blocking, because otherwise when we fill up we
2394 // potentially lock caps forever on files with dirty pages, and we need
2395 // to be able to release those caps to the MDS so that it can delete files
2396 // and free up space.
2397 epoch_t cancelled_epoch
= objecter
->op_cancel_writes(-ENOSPC
, pool
);
2399 // For all inodes with layouts in this pool and a pending flush write op
2400 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2401 // from ObjectCacher so that it doesn't re-issue the write in response to
2402 // the ENOSPC error.
2403 // Fortunately since we're cancelling everything in a given pool, we don't
2404 // need to know which ops belong to which ObjectSet, we can just blow all
2405 // the un-flushed cached data away and mark any dirty inodes' async_err
2406 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2407 // affecting this pool, and all the objectsets we're purging were also
2409 for (unordered_map
<vinodeno_t
,Inode
*>::iterator i
= inode_map
.begin();
2410 i
!= inode_map
.end(); ++i
)
2412 Inode
*inode
= i
->second
;
2413 if (inode
->oset
.dirty_or_tx
2414 && (pool
== -1 || inode
->layout
.pool_id
== pool
)) {
2415 ldout(cct
, 4) << __func__
<< ": FULL: inode 0x" << std::hex
<< i
->first
<< std::dec
2416 << " has dirty objects, purging and setting ENOSPC" << dendl
;
2417 objectcacher
->purge_set(&inode
->oset
);
2418 inode
->set_async_err(-ENOSPC
);
2422 if (cancelled_epoch
!= (epoch_t
)-1) {
2423 set_cap_epoch_barrier(cancelled_epoch
);
2427 void Client::handle_osd_map(MOSDMap
*m
)
2429 std::set
<entity_addr_t
> new_blacklists
;
2430 objecter
->consume_blacklist_events(&new_blacklists
);
2432 const auto myaddr
= messenger
->get_myaddr();
2433 if (!blacklisted
&& new_blacklists
.count(myaddr
)) {
2434 auto epoch
= objecter
->with_osdmap([](const OSDMap
&o
){
2435 return o
.get_epoch();
2437 lderr(cct
) << "I was blacklisted at osd epoch " << epoch
<< dendl
;
2439 for (std::map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2440 p
!= mds_requests
.end(); ) {
2441 auto req
= p
->second
;
2443 req
->abort(-EBLACKLISTED
);
2444 if (req
->caller_cond
) {
2446 req
->caller_cond
->Signal();
2450 // Progress aborts on any requests that were on this waitlist. Any
2451 // requests that were on a waiting_for_open session waitlist
2452 // will get kicked during close session below.
2453 signal_cond_list(waiting_for_mdsmap
);
2455 // Force-close all sessions: assume this is not abandoning any state
2456 // on the MDS side because the MDS will have seen the blacklist too.
2457 while(!mds_sessions
.empty()) {
2458 auto i
= mds_sessions
.begin();
2459 auto session
= i
->second
;
2460 _closed_mds_session(session
);
2463 // Since we know all our OSD ops will fail, cancel them all preemtively,
2464 // so that on an unhealthy cluster we can umount promptly even if e.g.
2465 // some PGs were inaccessible.
2466 objecter
->op_cancel_writes(-EBLACKLISTED
);
2468 } else if (blacklisted
) {
2469 // Handle case where we were blacklisted but no longer are
2470 blacklisted
= objecter
->with_osdmap([myaddr
](const OSDMap
&o
){
2471 return o
.is_blacklisted(myaddr
);});
2474 if (objecter
->osdmap_full_flag()) {
2475 _handle_full_flag(-1);
2477 // Accumulate local list of full pools so that I can drop
2478 // the objecter lock before re-entering objecter in
2480 std::vector
<int64_t> full_pools
;
2482 objecter
->with_osdmap([&full_pools
](const OSDMap
&o
) {
2483 for (const auto& kv
: o
.get_pools()) {
2484 if (kv
.second
.has_flag(pg_pool_t::FLAG_FULL
)) {
2485 full_pools
.push_back(kv
.first
);
2490 for (auto p
: full_pools
)
2491 _handle_full_flag(p
);
2493 // Subscribe to subsequent maps to watch for the full flag going
2494 // away. For the global full flag objecter does this for us, but
2495 // it pays no attention to the per-pool full flag so in this branch
2496 // we do it ourselves.
2497 if (!full_pools
.empty()) {
2498 objecter
->maybe_request_map();
2506 // ------------------------
2507 // incoming messages
2510 bool Client::ms_dispatch(Message
*m
)
2512 Mutex::Locker
l(client_lock
);
2514 ldout(cct
, 10) << "inactive, discarding " << *m
<< dendl
;
2519 switch (m
->get_type()) {
2520 // mounting and mds sessions
2521 case CEPH_MSG_MDS_MAP
:
2522 handle_mds_map(static_cast<MMDSMap
*>(m
));
2524 case CEPH_MSG_FS_MAP
:
2525 handle_fs_map(static_cast<MFSMap
*>(m
));
2527 case CEPH_MSG_FS_MAP_USER
:
2528 handle_fs_map_user(static_cast<MFSMapUser
*>(m
));
2530 case CEPH_MSG_CLIENT_SESSION
:
2531 handle_client_session(static_cast<MClientSession
*>(m
));
2534 case CEPH_MSG_OSD_MAP
:
2535 handle_osd_map(static_cast<MOSDMap
*>(m
));
2539 case CEPH_MSG_CLIENT_REQUEST_FORWARD
:
2540 handle_client_request_forward(static_cast<MClientRequestForward
*>(m
));
2542 case CEPH_MSG_CLIENT_REPLY
:
2543 handle_client_reply(static_cast<MClientReply
*>(m
));
2546 case CEPH_MSG_CLIENT_SNAP
:
2547 handle_snap(static_cast<MClientSnap
*>(m
));
2549 case CEPH_MSG_CLIENT_CAPS
:
2550 handle_caps(static_cast<MClientCaps
*>(m
));
2552 case CEPH_MSG_CLIENT_LEASE
:
2553 handle_lease(static_cast<MClientLease
*>(m
));
2555 case MSG_COMMAND_REPLY
:
2556 if (m
->get_source().type() == CEPH_ENTITY_TYPE_MDS
) {
2557 handle_command_reply(static_cast<MCommandReply
*>(m
));
2562 case CEPH_MSG_CLIENT_QUOTA
:
2563 handle_quota(static_cast<MClientQuota
*>(m
));
2572 ldout(cct
, 10) << "unmounting: trim pass, size was " << lru
.lru_get_size()
2573 << "+" << inode_map
.size() << dendl
;
2574 long unsigned size
= lru
.lru_get_size() + inode_map
.size();
2576 if (size
< lru
.lru_get_size() + inode_map
.size()) {
2577 ldout(cct
, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl
;
2578 mount_cond
.Signal();
2580 ldout(cct
, 10) << "unmounting: trim pass, size still " << lru
.lru_get_size()
2581 << "+" << inode_map
.size() << dendl
;
// Message handler for CEPH_MSG_FS_MAP: cache the FSMap carried by the
// message, wake any threads blocked waiting for an fsmap, and ack the
// received epoch to the monitor subscription machinery.
// NOTE(review): this listing is an extraction artifact -- logical source
// lines are split across physical lines and some interior lines (e.g.
// message release/cleanup) appear to be missing; verify against the full
// source before relying on this listing.
2588 void Client::handle_fs_map(MFSMap
*m
)
// Replace our cached map with a copy decoded from the message.
2590 fsmap
.reset(new FSMap(m
->get_fsmap()));
// Wake callers blocked waiting for an fsmap to arrive.
2593 signal_cond_list(waiting_for_fsmap
);
// Tell the monitor client which fsmap epoch we have consumed.
2595 monclient
->sub_got("fsmap", fsmap
->get_epoch());
// Message handler for CEPH_MSG_FS_MAP_USER: cache the lightweight
// user-visible FSMapUser carried by the message, ack its epoch to the
// monitor subscription, then wake fsmap waiters.
// NOTE(review): extraction artifact -- some interior lines (e.g. message
// release) appear to be missing from this listing; verify against the
// full source.
2598 void Client::handle_fs_map_user(MFSMapUser
*m
)
// Allocate a fresh FSMapUser and copy the message payload into it.
2600 fsmap_user
.reset(new FSMapUser
);
2601 *fsmap_user
= m
->get_fsmap();
// Acknowledge the consumed epoch on the "fsmap.user" subscription.
2604 monclient
->sub_got("fsmap.user", fsmap_user
->get_epoch());
2605 signal_cond_list(waiting_for_fsmap
);
2608 void Client::handle_mds_map(MMDSMap
* m
)
2610 if (m
->get_epoch() <= mdsmap
->get_epoch()) {
2611 ldout(cct
, 1) << "handle_mds_map epoch " << m
->get_epoch()
2612 << " is identical to or older than our "
2613 << mdsmap
->get_epoch() << dendl
;
2618 ldout(cct
, 1) << "handle_mds_map epoch " << m
->get_epoch() << dendl
;
2620 std::unique_ptr
<MDSMap
> oldmap(new MDSMap
);
2621 oldmap
.swap(mdsmap
);
2623 mdsmap
->decode(m
->get_encoded());
2625 // Cancel any commands for missing or laggy GIDs
2626 std::list
<ceph_tid_t
> cancel_ops
;
2627 auto &commands
= command_table
.get_commands();
2628 for (const auto &i
: commands
) {
2629 auto &op
= i
.second
;
2630 const mds_gid_t op_mds_gid
= op
.mds_gid
;
2631 if (mdsmap
->is_dne_gid(op_mds_gid
) || mdsmap
->is_laggy_gid(op_mds_gid
)) {
2632 ldout(cct
, 1) << __func__
<< ": cancelling command op " << i
.first
<< dendl
;
2633 cancel_ops
.push_back(i
.first
);
2635 std::ostringstream ss
;
2636 ss
<< "MDS " << op_mds_gid
<< " went away";
2637 *(op
.outs
) = ss
.str();
2639 op
.con
->mark_down();
2641 op
.on_finish
->complete(-ETIMEDOUT
);
2646 for (std::list
<ceph_tid_t
>::iterator i
= cancel_ops
.begin();
2647 i
!= cancel_ops
.end(); ++i
) {
2648 command_table
.erase(*i
);
2652 for (map
<mds_rank_t
,MetaSession
*>::iterator p
= mds_sessions
.begin();
2653 p
!= mds_sessions
.end(); ) {
2654 mds_rank_t mds
= p
->first
;
2655 MetaSession
*session
= p
->second
;
2658 int oldstate
= oldmap
->get_state(mds
);
2659 int newstate
= mdsmap
->get_state(mds
);
2660 if (!mdsmap
->is_up(mds
)) {
2661 session
->con
->mark_down();
2662 } else if (mdsmap
->get_inst(mds
) != session
->inst
) {
2663 session
->con
->mark_down();
2664 session
->inst
= mdsmap
->get_inst(mds
);
2665 // When new MDS starts to take over, notify kernel to trim unused entries
2666 // in its dcache/icache. Hopefully, the kernel will release some unused
2667 // inodes before the new MDS enters reconnect state.
2668 trim_cache_for_reconnect(session
);
2669 } else if (oldstate
== newstate
)
2670 continue; // no change
2672 session
->mds_state
= newstate
;
2673 if (newstate
== MDSMap::STATE_RECONNECT
) {
2674 session
->con
= messenger
->get_connection(session
->inst
);
2675 send_reconnect(session
);
2676 } else if (newstate
>= MDSMap::STATE_ACTIVE
) {
2677 if (oldstate
< MDSMap::STATE_ACTIVE
) {
2678 // kick new requests
2679 kick_requests(session
);
2680 kick_flushing_caps(session
);
2681 signal_context_list(session
->waiting_for_open
);
2682 kick_maxsize_requests(session
);
2683 wake_inode_waiters(session
);
2685 connect_mds_targets(mds
);
2686 } else if (newstate
== MDSMap::STATE_NULL
&&
2687 mds
>= mdsmap
->get_max_mds()) {
2688 _closed_mds_session(session
);
2692 // kick any waiting threads
2693 signal_cond_list(waiting_for_mdsmap
);
2697 monclient
->sub_got("mdsmap", mdsmap
->get_epoch());
2700 void Client::send_reconnect(MetaSession
*session
)
2702 mds_rank_t mds
= session
->mds_num
;
2703 ldout(cct
, 10) << "send_reconnect to mds." << mds
<< dendl
;
2705 // trim unused caps to reduce MDS's cache rejoin time
2706 trim_cache_for_reconnect(session
);
2708 session
->readonly
= false;
2710 if (session
->release
) {
2711 session
->release
->put();
2712 session
->release
= NULL
;
2715 // reset my cap seq number
2717 //connect to the mds' offload targets
2718 connect_mds_targets(mds
);
2719 //make sure unsafe requests get saved
2720 resend_unsafe_requests(session
);
2722 MClientReconnect
*m
= new MClientReconnect
;
2724 // i have an open session.
2725 ceph::unordered_set
<inodeno_t
> did_snaprealm
;
2726 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
= inode_map
.begin();
2727 p
!= inode_map
.end();
2729 Inode
*in
= p
->second
;
2730 if (in
->caps
.count(mds
)) {
2731 ldout(cct
, 10) << " caps on " << p
->first
2732 << " " << ccap_string(in
->caps
[mds
]->issued
)
2733 << " wants " << ccap_string(in
->caps_wanted())
2736 in
->make_long_path(path
);
2737 ldout(cct
, 10) << " path " << path
<< dendl
;
2740 _encode_filelocks(in
, flockbl
);
2742 Cap
*cap
= in
->caps
[mds
];
2743 cap
->seq
= 0; // reset seq.
2744 cap
->issue_seq
= 0; // reset seq.
2745 cap
->mseq
= 0; // reset seq.
2746 cap
->issued
= cap
->implemented
;
2748 snapid_t snap_follows
= 0;
2749 if (!in
->cap_snaps
.empty())
2750 snap_follows
= in
->cap_snaps
.begin()->first
;
2752 m
->add_cap(p
->first
.ino
,
2754 path
.get_ino(), path
.get_path(), // ino
2755 in
->caps_wanted(), // wanted
2756 cap
->issued
, // issued
2761 if (did_snaprealm
.count(in
->snaprealm
->ino
) == 0) {
2762 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
2763 m
->add_snaprealm(in
->snaprealm
->ino
, in
->snaprealm
->seq
, in
->snaprealm
->parent
);
2764 did_snaprealm
.insert(in
->snaprealm
->ino
);
2769 early_kick_flushing_caps(session
);
2771 session
->con
->send_message(m
);
2773 mount_cond
.Signal();
2777 void Client::kick_requests(MetaSession
*session
)
2779 ldout(cct
, 10) << "kick_requests for mds." << session
->mds_num
<< dendl
;
2780 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2781 p
!= mds_requests
.end();
2783 MetaRequest
*req
= p
->second
;
2784 if (req
->got_unsafe
)
2786 if (req
->aborted()) {
2787 if (req
->caller_cond
) {
2789 req
->caller_cond
->Signal();
2793 if (req
->retry_attempt
> 0)
2794 continue; // new requests only
2795 if (req
->mds
== session
->mds_num
) {
2796 send_request(p
->second
, session
);
2801 void Client::resend_unsafe_requests(MetaSession
*session
)
2803 for (xlist
<MetaRequest
*>::iterator iter
= session
->unsafe_requests
.begin();
2806 send_request(*iter
, session
);
2808 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2809 // process completed requests in clientreplay stage.
2810 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2811 p
!= mds_requests
.end();
2813 MetaRequest
*req
= p
->second
;
2814 if (req
->got_unsafe
)
2818 if (req
->retry_attempt
== 0)
2819 continue; // old requests only
2820 if (req
->mds
== session
->mds_num
)
2821 send_request(req
, session
, true);
2825 void Client::wait_unsafe_requests()
2827 list
<MetaRequest
*> last_unsafe_reqs
;
2828 for (map
<mds_rank_t
,MetaSession
*>::iterator p
= mds_sessions
.begin();
2829 p
!= mds_sessions
.end();
2831 MetaSession
*s
= p
->second
;
2832 if (!s
->unsafe_requests
.empty()) {
2833 MetaRequest
*req
= s
->unsafe_requests
.back();
2835 last_unsafe_reqs
.push_back(req
);
2839 for (list
<MetaRequest
*>::iterator p
= last_unsafe_reqs
.begin();
2840 p
!= last_unsafe_reqs
.end();
2842 MetaRequest
*req
= *p
;
2843 if (req
->unsafe_item
.is_on_list())
2844 wait_on_list(req
->waitfor_safe
);
2849 void Client::kick_requests_closed(MetaSession
*session
)
2851 ldout(cct
, 10) << "kick_requests_closed for mds." << session
->mds_num
<< dendl
;
2852 for (map
<ceph_tid_t
, MetaRequest
*>::iterator p
= mds_requests
.begin();
2853 p
!= mds_requests
.end(); ) {
2854 MetaRequest
*req
= p
->second
;
2856 if (req
->mds
== session
->mds_num
) {
2857 if (req
->caller_cond
) {
2859 req
->caller_cond
->Signal();
2861 req
->item
.remove_myself();
2862 if (req
->got_unsafe
) {
2863 lderr(cct
) << "kick_requests_closed removing unsafe request " << req
->get_tid() << dendl
;
2864 req
->unsafe_item
.remove_myself();
2865 req
->unsafe_dir_item
.remove_myself();
2866 req
->unsafe_target_item
.remove_myself();
2867 signal_cond_list(req
->waitfor_safe
);
2868 unregister_request(req
);
2872 assert(session
->requests
.empty());
2873 assert(session
->unsafe_requests
.empty());
// Common bookkeeping for a server-initiated (push) message from an MDS.
// Logs the session's current sequence number; if we are mid-close, it
// re-sends REQUEST_CLOSE tagged with the current session seq so the MDS
// processes the close against the latest state.
// NOTE(review): extraction artifact -- the line that advances s->seq
// before the log statement appears to have been dropped from this
// listing; confirm in the full source.
2883 void Client::got_mds_push(MetaSession
*s
)
2886 ldout(cct
, 10) << " mds." << s
->mds_num
<< " seq now " << s
->seq
<< dendl
;
// If we already asked to close this session, repeat the close request
// carrying the updated seq.
2887 if (s
->state
== MetaSession::STATE_CLOSING
) {
2888 s
->con
->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE
, s
->seq
));
2892 void Client::handle_lease(MClientLease
*m
)
2894 ldout(cct
, 10) << "handle_lease " << *m
<< dendl
;
2896 assert(m
->get_action() == CEPH_MDS_LEASE_REVOKE
);
2898 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
2899 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
2905 got_mds_push(session
);
2907 ceph_seq_t seq
= m
->get_seq();
2910 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
2911 if (inode_map
.count(vino
) == 0) {
2912 ldout(cct
, 10) << " don't have vino " << vino
<< dendl
;
2915 in
= inode_map
[vino
];
2917 if (m
->get_mask() & CEPH_LOCK_DN
) {
2918 if (!in
->dir
|| in
->dir
->dentries
.count(m
->dname
) == 0) {
2919 ldout(cct
, 10) << " don't have dir|dentry " << m
->get_ino() << "/" << m
->dname
<<dendl
;
2922 Dentry
*dn
= in
->dir
->dentries
[m
->dname
];
2923 ldout(cct
, 10) << " revoked DN lease on " << dn
<< dendl
;
2928 m
->get_connection()->send_message(
2930 CEPH_MDS_LEASE_RELEASE
, seq
,
2931 m
->get_mask(), m
->get_ino(), m
->get_first(), m
->get_last(), m
->dname
));
2935 void Client::put_inode(Inode
*in
, int n
)
2937 ldout(cct
, 10) << "put_inode on " << *in
<< dendl
;
2938 int left
= in
->_put(n
);
2941 remove_all_caps(in
);
2943 ldout(cct
, 10) << "put_inode deleting " << *in
<< dendl
;
2944 bool unclean
= objectcacher
->release_set(&in
->oset
);
2946 inode_map
.erase(in
->vino());
2947 if (use_faked_inos())
2948 _release_faked_ino(in
);
2953 while (!root_parents
.empty())
2954 root_parents
.erase(root_parents
.begin());
// Tear down an empty Dir object and drop the pins it held: the parent
// dentry pin (if any) and the inode pin taken when the Dir was opened.
// Invariants asserted: the dir is empty, it is this inode's dir, and the
// inode has at most one parent dentry (directories cannot be hard-linked).
// NOTE(review): extraction artifact -- the lines that actually free the
// Dir and clear in->dir are missing from this listing; confirm in the
// full source.
2961 void Client::close_dir(Dir
*dir
)
2963 Inode
*in
= dir
->parent_inode
;
2964 ldout(cct
, 15) << "close_dir dir " << dir
<< " on " << in
<< dendl
;
2965 assert(dir
->is_empty());
2966 assert(in
->dir
== dir
);
2967 assert(in
->dn_set
.size() < 2); // dirs can't be hard-linked
// Drop the reference the Dir held on its parent dentry, if one exists.
2968 if (!in
->dn_set
.empty())
2969 in
->get_first_parent()->put(); // unpin dentry
2973 put_inode(in
); // unpin inode
2977 * Don't call this with in==NULL, use get_or_create for that
2978 * leave dn set to default NULL unless you're trying to add
2979 * a new inode to a pre-created Dentry
2981 Dentry
* Client::link(Dir
*dir
, const string
& name
, Inode
*in
, Dentry
*dn
)
2984 // create a new Dentry
2990 dir
->dentries
[dn
->name
] = dn
;
2991 lru
.lru_insert_mid(dn
); // mid or top?
2993 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
2994 << " dn " << dn
<< " (new dn)" << dendl
;
2996 ldout(cct
, 15) << "link dir " << dir
->parent_inode
<< " '" << name
<< "' to inode " << in
2997 << " dn " << dn
<< " (old dn)" << dendl
;
3000 if (in
) { // link to inode
3004 dn
->get(); // dir -> dn pin
3006 dn
->get(); // ll_ref -> dn pin
3009 assert(in
->dn_set
.count(dn
) == 0);
3011 // only one parent for directories!
3012 if (in
->is_dir() && !in
->dn_set
.empty()) {
3013 Dentry
*olddn
= in
->get_first_parent();
3014 assert(olddn
->dir
!= dir
|| olddn
->name
!= name
);
3015 Inode
*old_diri
= olddn
->dir
->parent_inode
;
3016 old_diri
->dir_release_count
++;
3017 clear_dir_complete_and_ordered(old_diri
, true);
3018 unlink(olddn
, true, true); // keep dir, dentry
3021 in
->dn_set
.insert(dn
);
3023 ldout(cct
, 20) << "link inode " << in
<< " parents now " << in
->dn_set
<< dendl
;
3029 void Client::unlink(Dentry
*dn
, bool keepdir
, bool keepdentry
)
3033 ldout(cct
, 15) << "unlink dir " << dn
->dir
->parent_inode
<< " '" << dn
->name
<< "' dn " << dn
3034 << " inode " << dn
->inode
<< dendl
;
3036 // unlink from inode
3040 dn
->put(); // dir -> dn pin
3042 dn
->put(); // ll_ref -> dn pin
3045 assert(in
->dn_set
.count(dn
));
3046 in
->dn_set
.erase(dn
);
3047 ldout(cct
, 20) << "unlink inode " << in
<< " parents now " << in
->dn_set
<< dendl
;
3053 ldout(cct
, 15) << "unlink removing '" << dn
->name
<< "' dn " << dn
<< dendl
;
3056 dn
->dir
->dentries
.erase(dn
->name
);
3057 if (dn
->dir
->is_empty() && !keepdir
)
3068 * For asynchronous flushes, check for errors from the IO and
3069 * update the inode if necessary
3071 class C_Client_FlushComplete
: public Context
{
3076 C_Client_FlushComplete(Client
*c
, Inode
*in
) : client(c
), inode(in
) { }
3077 void finish(int r
) override
{
3078 assert(client
->client_lock
.is_locked_by_me());
3080 client_t
const whoami
= client
->whoami
; // For the benefit of ldout prefix
3081 ldout(client
->cct
, 1) << "I/O error from flush on inode " << inode
3082 << " 0x" << std::hex
<< inode
->ino
<< std::dec
3083 << ": " << r
<< "(" << cpp_strerror(r
) << ")" << dendl
;
3084 inode
->set_async_err(r
);
3094 void Client::get_cap_ref(Inode
*in
, int cap
)
3096 if ((cap
& CEPH_CAP_FILE_BUFFER
) &&
3097 in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] == 0) {
3098 ldout(cct
, 5) << "get_cap_ref got first FILE_BUFFER ref on " << *in
<< dendl
;
3101 if ((cap
& CEPH_CAP_FILE_CACHE
) &&
3102 in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
3103 ldout(cct
, 5) << "get_cap_ref got first FILE_CACHE ref on " << *in
<< dendl
;
3106 in
->get_cap_ref(cap
);
3109 void Client::put_cap_ref(Inode
*in
, int cap
)
3111 int last
= in
->put_cap_ref(cap
);
3114 int drop
= last
& ~in
->caps_issued();
3115 if (in
->snapid
== CEPH_NOSNAP
) {
3116 if ((last
& CEPH_CAP_FILE_WR
) &&
3117 !in
->cap_snaps
.empty() &&
3118 in
->cap_snaps
.rbegin()->second
.writing
) {
3119 ldout(cct
, 10) << "put_cap_ref finishing pending cap_snap on " << *in
<< dendl
;
3120 in
->cap_snaps
.rbegin()->second
.writing
= 0;
3121 finish_cap_snap(in
, in
->cap_snaps
.rbegin()->second
, get_caps_used(in
));
3122 signal_cond_list(in
->waitfor_caps
); // wake up blocked sync writers
3124 if (last
& CEPH_CAP_FILE_BUFFER
) {
3125 for (auto &p
: in
->cap_snaps
)
3126 p
.second
.dirty_data
= 0;
3127 signal_cond_list(in
->waitfor_commit
);
3128 ldout(cct
, 5) << "put_cap_ref dropped last FILE_BUFFER ref on " << *in
<< dendl
;
3132 if (last
& CEPH_CAP_FILE_CACHE
) {
3133 ldout(cct
, 5) << "put_cap_ref dropped last FILE_CACHE ref on " << *in
<< dendl
;
3139 put_inode(in
, put_nref
);
3143 int Client::get_caps(Inode
*in
, int need
, int want
, int *phave
, loff_t endoff
)
3145 int r
= check_pool_perm(in
, need
);
3150 int file_wanted
= in
->caps_file_wanted();
3151 if ((file_wanted
& need
) != need
) {
3152 ldout(cct
, 10) << "get_caps " << *in
<< " need " << ccap_string(need
)
3153 << " file_wanted " << ccap_string(file_wanted
) << ", EBADF "
3159 int have
= in
->caps_issued(&implemented
);
3161 bool waitfor_caps
= false;
3162 bool waitfor_commit
= false;
3164 if (have
& need
& CEPH_CAP_FILE_WR
) {
3166 (endoff
>= (loff_t
)in
->max_size
||
3167 endoff
> (loff_t
)(in
->size
<< 1)) &&
3168 endoff
> (loff_t
)in
->wanted_max_size
) {
3169 ldout(cct
, 10) << "wanted_max_size " << in
->wanted_max_size
<< " -> " << endoff
<< dendl
;
3170 in
->wanted_max_size
= endoff
;
3174 if (endoff
>= 0 && endoff
> (loff_t
)in
->max_size
) {
3175 ldout(cct
, 10) << "waiting on max_size, endoff " << endoff
<< " max_size " << in
->max_size
<< " on " << *in
<< dendl
;
3176 waitfor_caps
= true;
3178 if (!in
->cap_snaps
.empty()) {
3179 if (in
->cap_snaps
.rbegin()->second
.writing
) {
3180 ldout(cct
, 10) << "waiting on cap_snap write to complete" << dendl
;
3181 waitfor_caps
= true;
3183 for (auto &p
: in
->cap_snaps
) {
3184 if (p
.second
.dirty_data
) {
3185 waitfor_commit
= true;
3189 if (waitfor_commit
) {
3190 _flush(in
, new C_Client_FlushComplete(this, in
));
3191 ldout(cct
, 10) << "waiting for WRBUFFER to get dropped" << dendl
;
3196 if (!waitfor_caps
&& !waitfor_commit
) {
3197 if ((have
& need
) == need
) {
3198 int revoking
= implemented
& ~have
;
3199 ldout(cct
, 10) << "get_caps " << *in
<< " have " << ccap_string(have
)
3200 << " need " << ccap_string(need
) << " want " << ccap_string(want
)
3201 << " revoking " << ccap_string(revoking
)
3203 if ((revoking
& want
) == 0) {
3204 *phave
= need
| (have
& want
);
3205 in
->get_cap_ref(need
);
3209 ldout(cct
, 10) << "waiting for caps " << *in
<< " need " << ccap_string(need
) << " want " << ccap_string(want
) << dendl
;
3210 waitfor_caps
= true;
3213 if ((need
& CEPH_CAP_FILE_WR
) && in
->auth_cap
&&
3214 in
->auth_cap
->session
->readonly
)
3217 if (in
->flags
& I_CAP_DROPPED
) {
3218 int mds_wanted
= in
->caps_mds_wanted();
3219 if ((mds_wanted
& need
) != need
) {
3220 int ret
= _renew_caps(in
);
3225 if ((mds_wanted
& file_wanted
) ==
3226 (file_wanted
& (CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_WR
))) {
3227 in
->flags
&= ~I_CAP_DROPPED
;
3232 wait_on_list(in
->waitfor_caps
);
3233 else if (waitfor_commit
)
3234 wait_on_list(in
->waitfor_commit
);
// Compute the capabilities this inode is actively using. Beyond what
// Inode::caps_used() reports, FILE_CACHE is considered in use for as
// long as the ObjectCacher still holds any data for this inode's object
// set -- we cannot drop the cache cap while cached data exists.
// NOTE(review): the trailing "return used;" line appears to have been
// dropped by extraction; confirm in the full source.
3238 int Client::get_caps_used(Inode
*in
)
3240 unsigned used
= in
->caps_used();
// If the object cache for this inode isn't empty, we are implicitly
// still using FILE_CACHE even if no explicit ref is held.
3241 if (!(used
& CEPH_CAP_FILE_CACHE
) &&
3242 !objectcacher
->set_is_empty(&in
->oset
))
3243 used
|= CEPH_CAP_FILE_CACHE
;
// Defer cap release for this inode: push hold_caps_until forward by the
// configured client_caps_release_delay and queue the inode on
// delayed_list so a later check_caps() pass revisits it, instead of
// releasing caps to the MDS immediately.
3247 void Client::cap_delay_requeue(Inode
*in
)
3249 ldout(cct
, 10) << "cap_delay_requeue on " << *in
<< dendl
;
// Start the hold window from "now"...
3250 in
->hold_caps_until
= ceph_clock_now();
// ...and extend it by the configured release delay.
3251 in
->hold_caps_until
+= cct
->_conf
->client_caps_release_delay
;
// Queue on the client-wide delayed list via the inode's intrusive item.
3252 delayed_list
.push_back(&in
->delay_cap_item
);
3255 void Client::send_cap(Inode
*in
, MetaSession
*session
, Cap
*cap
,
3256 bool sync
, int used
, int want
, int retain
,
3257 int flush
, ceph_tid_t flush_tid
)
3259 int held
= cap
->issued
| cap
->implemented
;
3260 int revoking
= cap
->implemented
& ~cap
->issued
;
3261 retain
&= ~revoking
;
3262 int dropping
= cap
->issued
& ~retain
;
3263 int op
= CEPH_CAP_OP_UPDATE
;
3265 ldout(cct
, 10) << "send_cap " << *in
3266 << " mds." << session
->mds_num
<< " seq " << cap
->seq
3267 << (sync
? " sync " : " async ")
3268 << " used " << ccap_string(used
)
3269 << " want " << ccap_string(want
)
3270 << " flush " << ccap_string(flush
)
3271 << " retain " << ccap_string(retain
)
3272 << " held "<< ccap_string(held
)
3273 << " revoking " << ccap_string(revoking
)
3274 << " dropping " << ccap_string(dropping
)
3277 if (cct
->_conf
->client_inject_release_failure
&& revoking
) {
3278 const int would_have_issued
= cap
->issued
& retain
;
3279 const int would_have_implemented
= cap
->implemented
& (cap
->issued
| used
);
3281 // - tell the server we think issued is whatever they issued plus whatever we implemented
3282 // - leave what we have implemented in place
3283 ldout(cct
, 20) << __func__
<< " injecting failure to release caps" << dendl
;
3284 cap
->issued
= cap
->issued
| cap
->implemented
;
3286 // Make an exception for revoking xattr caps: we are injecting
3287 // failure to release other caps, but allow xattr because client
3288 // will block on xattr ops if it can't release these to MDS (#9800)
3289 const int xattr_mask
= CEPH_CAP_XATTR_SHARED
| CEPH_CAP_XATTR_EXCL
;
3290 cap
->issued
^= xattr_mask
& revoking
;
3291 cap
->implemented
^= xattr_mask
& revoking
;
3293 ldout(cct
, 20) << __func__
<< " issued " << ccap_string(cap
->issued
) << " vs " << ccap_string(would_have_issued
) << dendl
;
3294 ldout(cct
, 20) << __func__
<< " implemented " << ccap_string(cap
->implemented
) << " vs " << ccap_string(would_have_implemented
) << dendl
;
3297 cap
->issued
&= retain
;
3298 cap
->implemented
&= cap
->issued
| used
;
3301 snapid_t follows
= 0;
3304 follows
= in
->snaprealm
->get_snap_context().seq
;
3306 MClientCaps
*m
= new MClientCaps(op
,
3309 cap
->cap_id
, cap
->seq
,
3315 m
->caller_uid
= in
->cap_dirtier_uid
;
3316 m
->caller_gid
= in
->cap_dirtier_gid
;
3318 m
->head
.issue_seq
= cap
->issue_seq
;
3319 m
->set_tid(flush_tid
);
3321 m
->head
.uid
= in
->uid
;
3322 m
->head
.gid
= in
->gid
;
3323 m
->head
.mode
= in
->mode
;
3325 m
->head
.nlink
= in
->nlink
;
3327 if (flush
& CEPH_CAP_XATTR_EXCL
) {
3328 ::encode(in
->xattrs
, m
->xattrbl
);
3329 m
->head
.xattr_version
= in
->xattr_version
;
3333 m
->max_size
= in
->max_size
;
3334 m
->truncate_seq
= in
->truncate_seq
;
3335 m
->truncate_size
= in
->truncate_size
;
3336 m
->mtime
= in
->mtime
;
3337 m
->atime
= in
->atime
;
3338 m
->ctime
= in
->ctime
;
3339 m
->btime
= in
->btime
;
3340 m
->time_warp_seq
= in
->time_warp_seq
;
3341 m
->change_attr
= in
->change_attr
;
3343 m
->flags
|= CLIENT_CAPS_SYNC
;
3345 if (flush
& CEPH_CAP_FILE_WR
) {
3346 m
->inline_version
= in
->inline_version
;
3347 m
->inline_data
= in
->inline_data
;
3350 in
->reported_size
= in
->size
;
3351 m
->set_snap_follows(follows
);
3353 if (cap
== in
->auth_cap
) {
3354 m
->set_max_size(in
->wanted_max_size
);
3355 in
->requested_max_size
= in
->wanted_max_size
;
3356 ldout(cct
, 15) << "auth cap, setting max_size = " << in
->requested_max_size
<< dendl
;
3359 if (!session
->flushing_caps_tids
.empty())
3360 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
3362 session
->con
->send_message(m
);
// Heuristic: should the client proactively ask the MDS for a larger
// max_size? Visible conditions: skip while a FILE_WR cap flush is in
// flight (the MDS will adjust max_size from the reported size anyway);
// trigger once size has reached max_size; trigger once more than half of
// the last max_size increment has been consumed, i.e.
// (size << 1) >= max_size + reported_size.
// NOTE(review): the return statements for each branch were dropped by
// extraction; confirm the exact true/false results in the full source.
3365 static bool is_max_size_approaching(Inode
*in
)
3367 /* mds will adjust max size according to the reported size */
3368 if (in
->flushing_caps
& CEPH_CAP_FILE_WR
)
3370 if (in
->size
>= in
->max_size
)
3372 /* half of previous max_size increment has been used */
3373 if (in
->max_size
> in
->reported_size
&&
3374 (in
->size
<< 1) >= in
->max_size
+ in
->reported_size
)
3382 * Examine currently used and wanted versus held caps. Release, flush or ack
3383 * revoked caps to the MDS as appropriate.
3385 * @param in the inode to check
3386 * @param flags flags to apply to cap check
3388 void Client::check_caps(Inode
*in
, unsigned flags
)
3390 unsigned wanted
= in
->caps_wanted();
3391 unsigned used
= get_caps_used(in
);
3394 if (in
->is_dir() && (in
->flags
& I_COMPLETE
)) {
3395 // we do this here because we don't want to drop to Fs (and then
3396 // drop the Fs if we do a create!) if that alone makes us send lookups
3397 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3398 wanted
|= CEPH_CAP_FILE_EXCL
;
3402 int issued
= in
->caps_issued(&implemented
);
3403 int revoking
= implemented
& ~issued
;
3405 int retain
= wanted
| used
| CEPH_CAP_PIN
;
3408 retain
|= CEPH_CAP_ANY
;
3410 retain
|= CEPH_CAP_ANY_SHARED
;
3413 ldout(cct
, 10) << "check_caps on " << *in
3414 << " wanted " << ccap_string(wanted
)
3415 << " used " << ccap_string(used
)
3416 << " issued " << ccap_string(issued
)
3417 << " revoking " << ccap_string(revoking
)
3418 << " flags=" << flags
3421 if (in
->snapid
!= CEPH_NOSNAP
)
3422 return; //snap caps last forever, can't write
3424 if (in
->caps
.empty())
3425 return; // guard if at end of func
3427 if ((revoking
& (CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_LAZYIO
)) &&
3428 (used
& CEPH_CAP_FILE_CACHE
) && !(used
& CEPH_CAP_FILE_BUFFER
)) {
3430 used
&= ~CEPH_CAP_FILE_CACHE
;
3433 if (!in
->cap_snaps
.empty())
3436 if (flags
& CHECK_CAPS_NODELAY
)
3437 in
->hold_caps_until
= utime_t();
3439 cap_delay_requeue(in
);
3441 utime_t now
= ceph_clock_now();
3443 map
<mds_rank_t
, Cap
*>::iterator it
= in
->caps
.begin();
3444 while (it
!= in
->caps
.end()) {
3445 mds_rank_t mds
= it
->first
;
3446 Cap
*cap
= it
->second
;
3449 MetaSession
*session
= mds_sessions
[mds
];
3453 if (in
->auth_cap
&& cap
!= in
->auth_cap
)
3454 cap_used
&= ~in
->auth_cap
->issued
;
3456 revoking
= cap
->implemented
& ~cap
->issued
;
3458 ldout(cct
, 10) << " cap mds." << mds
3459 << " issued " << ccap_string(cap
->issued
)
3460 << " implemented " << ccap_string(cap
->implemented
)
3461 << " revoking " << ccap_string(revoking
) << dendl
;
3463 if (in
->wanted_max_size
> in
->max_size
&&
3464 in
->wanted_max_size
> in
->requested_max_size
&&
3465 cap
== in
->auth_cap
)
3468 /* approaching file_max? */
3469 if ((cap
->issued
& CEPH_CAP_FILE_WR
) &&
3470 cap
== in
->auth_cap
&&
3471 is_max_size_approaching(in
)) {
3472 ldout(cct
, 10) << "size " << in
->size
<< " approaching max_size " << in
->max_size
3473 << ", reported " << in
->reported_size
<< dendl
;
3477 /* completed revocation? */
3478 if (revoking
&& (revoking
& cap_used
) == 0) {
3479 ldout(cct
, 10) << "completed revocation of " << ccap_string(cap
->implemented
& ~cap
->issued
) << dendl
;
3483 /* want more caps from mds? */
3484 if (wanted
& ~(cap
->wanted
| cap
->issued
))
3487 if (!revoking
&& unmounting
&& (cap_used
== 0))
3490 if (wanted
== cap
->wanted
&& // mds knows what we want.
3491 ((cap
->issued
& ~retain
) == 0) &&// and we don't have anything we wouldn't like
3492 !in
->dirty_caps
) // and we have no dirty caps
3495 if (now
< in
->hold_caps_until
) {
3496 ldout(cct
, 10) << "delaying cap release" << dendl
;
3501 // re-send old cap/snapcap flushes first.
3502 if (session
->mds_state
>= MDSMap::STATE_RECONNECT
&&
3503 session
->mds_state
< MDSMap::STATE_ACTIVE
&&
3504 session
->early_flushing_caps
.count(in
) == 0) {
3505 ldout(cct
, 20) << " reflushing caps (check_caps) on " << *in
3506 << " to mds." << session
->mds_num
<< dendl
;
3507 session
->early_flushing_caps
.insert(in
);
3508 if (in
->cap_snaps
.size())
3509 flush_snaps(in
, true);
3510 if (in
->flushing_caps
)
3511 flush_caps(in
, session
, flags
& CHECK_CAPS_SYNCHRONOUS
);
3515 ceph_tid_t flush_tid
;
3516 if (in
->auth_cap
== cap
&& in
->dirty_caps
) {
3517 flushing
= mark_caps_flushing(in
, &flush_tid
);
3523 send_cap(in
, session
, cap
, flags
& CHECK_CAPS_SYNCHRONOUS
, cap_used
, wanted
,
3524 retain
, flushing
, flush_tid
);
3529 void Client::queue_cap_snap(Inode
*in
, SnapContext
& old_snapc
)
3531 int used
= get_caps_used(in
);
3532 int dirty
= in
->caps_dirty();
3533 ldout(cct
, 10) << "queue_cap_snap " << *in
<< " snapc " << old_snapc
<< " used " << ccap_string(used
) << dendl
;
3535 if (in
->cap_snaps
.size() &&
3536 in
->cap_snaps
.rbegin()->second
.writing
) {
3537 ldout(cct
, 10) << "queue_cap_snap already have pending cap_snap on " << *in
<< dendl
;
3539 } else if (in
->caps_dirty() ||
3540 (used
& CEPH_CAP_FILE_WR
) ||
3541 (dirty
& CEPH_CAP_ANY_WR
)) {
3542 const auto &capsnapem
= in
->cap_snaps
.emplace(std::piecewise_construct
, std::make_tuple(old_snapc
.seq
), std::make_tuple(in
));
3543 assert(capsnapem
.second
== true); /* element inserted */
3544 CapSnap
&capsnap
= capsnapem
.first
->second
;
3545 capsnap
.context
= old_snapc
;
3546 capsnap
.issued
= in
->caps_issued();
3547 capsnap
.dirty
= in
->caps_dirty();
3549 capsnap
.dirty_data
= (used
& CEPH_CAP_FILE_BUFFER
);
3551 capsnap
.uid
= in
->uid
;
3552 capsnap
.gid
= in
->gid
;
3553 capsnap
.mode
= in
->mode
;
3554 capsnap
.btime
= in
->btime
;
3555 capsnap
.xattrs
= in
->xattrs
;
3556 capsnap
.xattr_version
= in
->xattr_version
;
3558 if (used
& CEPH_CAP_FILE_WR
) {
3559 ldout(cct
, 10) << "queue_cap_snap WR used on " << *in
<< dendl
;
3560 capsnap
.writing
= 1;
3562 finish_cap_snap(in
, capsnap
, used
);
3565 ldout(cct
, 10) << "queue_cap_snap not dirty|writing on " << *in
<< dendl
;
3569 void Client::finish_cap_snap(Inode
*in
, CapSnap
&capsnap
, int used
)
3571 ldout(cct
, 10) << "finish_cap_snap " << *in
<< " capsnap " << (void *)&capsnap
<< " used " << ccap_string(used
) << dendl
;
3572 capsnap
.size
= in
->size
;
3573 capsnap
.mtime
= in
->mtime
;
3574 capsnap
.atime
= in
->atime
;
3575 capsnap
.ctime
= in
->ctime
;
3576 capsnap
.time_warp_seq
= in
->time_warp_seq
;
3577 capsnap
.change_attr
= in
->change_attr
;
3579 capsnap
.dirty
|= in
->caps_dirty();
3581 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
3582 capsnap
.inline_data
= in
->inline_data
;
3583 capsnap
.inline_version
= in
->inline_version
;
3586 if (used
& CEPH_CAP_FILE_BUFFER
) {
3587 ldout(cct
, 10) << "finish_cap_snap " << *in
<< " cap_snap " << &capsnap
<< " used " << used
3588 << " WRBUFFER, delaying" << dendl
;
3590 capsnap
.dirty_data
= 0;
// Completion bookkeeping after a cap-snap's buffered data has been
// flushed: clear the dirty_data flag on the CapSnap recorded under
// snapid 'seq'. Uses cap_snaps.at(), so the entry must exist (throws
// std::out_of_range otherwise) -- callers only invoke this for a
// previously queued cap snap.
3595 void Client::_flushed_cap_snap(Inode
*in
, snapid_t seq
)
3597 ldout(cct
, 10) << "_flushed_cap_snap seq " << seq
<< " on " << *in
<< dendl
;
3598 in
->cap_snaps
.at(seq
).dirty_data
= 0;
3602 void Client::flush_snaps(Inode
*in
, bool all_again
)
3604 ldout(cct
, 10) << "flush_snaps on " << *in
<< " all_again " << all_again
<< dendl
;
3605 assert(in
->cap_snaps
.size());
3608 assert(in
->auth_cap
);
3609 MetaSession
*session
= in
->auth_cap
->session
;
3610 int mseq
= in
->auth_cap
->mseq
;
3612 for (auto &p
: in
->cap_snaps
) {
3613 CapSnap
&capsnap
= p
.second
;
3615 // only flush once per session
3616 if (capsnap
.flush_tid
> 0)
3620 ldout(cct
, 10) << "flush_snaps mds." << session
->mds_num
3621 << " follows " << p
.first
3622 << " size " << capsnap
.size
3623 << " mtime " << capsnap
.mtime
3624 << " dirty_data=" << capsnap
.dirty_data
3625 << " writing=" << capsnap
.writing
3626 << " on " << *in
<< dendl
;
3627 if (capsnap
.dirty_data
|| capsnap
.writing
)
3630 if (capsnap
.flush_tid
== 0) {
3631 capsnap
.flush_tid
= ++last_flush_tid
;
3632 if (!in
->flushing_cap_item
.is_on_list())
3633 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
3634 session
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
3637 MClientCaps
*m
= new MClientCaps(CEPH_CAP_OP_FLUSHSNAP
, in
->ino
, in
->snaprealm
->ino
, 0, mseq
,
3640 m
->caller_uid
= user_id
;
3642 m
->caller_gid
= group_id
;
3644 m
->set_client_tid(capsnap
.flush_tid
);
3645 m
->head
.snap_follows
= p
.first
;
3647 m
->head
.caps
= capsnap
.issued
;
3648 m
->head
.dirty
= capsnap
.dirty
;
3650 m
->head
.uid
= capsnap
.uid
;
3651 m
->head
.gid
= capsnap
.gid
;
3652 m
->head
.mode
= capsnap
.mode
;
3653 m
->btime
= capsnap
.btime
;
3655 m
->size
= capsnap
.size
;
3657 m
->head
.xattr_version
= capsnap
.xattr_version
;
3658 ::encode(capsnap
.xattrs
, m
->xattrbl
);
3660 m
->ctime
= capsnap
.ctime
;
3661 m
->btime
= capsnap
.btime
;
3662 m
->mtime
= capsnap
.mtime
;
3663 m
->atime
= capsnap
.atime
;
3664 m
->time_warp_seq
= capsnap
.time_warp_seq
;
3665 m
->change_attr
= capsnap
.change_attr
;
3667 if (capsnap
.dirty
& CEPH_CAP_FILE_WR
) {
3668 m
->inline_version
= in
->inline_version
;
3669 m
->inline_data
= in
->inline_data
;
3672 assert(!session
->flushing_caps_tids
.empty());
3673 m
->set_oldest_flush_tid(*session
->flushing_caps_tids
.begin());
3675 session
->con
->send_message(m
);
3681 void Client::wait_on_list(list
<Cond
*>& ls
)
3684 ls
.push_back(&cond
);
3685 cond
.Wait(client_lock
);
3689 void Client::signal_cond_list(list
<Cond
*>& ls
)
3691 for (list
<Cond
*>::iterator it
= ls
.begin(); it
!= ls
.end(); ++it
)
3695 void Client::wait_on_context_list(list
<Context
*>& ls
)
3700 ls
.push_back(new C_Cond(&cond
, &done
, &r
));
3702 cond
.Wait(client_lock
);
3705 void Client::signal_context_list(list
<Context
*>& ls
)
3707 while (!ls
.empty()) {
3708 ls
.front()->complete(0);
3713 void Client::wake_inode_waiters(MetaSession
*s
)
3715 xlist
<Cap
*>::iterator iter
= s
->caps
.begin();
3716 while (!iter
.end()){
3717 signal_cond_list((*iter
)->inode
->waitfor_caps
);
3723 // flush dirty data (from objectcache)
3725 class C_Client_CacheInvalidate
: public Context
{
3729 int64_t offset
, length
;
3731 C_Client_CacheInvalidate(Client
*c
, Inode
*in
, int64_t off
, int64_t len
) :
3732 client(c
), offset(off
), length(len
) {
3733 if (client
->use_faked_inos())
3734 ino
= vinodeno_t(in
->faked_ino
, CEPH_NOSNAP
);
3738 void finish(int r
) override
{
3739 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
3740 assert(!client
->client_lock
.is_locked_by_me());
3741 client
->_async_invalidate(ino
, offset
, length
);
3745 void Client::_async_invalidate(vinodeno_t ino
, int64_t off
, int64_t len
)
3749 ldout(cct
, 10) << "_async_invalidate " << ino
<< " " << off
<< "~" << len
<< dendl
;
3750 ino_invalidate_cb(callback_handle
, ino
, off
, len
);
3753 void Client::_schedule_invalidate_callback(Inode
*in
, int64_t off
, int64_t len
) {
3755 if (ino_invalidate_cb
)
3756 // we queue the invalidate, which calls the callback and decrements the ref
3757 async_ino_invalidator
.queue(new C_Client_CacheInvalidate(this, in
, off
, len
));
3760 void Client::_invalidate_inode_cache(Inode
*in
)
3762 ldout(cct
, 10) << "_invalidate_inode_cache " << *in
<< dendl
;
3764 // invalidate our userspace inode cache
3765 if (cct
->_conf
->client_oc
) {
3766 objectcacher
->release_set(&in
->oset
);
3767 if (!objectcacher
->set_is_empty(&in
->oset
))
3768 lderr(cct
) << "failed to invalidate cache for " << *in
<< dendl
;
3771 _schedule_invalidate_callback(in
, 0, 0);
3774 void Client::_invalidate_inode_cache(Inode
*in
, int64_t off
, int64_t len
)
3776 ldout(cct
, 10) << "_invalidate_inode_cache " << *in
<< " " << off
<< "~" << len
<< dendl
;
3778 // invalidate our userspace inode cache
3779 if (cct
->_conf
->client_oc
) {
3780 vector
<ObjectExtent
> ls
;
3781 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, len
, in
->truncate_size
, ls
);
3782 objectcacher
->discard_writeback(&in
->oset
, ls
, nullptr);
3785 _schedule_invalidate_callback(in
, off
, len
);
3788 bool Client::_release(Inode
*in
)
3790 ldout(cct
, 20) << "_release " << *in
<< dendl
;
3791 if (in
->cap_refs
[CEPH_CAP_FILE_CACHE
] == 0) {
3792 _invalidate_inode_cache(in
);
3798 bool Client::_flush(Inode
*in
, Context
*onfinish
)
3800 ldout(cct
, 10) << "_flush " << *in
<< dendl
;
3802 if (!in
->oset
.dirty_or_tx
) {
3803 ldout(cct
, 10) << " nothing to flush" << dendl
;
3804 onfinish
->complete(0);
3808 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
3809 ldout(cct
, 8) << __func__
<< ": FULL, purging for ENOSPC" << dendl
;
3810 objectcacher
->purge_set(&in
->oset
);
3812 onfinish
->complete(-ENOSPC
);
3817 return objectcacher
->flush_set(&in
->oset
, onfinish
);
3820 void Client::_flush_range(Inode
*in
, int64_t offset
, uint64_t size
)
3822 assert(client_lock
.is_locked());
3823 if (!in
->oset
.dirty_or_tx
) {
3824 ldout(cct
, 10) << " nothing to flush" << dendl
;
3828 Mutex
flock("Client::_flush_range flock");
3831 Context
*onflush
= new C_SafeCond(&flock
, &cond
, &safe
);
3832 bool ret
= objectcacher
->file_flush(&in
->oset
, &in
->layout
, in
->snaprealm
->get_snap_context(),
3833 offset
, size
, onflush
);
3836 client_lock
.Unlock();
3845 void Client::flush_set_callback(ObjectCacher::ObjectSet
*oset
)
3847 // Mutex::Locker l(client_lock);
3848 assert(client_lock
.is_locked()); // will be called via dispatch() -> objecter -> ...
3849 Inode
*in
= static_cast<Inode
*>(oset
->parent
);
3854 void Client::_flushed(Inode
*in
)
3856 ldout(cct
, 10) << "_flushed " << *in
<< dendl
;
3858 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
3863 // checks common to add_update_cap, handle_cap_grant
3864 void Client::check_cap_issue(Inode
*in
, Cap
*cap
, unsigned issued
)
3866 unsigned had
= in
->caps_issued();
3868 if ((issued
& CEPH_CAP_FILE_CACHE
) &&
3869 !(had
& CEPH_CAP_FILE_CACHE
))
3872 if ((issued
& CEPH_CAP_FILE_SHARED
) &&
3873 !(had
& CEPH_CAP_FILE_SHARED
)) {
3877 clear_dir_complete_and_ordered(in
, true);
3881 void Client::add_update_cap(Inode
*in
, MetaSession
*mds_session
, uint64_t cap_id
,
3882 unsigned issued
, unsigned seq
, unsigned mseq
, inodeno_t realm
,
3883 int flags
, const UserPerm
& cap_perms
)
3886 mds_rank_t mds
= mds_session
->mds_num
;
3887 if (in
->caps
.count(mds
)) {
3888 cap
= in
->caps
[mds
];
3891 * auth mds of the inode changed. we received the cap export
3892 * message, but still haven't received the cap import message.
3893 * handle_cap_export() updated the new auth MDS' cap.
3895 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
3896 * a message that was send before the cap import message. So
3897 * don't remove caps.
3899 if (ceph_seq_cmp(seq
, cap
->seq
) <= 0) {
3900 assert(cap
== in
->auth_cap
);
3901 assert(cap
->cap_id
== cap_id
);
3904 issued
|= cap
->issued
;
3905 flags
|= CEPH_CAP_FLAG_AUTH
;
3908 mds_session
->num_caps
++;
3909 if (!in
->is_any_caps()) {
3910 assert(in
->snaprealm
== 0);
3911 in
->snaprealm
= get_snap_realm(realm
);
3912 in
->snaprealm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
3913 ldout(cct
, 15) << "add_update_cap first one, opened snaprealm " << in
->snaprealm
<< dendl
;
3915 in
->caps
[mds
] = cap
= new Cap
;
3917 mds_session
->caps
.push_back(&cap
->cap_item
);
3918 cap
->session
= mds_session
;
3920 cap
->gen
= mds_session
->cap_gen
;
3923 check_cap_issue(in
, cap
, issued
);
3925 if (flags
& CEPH_CAP_FLAG_AUTH
) {
3926 if (in
->auth_cap
!= cap
&&
3927 (!in
->auth_cap
|| ceph_seq_cmp(in
->auth_cap
->mseq
, mseq
) < 0)) {
3928 if (in
->auth_cap
&& in
->flushing_cap_item
.is_on_list()) {
3929 ldout(cct
, 10) << "add_update_cap changing auth cap: "
3930 << "add myself to new auth MDS' flushing caps list" << dendl
;
3931 adjust_session_flushing_caps(in
, in
->auth_cap
->session
, mds_session
);
3937 unsigned old_caps
= cap
->issued
;
3938 cap
->cap_id
= cap_id
;
3939 cap
->issued
|= issued
;
3940 cap
->implemented
|= issued
;
3942 cap
->issue_seq
= seq
;
3944 cap
->gen
= mds_session
->cap_gen
;
3945 cap
->latest_perms
= cap_perms
;
3946 ldout(cct
, 10) << "add_update_cap issued " << ccap_string(old_caps
) << " -> " << ccap_string(cap
->issued
)
3947 << " from mds." << mds
3951 if ((issued
& ~old_caps
) && in
->auth_cap
== cap
) {
3952 // non-auth MDS is revoking the newly grant caps ?
3953 for (map
<mds_rank_t
,Cap
*>::iterator it
= in
->caps
.begin(); it
!= in
->caps
.end(); ++it
) {
3954 if (it
->second
== cap
)
3956 if (it
->second
->implemented
& ~it
->second
->issued
& issued
) {
3957 check_caps(in
, CHECK_CAPS_NODELAY
);
3963 if (issued
& ~old_caps
)
3964 signal_cond_list(in
->waitfor_caps
);
3967 void Client::remove_cap(Cap
*cap
, bool queue_release
)
3969 Inode
*in
= cap
->inode
;
3970 MetaSession
*session
= cap
->session
;
3971 mds_rank_t mds
= cap
->session
->mds_num
;
3973 ldout(cct
, 10) << "remove_cap mds." << mds
<< " on " << *in
<< dendl
;
3975 if (queue_release
) {
3976 session
->enqueue_cap_release(
3984 if (in
->auth_cap
== cap
) {
3985 if (in
->flushing_cap_item
.is_on_list()) {
3986 ldout(cct
, 10) << " removing myself from flushing_cap list" << dendl
;
3987 in
->flushing_cap_item
.remove_myself();
3989 in
->auth_cap
= NULL
;
3991 assert(in
->caps
.count(mds
));
3992 in
->caps
.erase(mds
);
3994 cap
->cap_item
.remove_myself();
3998 if (!in
->is_any_caps()) {
3999 ldout(cct
, 15) << "remove_cap last one, closing snaprealm " << in
->snaprealm
<< dendl
;
4000 in
->snaprealm_item
.remove_myself();
4001 put_snap_realm(in
->snaprealm
);
4006 void Client::remove_all_caps(Inode
*in
)
4008 while (!in
->caps
.empty())
4009 remove_cap(in
->caps
.begin()->second
, true);
4012 void Client::remove_session_caps(MetaSession
*s
)
4014 ldout(cct
, 10) << "remove_session_caps mds." << s
->mds_num
<< dendl
;
4016 while (s
->caps
.size()) {
4017 Cap
*cap
= *s
->caps
.begin();
4018 Inode
*in
= cap
->inode
;
4019 bool dirty_caps
= false, cap_snaps
= false;
4020 if (in
->auth_cap
== cap
) {
4021 cap_snaps
= !in
->cap_snaps
.empty();
4022 dirty_caps
= in
->dirty_caps
| in
->flushing_caps
;
4023 in
->wanted_max_size
= 0;
4024 in
->requested_max_size
= 0;
4025 in
->flags
|= I_CAP_DROPPED
;
4027 remove_cap(cap
, false);
4028 signal_cond_list(in
->waitfor_caps
);
4030 InodeRef
tmp_ref(in
);
4031 in
->cap_snaps
.clear();
4034 lderr(cct
) << "remove_session_caps still has dirty|flushing caps on " << *in
<< dendl
;
4035 if (in
->flushing_caps
) {
4036 num_flushing_caps
--;
4037 in
->flushing_cap_tids
.clear();
4039 in
->flushing_caps
= 0;
4040 in
->mark_caps_clean();
4044 s
->flushing_caps_tids
.clear();
4048 int Client::_do_remount(void)
4051 int r
= remount_cb(callback_handle
);
4054 client_t whoami
= get_nodeid();
4057 "failed to remount (to trim kernel dentries): "
4058 "errno = " << e
<< " (" << strerror(e
) << ")" << dendl
;
4061 "failed to remount (to trim kernel dentries): "
4062 "return code = " << r
<< dendl
;
4064 bool should_abort
= cct
->_conf
->get_val
<bool>("client_die_on_failed_remount") ||
4065 cct
->_conf
->get_val
<bool>("client_die_on_failed_dentry_invalidate");
4066 if (should_abort
&& !unmounting
) {
4067 lderr(cct
) << "failed to remount for kernel dentry trimming; quitting!" << dendl
;
4074 class C_Client_Remount
: public Context
{
4078 explicit C_Client_Remount(Client
*c
) : client(c
) {}
4079 void finish(int r
) override
{
4081 client
->_do_remount();
4085 void Client::_invalidate_kernel_dcache()
4089 if (can_invalidate_dentries
) {
4090 if (dentry_invalidate_cb
&& root
->dir
) {
4091 for (ceph::unordered_map
<string
, Dentry
*>::iterator p
= root
->dir
->dentries
.begin();
4092 p
!= root
->dir
->dentries
.end();
4094 if (p
->second
->inode
)
4095 _schedule_invalidate_dentry_callback(p
->second
, false);
4098 } else if (remount_cb
) {
4100 // when remounting a file system, linux kernel trims all unused dentries in the fs
4101 remount_finisher
.queue(new C_Client_Remount(this));
4105 void Client::trim_caps(MetaSession
*s
, uint64_t max
)
4107 mds_rank_t mds
= s
->mds_num
;
4108 size_t caps_size
= s
->caps
.size();
4109 ldout(cct
, 10) << "trim_caps mds." << mds
<< " max " << max
4110 << " caps " << caps_size
<< dendl
;
4112 uint64_t trimmed
= 0;
4113 auto p
= s
->caps
.begin();
4114 std::set
<Dentry
*> to_trim
; /* this avoids caps other than the one we're
4115 * looking at from getting deleted during traversal. */
4116 while ((caps_size
- trimmed
) > max
&& !p
.end()) {
4118 InodeRef
in(cap
->inode
);
4120 // Increment p early because it will be invalidated if cap
4121 // is deleted inside remove_cap
4124 if (in
->caps
.size() > 1 && cap
!= in
->auth_cap
) {
4125 int mine
= cap
->issued
| cap
->implemented
;
4126 int oissued
= in
->auth_cap
? in
->auth_cap
->issued
: 0;
4127 // disposable non-auth cap
4128 if (!(get_caps_used(in
.get()) & ~oissued
& mine
)) {
4129 ldout(cct
, 20) << " removing unused, unneeded non-auth cap on " << *in
<< dendl
;
4130 cap
= (remove_cap(cap
, true), nullptr);
4134 ldout(cct
, 20) << " trying to trim dentries for " << *in
<< dendl
;
4136 set
<Dentry
*>::iterator q
= in
->dn_set
.begin();
4137 while (q
!= in
->dn_set
.end()) {
4139 if (dn
->lru_is_expireable()) {
4140 if (can_invalidate_dentries
&&
4141 dn
->dir
->parent_inode
->ino
== MDS_INO_ROOT
) {
4142 // Only issue one of these per DN for inodes in root: handle
4143 // others more efficiently by calling for root-child DNs at
4144 // the end of this function.
4145 _schedule_invalidate_dentry_callback(dn
, true);
4147 ldout(cct
, 20) << " queueing dentry for trimming: " << dn
->name
<< dendl
;
4150 ldout(cct
, 20) << " not expirable: " << dn
->name
<< dendl
;
4154 if (all
&& in
->ino
!= MDS_INO_ROOT
) {
4155 ldout(cct
, 20) << __func__
<< " counting as trimmed: " << *in
<< dendl
;
4160 ldout(cct
, 20) << " trimming queued dentries: " << dendl
;
4161 for (const auto &dn
: to_trim
) {
4166 caps_size
= s
->caps
.size();
4167 if (caps_size
> max
)
4168 _invalidate_kernel_dcache();
4171 void Client::force_session_readonly(MetaSession
*s
)
4174 for (xlist
<Cap
*>::iterator p
= s
->caps
.begin(); !p
.end(); ++p
) {
4175 Inode
*in
= (*p
)->inode
;
4176 if (in
->caps_wanted() & CEPH_CAP_FILE_WR
)
4177 signal_cond_list(in
->waitfor_caps
);
4181 int Client::mark_caps_flushing(Inode
*in
, ceph_tid_t
* ptid
)
4183 MetaSession
*session
= in
->auth_cap
->session
;
4185 int flushing
= in
->dirty_caps
;
4188 ceph_tid_t flush_tid
= ++last_flush_tid
;
4189 in
->flushing_cap_tids
[flush_tid
] = flushing
;
4191 if (!in
->flushing_caps
) {
4192 ldout(cct
, 10) << "mark_caps_flushing " << ccap_string(flushing
) << " " << *in
<< dendl
;
4193 num_flushing_caps
++;
4195 ldout(cct
, 10) << "mark_caps_flushing (more) " << ccap_string(flushing
) << " " << *in
<< dendl
;
4198 in
->flushing_caps
|= flushing
;
4199 in
->mark_caps_clean();
4201 if (!in
->flushing_cap_item
.is_on_list())
4202 session
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4203 session
->flushing_caps_tids
.insert(flush_tid
);
4209 void Client::adjust_session_flushing_caps(Inode
*in
, MetaSession
*old_s
, MetaSession
*new_s
)
4211 for (auto &p
: in
->cap_snaps
) {
4212 CapSnap
&capsnap
= p
.second
;
4213 if (capsnap
.flush_tid
> 0) {
4214 old_s
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
4215 new_s
->flushing_caps_tids
.insert(capsnap
.flush_tid
);
4218 for (map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4219 it
!= in
->flushing_cap_tids
.end();
4221 old_s
->flushing_caps_tids
.erase(it
->first
);
4222 new_s
->flushing_caps_tids
.insert(it
->first
);
4224 new_s
->flushing_caps
.push_back(&in
->flushing_cap_item
);
4228 * Flush all caps back to the MDS. Because the callers generally wait on the
4229 * result of this function (syncfs and umount cases), we set
4230 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4232 void Client::flush_caps_sync()
4234 ldout(cct
, 10) << __func__
<< dendl
;
4235 xlist
<Inode
*>::iterator p
= delayed_list
.begin();
4237 unsigned flags
= CHECK_CAPS_NODELAY
;
4241 delayed_list
.pop_front();
4242 if (p
.end() && dirty_list
.empty())
4243 flags
|= CHECK_CAPS_SYNCHRONOUS
;
4244 check_caps(in
, flags
);
4248 p
= dirty_list
.begin();
4250 unsigned flags
= CHECK_CAPS_NODELAY
;
4255 flags
|= CHECK_CAPS_SYNCHRONOUS
;
4256 check_caps(in
, flags
);
4260 void Client::flush_caps(Inode
*in
, MetaSession
*session
, bool sync
)
4262 ldout(cct
, 10) << "flush_caps " << in
<< " mds." << session
->mds_num
<< dendl
;
4263 Cap
*cap
= in
->auth_cap
;
4264 assert(cap
->session
== session
);
4266 for (map
<ceph_tid_t
,int>::iterator p
= in
->flushing_cap_tids
.begin();
4267 p
!= in
->flushing_cap_tids
.end();
4269 bool req_sync
= false;
4271 /* If this is a synchronous request, then flush the journal on last one */
4272 if (sync
&& (p
->first
== in
->flushing_cap_tids
.rbegin()->first
))
4275 send_cap(in
, session
, cap
, req_sync
,
4276 (get_caps_used(in
) | in
->caps_dirty()),
4277 in
->caps_wanted(), (cap
->issued
| cap
->implemented
),
4278 p
->second
, p
->first
);
4282 void Client::wait_sync_caps(Inode
*in
, ceph_tid_t want
)
4284 while (in
->flushing_caps
) {
4285 map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4286 assert(it
!= in
->flushing_cap_tids
.end());
4287 if (it
->first
> want
)
4289 ldout(cct
, 10) << "wait_sync_caps on " << *in
<< " flushing "
4290 << ccap_string(it
->second
) << " want " << want
4291 << " last " << it
->first
<< dendl
;
4292 wait_on_list(in
->waitfor_caps
);
4296 void Client::wait_sync_caps(ceph_tid_t want
)
4299 ldout(cct
, 10) << "wait_sync_caps want " << want
<< " (last is " << last_flush_tid
<< ", "
4300 << num_flushing_caps
<< " total flushing)" << dendl
;
4301 for (map
<mds_rank_t
,MetaSession
*>::iterator p
= mds_sessions
.begin();
4302 p
!= mds_sessions
.end();
4304 MetaSession
*s
= p
->second
;
4305 if (s
->flushing_caps_tids
.empty())
4307 ceph_tid_t oldest_tid
= *s
->flushing_caps_tids
.begin();
4308 if (oldest_tid
<= want
) {
4309 ldout(cct
, 10) << " waiting on mds." << p
->first
<< " tid " << oldest_tid
4310 << " (want " << want
<< ")" << dendl
;
4311 sync_cond
.Wait(client_lock
);
4317 void Client::kick_flushing_caps(MetaSession
*session
)
4319 mds_rank_t mds
= session
->mds_num
;
4320 ldout(cct
, 10) << "kick_flushing_caps mds." << mds
<< dendl
;
4322 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4324 if (session
->early_flushing_caps
.count(in
))
4326 ldout(cct
, 20) << " reflushing caps on " << *in
<< " to mds." << mds
<< dendl
;
4327 if (in
->cap_snaps
.size())
4328 flush_snaps(in
, true);
4329 if (in
->flushing_caps
)
4330 flush_caps(in
, session
);
4333 session
->early_flushing_caps
.clear();
4336 void Client::early_kick_flushing_caps(MetaSession
*session
)
4338 session
->early_flushing_caps
.clear();
4340 for (xlist
<Inode
*>::iterator p
= session
->flushing_caps
.begin(); !p
.end(); ++p
) {
4342 assert(in
->auth_cap
);
4344 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4345 // stage. This guarantees that MDS processes the cap flush message before issuing
4346 // the flushing caps to other client.
4347 if ((in
->flushing_caps
& in
->auth_cap
->issued
) == in
->flushing_caps
)
4350 ldout(cct
, 20) << " reflushing caps (early_kick) on " << *in
4351 << " to mds." << session
->mds_num
<< dendl
;
4353 session
->early_flushing_caps
.insert(in
);
4355 if (in
->cap_snaps
.size())
4356 flush_snaps(in
, true);
4357 if (in
->flushing_caps
)
4358 flush_caps(in
, session
);
4363 void Client::kick_maxsize_requests(MetaSession
*session
)
4365 xlist
<Cap
*>::iterator iter
= session
->caps
.begin();
4366 while (!iter
.end()){
4367 (*iter
)->inode
->requested_max_size
= 0;
4368 (*iter
)->inode
->wanted_max_size
= 0;
4369 signal_cond_list((*iter
)->inode
->waitfor_caps
);
4374 void SnapRealm::build_snap_context()
4376 set
<snapid_t
> snaps
;
4377 snapid_t max_seq
= seq
;
4379 // start with prior_parents?
4380 for (unsigned i
=0; i
<prior_parent_snaps
.size(); i
++)
4381 snaps
.insert(prior_parent_snaps
[i
]);
4383 // current parent's snaps
4385 const SnapContext
& psnapc
= pparent
->get_snap_context();
4386 for (unsigned i
=0; i
<psnapc
.snaps
.size(); i
++)
4387 if (psnapc
.snaps
[i
] >= parent_since
)
4388 snaps
.insert(psnapc
.snaps
[i
]);
4389 if (psnapc
.seq
> max_seq
)
4390 max_seq
= psnapc
.seq
;
4394 for (unsigned i
=0; i
<my_snaps
.size(); i
++)
4395 snaps
.insert(my_snaps
[i
]);
4398 cached_snap_context
.seq
= max_seq
;
4399 cached_snap_context
.snaps
.resize(0);
4400 cached_snap_context
.snaps
.reserve(snaps
.size());
4401 for (set
<snapid_t
>::reverse_iterator p
= snaps
.rbegin(); p
!= snaps
.rend(); ++p
)
4402 cached_snap_context
.snaps
.push_back(*p
);
4405 void Client::invalidate_snaprealm_and_children(SnapRealm
*realm
)
4410 while (!q
.empty()) {
4414 ldout(cct
, 10) << "invalidate_snaprealm_and_children " << *realm
<< dendl
;
4415 realm
->invalidate_cache();
4417 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
4418 p
!= realm
->pchildren
.end();
4424 SnapRealm
*Client::get_snap_realm(inodeno_t r
)
4426 SnapRealm
*realm
= snap_realms
[r
];
4428 snap_realms
[r
] = realm
= new SnapRealm(r
);
4429 ldout(cct
, 20) << "get_snap_realm " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
4434 SnapRealm
*Client::get_snap_realm_maybe(inodeno_t r
)
4436 if (snap_realms
.count(r
) == 0) {
4437 ldout(cct
, 20) << "get_snap_realm_maybe " << r
<< " fail" << dendl
;
4440 SnapRealm
*realm
= snap_realms
[r
];
4441 ldout(cct
, 20) << "get_snap_realm_maybe " << r
<< " " << realm
<< " " << realm
->nref
<< " -> " << (realm
->nref
+ 1) << dendl
;
4446 void Client::put_snap_realm(SnapRealm
*realm
)
4448 ldout(cct
, 20) << "put_snap_realm " << realm
->ino
<< " " << realm
4449 << " " << realm
->nref
<< " -> " << (realm
->nref
- 1) << dendl
;
4450 if (--realm
->nref
== 0) {
4451 snap_realms
.erase(realm
->ino
);
4452 if (realm
->pparent
) {
4453 realm
->pparent
->pchildren
.erase(realm
);
4454 put_snap_realm(realm
->pparent
);
4460 bool Client::adjust_realm_parent(SnapRealm
*realm
, inodeno_t parent
)
4462 if (realm
->parent
!= parent
) {
4463 ldout(cct
, 10) << "adjust_realm_parent " << *realm
4464 << " " << realm
->parent
<< " -> " << parent
<< dendl
;
4465 realm
->parent
= parent
;
4466 if (realm
->pparent
) {
4467 realm
->pparent
->pchildren
.erase(realm
);
4468 put_snap_realm(realm
->pparent
);
4470 realm
->pparent
= get_snap_realm(parent
);
4471 realm
->pparent
->pchildren
.insert(realm
);
4477 static bool has_new_snaps(const SnapContext
& old_snapc
,
4478 const SnapContext
& new_snapc
)
4480 return !new_snapc
.snaps
.empty() && new_snapc
.snaps
[0] > old_snapc
.seq
;
4484 void Client::update_snap_trace(bufferlist
& bl
, SnapRealm
**realm_ret
, bool flush
)
4486 SnapRealm
*first_realm
= NULL
;
4487 ldout(cct
, 10) << "update_snap_trace len " << bl
.length() << dendl
;
4489 map
<SnapRealm
*, SnapContext
> dirty_realms
;
4491 bufferlist::iterator p
= bl
.begin();
4495 SnapRealm
*realm
= get_snap_realm(info
.ino());
4497 bool invalidate
= false;
4499 if (info
.seq() > realm
->seq
) {
4500 ldout(cct
, 10) << "update_snap_trace " << *realm
<< " seq " << info
.seq() << " > " << realm
->seq
4504 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4505 // flush me + children
4508 while (!q
.empty()) {
4509 SnapRealm
*realm
= q
.front();
4512 for (set
<SnapRealm
*>::iterator p
= realm
->pchildren
.begin();
4513 p
!= realm
->pchildren
.end();
4517 if (dirty_realms
.count(realm
) == 0) {
4519 dirty_realms
[realm
] = realm
->get_snap_context();
4525 realm
->seq
= info
.seq();
4526 realm
->created
= info
.created();
4527 realm
->parent_since
= info
.parent_since();
4528 realm
->prior_parent_snaps
= info
.prior_parent_snaps
;
4529 realm
->my_snaps
= info
.my_snaps
;
4533 // _always_ verify parent
4534 if (adjust_realm_parent(realm
, info
.parent()))
4538 invalidate_snaprealm_and_children(realm
);
4539 ldout(cct
, 15) << "update_snap_trace " << *realm
<< " self|parent updated" << dendl
;
4540 ldout(cct
, 15) << " snapc " << realm
->get_snap_context() << dendl
;
4542 ldout(cct
, 10) << "update_snap_trace " << *realm
<< " seq " << info
.seq()
4543 << " <= " << realm
->seq
<< " and same parent, SKIPPING" << dendl
;
4547 first_realm
= realm
;
4549 put_snap_realm(realm
);
4552 for (map
<SnapRealm
*, SnapContext
>::iterator q
= dirty_realms
.begin();
4553 q
!= dirty_realms
.end();
4555 SnapRealm
*realm
= q
->first
;
4556 // if there are new snaps ?
4557 if (has_new_snaps(q
->second
, realm
->get_snap_context())) {
4558 ldout(cct
, 10) << " flushing caps on " << *realm
<< dendl
;
4559 xlist
<Inode
*>::iterator r
= realm
->inodes_with_caps
.begin();
4563 queue_cap_snap(in
, q
->second
);
4566 ldout(cct
, 10) << " no new snap on " << *realm
<< dendl
;
4568 put_snap_realm(realm
);
4572 *realm_ret
= first_realm
;
4574 put_snap_realm(first_realm
);
4577 void Client::handle_snap(MClientSnap
*m
)
4579 ldout(cct
, 10) << "handle_snap " << *m
<< dendl
;
4580 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4581 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4587 got_mds_push(session
);
4589 map
<Inode
*, SnapContext
> to_move
;
4590 SnapRealm
*realm
= 0;
4592 if (m
->head
.op
== CEPH_SNAP_OP_SPLIT
) {
4593 assert(m
->head
.split
);
4595 bufferlist::iterator p
= m
->bl
.begin();
4597 assert(info
.ino() == m
->head
.split
);
4599 // flush, then move, ino's.
4600 realm
= get_snap_realm(info
.ino());
4601 ldout(cct
, 10) << " splitting off " << *realm
<< dendl
;
4602 for (vector
<inodeno_t
>::iterator p
= m
->split_inos
.begin();
4603 p
!= m
->split_inos
.end();
4605 vinodeno_t
vino(*p
, CEPH_NOSNAP
);
4606 if (inode_map
.count(vino
)) {
4607 Inode
*in
= inode_map
[vino
];
4608 if (!in
->snaprealm
|| in
->snaprealm
== realm
)
4610 if (in
->snaprealm
->created
> info
.created()) {
4611 ldout(cct
, 10) << " NOT moving " << *in
<< " from _newer_ realm "
4612 << *in
->snaprealm
<< dendl
;
4615 ldout(cct
, 10) << " moving " << *in
<< " from " << *in
->snaprealm
<< dendl
;
4618 in
->snaprealm_item
.remove_myself();
4619 to_move
[in
] = in
->snaprealm
->get_snap_context();
4620 put_snap_realm(in
->snaprealm
);
4624 // move child snaprealms, too
4625 for (vector
<inodeno_t
>::iterator p
= m
->split_realms
.begin();
4626 p
!= m
->split_realms
.end();
4628 ldout(cct
, 10) << "adjusting snaprealm " << *p
<< " parent" << dendl
;
4629 SnapRealm
*child
= get_snap_realm_maybe(*p
);
4632 adjust_realm_parent(child
, realm
->ino
);
4633 put_snap_realm(child
);
4637 update_snap_trace(m
->bl
, NULL
, m
->head
.op
!= CEPH_SNAP_OP_DESTROY
);
4640 for (auto p
= to_move
.begin(); p
!= to_move
.end(); ++p
) {
4641 Inode
*in
= p
->first
;
4642 in
->snaprealm
= realm
;
4643 realm
->inodes_with_caps
.push_back(&in
->snaprealm_item
);
4645 // queue for snap writeback
4646 if (has_new_snaps(p
->second
, realm
->get_snap_context()))
4647 queue_cap_snap(in
, p
->second
);
4649 put_snap_realm(realm
);
4655 void Client::handle_quota(MClientQuota
*m
)
4657 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4658 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4664 got_mds_push(session
);
4666 ldout(cct
, 10) << "handle_quota " << *m
<< " from mds." << mds
<< dendl
;
4668 vinodeno_t
vino(m
->ino
, CEPH_NOSNAP
);
4669 if (inode_map
.count(vino
)) {
4671 in
= inode_map
[vino
];
4674 in
->quota
= m
->quota
;
4675 in
->rstat
= m
->rstat
;
4682 void Client::handle_caps(MClientCaps
*m
)
4684 mds_rank_t mds
= mds_rank_t(m
->get_source().num());
4685 MetaSession
*session
= _get_mds_session(mds
, m
->get_connection().get());
4691 if (m
->osd_epoch_barrier
&& !objecter
->have_map(m
->osd_epoch_barrier
)) {
4692 // Pause RADOS operations until we see the required epoch
4693 objecter
->set_epoch_barrier(m
->osd_epoch_barrier
);
4696 if (m
->osd_epoch_barrier
> cap_epoch_barrier
) {
4697 // Record the barrier so that we will transmit it to MDS when releasing
4698 set_cap_epoch_barrier(m
->osd_epoch_barrier
);
4701 got_mds_push(session
);
4703 m
->clear_payload(); // for if/when we send back to MDS
4706 vinodeno_t
vino(m
->get_ino(), CEPH_NOSNAP
);
4707 if (inode_map
.count(vino
))
4708 in
= inode_map
[vino
];
4710 if (m
->get_op() == CEPH_CAP_OP_IMPORT
) {
4711 ldout(cct
, 5) << "handle_caps don't have vino " << vino
<< " on IMPORT, immediately releasing" << dendl
;
4712 session
->enqueue_cap_release(
4719 ldout(cct
, 5) << "handle_caps don't have vino " << vino
<< ", dropping" << dendl
;
4723 // in case the mds is waiting on e.g. a revocation
4724 flush_cap_releases();
4728 switch (m
->get_op()) {
4729 case CEPH_CAP_OP_EXPORT
:
4730 return handle_cap_export(session
, in
, m
);
4731 case CEPH_CAP_OP_FLUSHSNAP_ACK
:
4732 return handle_cap_flushsnap_ack(session
, in
, m
);
4733 case CEPH_CAP_OP_IMPORT
:
4734 handle_cap_import(session
, in
, m
);
4737 if (in
->caps
.count(mds
) == 0) {
4738 ldout(cct
, 5) << "handle_caps don't have " << *in
<< " cap on mds." << mds
<< dendl
;
4743 Cap
*cap
= in
->caps
[mds
];
4745 switch (m
->get_op()) {
4746 case CEPH_CAP_OP_TRUNC
: return handle_cap_trunc(session
, in
, m
);
4747 case CEPH_CAP_OP_IMPORT
:
4748 case CEPH_CAP_OP_REVOKE
:
4749 case CEPH_CAP_OP_GRANT
: return handle_cap_grant(session
, in
, cap
, m
);
4750 case CEPH_CAP_OP_FLUSH_ACK
: return handle_cap_flush_ack(session
, in
, cap
, m
);
4756 void Client::handle_cap_import(MetaSession
*session
, Inode
*in
, MClientCaps
*m
)
4758 mds_rank_t mds
= session
->mds_num
;
4760 ldout(cct
, 5) << "handle_cap_import ino " << m
->get_ino() << " mseq " << m
->get_mseq()
4761 << " IMPORT from mds." << mds
<< dendl
;
4763 const mds_rank_t peer_mds
= mds_rank_t(m
->peer
.mds
);
4766 if (m
->peer
.cap_id
&& in
->caps
.count(peer_mds
)) {
4767 cap
= in
->caps
[peer_mds
];
4769 cap_perms
= cap
->latest_perms
;
4774 SnapRealm
*realm
= NULL
;
4775 update_snap_trace(m
->snapbl
, &realm
);
4777 add_update_cap(in
, session
, m
->get_cap_id(),
4778 m
->get_caps(), m
->get_seq(), m
->get_mseq(), m
->get_realm(),
4779 CEPH_CAP_FLAG_AUTH
, cap_perms
);
4781 if (cap
&& cap
->cap_id
== m
->peer
.cap_id
) {
4782 remove_cap(cap
, (m
->peer
.flags
& CEPH_CAP_FLAG_RELEASE
));
4786 put_snap_realm(realm
);
4788 if (in
->auth_cap
&& in
->auth_cap
->session
->mds_num
== mds
) {
4789 // reflush any/all caps (if we are now the auth_cap)
4790 if (in
->cap_snaps
.size())
4791 flush_snaps(in
, true);
4792 if (in
->flushing_caps
)
4793 flush_caps(in
, session
);
4797 void Client::handle_cap_export(MetaSession
*session
, Inode
*in
, MClientCaps
*m
)
4799 mds_rank_t mds
= session
->mds_num
;
4801 ldout(cct
, 5) << "handle_cap_export ino " << m
->get_ino() << " mseq " << m
->get_mseq()
4802 << " EXPORT from mds." << mds
<< dendl
;
4805 if (in
->caps
.count(mds
))
4806 cap
= in
->caps
[mds
];
4808 const mds_rank_t peer_mds
= mds_rank_t(m
->peer
.mds
);
4810 if (cap
&& cap
->cap_id
== m
->get_cap_id()) {
4811 if (m
->peer
.cap_id
) {
4812 MetaSession
*tsession
= _get_or_open_mds_session(peer_mds
);
4813 if (in
->caps
.count(peer_mds
)) {
4814 Cap
*tcap
= in
->caps
[peer_mds
];
4815 if (tcap
->cap_id
== m
->peer
.cap_id
&&
4816 ceph_seq_cmp(tcap
->seq
, m
->peer
.seq
) < 0) {
4817 tcap
->cap_id
= m
->peer
.cap_id
;
4818 tcap
->seq
= m
->peer
.seq
- 1;
4819 tcap
->issue_seq
= tcap
->seq
;
4820 tcap
->mseq
= m
->peer
.mseq
;
4821 tcap
->issued
|= cap
->issued
;
4822 tcap
->implemented
|= cap
->issued
;
4823 if (cap
== in
->auth_cap
)
4824 in
->auth_cap
= tcap
;
4825 if (in
->auth_cap
== tcap
&& in
->flushing_cap_item
.is_on_list())
4826 adjust_session_flushing_caps(in
, session
, tsession
);
4829 add_update_cap(in
, tsession
, m
->peer
.cap_id
, cap
->issued
,
4830 m
->peer
.seq
- 1, m
->peer
.mseq
, (uint64_t)-1,
4831 cap
== in
->auth_cap
? CEPH_CAP_FLAG_AUTH
: 0,
4835 if (cap
== in
->auth_cap
)
4836 in
->flags
|= I_CAP_DROPPED
;
4839 remove_cap(cap
, false);
4845 void Client::handle_cap_trunc(MetaSession
*session
, Inode
*in
, MClientCaps
*m
)
4847 mds_rank_t mds
= session
->mds_num
;
4848 assert(in
->caps
[mds
]);
4850 ldout(cct
, 10) << "handle_cap_trunc on ino " << *in
4851 << " size " << in
->size
<< " -> " << m
->get_size()
4855 in
->caps_issued(&issued
);
4856 issued
|= in
->caps_dirty();
4857 update_inode_file_size(in
, issued
, m
->get_size(),
4858 m
->get_truncate_seq(), m
->get_truncate_size());
4862 void Client::handle_cap_flush_ack(MetaSession
*session
, Inode
*in
, Cap
*cap
, MClientCaps
*m
)
4864 ceph_tid_t flush_ack_tid
= m
->get_client_tid();
4865 int dirty
= m
->get_dirty();
4869 for (map
<ceph_tid_t
, int>::iterator it
= in
->flushing_cap_tids
.begin();
4870 it
!= in
->flushing_cap_tids
.end(); ) {
4871 if (it
->first
== flush_ack_tid
)
4872 cleaned
= it
->second
;
4873 if (it
->first
<= flush_ack_tid
) {
4874 session
->flushing_caps_tids
.erase(it
->first
);
4875 in
->flushing_cap_tids
.erase(it
++);
4879 cleaned
&= ~it
->second
;
4885 ldout(cct
, 5) << "handle_cap_flush_ack mds." << session
->mds_num
4886 << " cleaned " << ccap_string(cleaned
) << " on " << *in
4887 << " with " << ccap_string(dirty
) << dendl
;
4890 signal_cond_list(in
->waitfor_caps
);
4891 if (session
->flushing_caps_tids
.empty() ||
4892 *session
->flushing_caps_tids
.begin() > flush_ack_tid
)
4897 in
->cap_dirtier_uid
= -1;
4898 in
->cap_dirtier_gid
= -1;
4902 ldout(cct
, 10) << " tid " << m
->get_client_tid() << " != any cap bit tids" << dendl
;
4904 if (in
->flushing_caps
) {
4905 ldout(cct
, 5) << " flushing_caps " << ccap_string(in
->flushing_caps
)
4906 << " -> " << ccap_string(in
->flushing_caps
& ~cleaned
) << dendl
;
4907 in
->flushing_caps
&= ~cleaned
;
4908 if (in
->flushing_caps
== 0) {
4909 ldout(cct
, 10) << " " << *in
<< " !flushing" << dendl
;
4910 num_flushing_caps
--;
4911 if (in
->cap_snaps
.empty())
4912 in
->flushing_cap_item
.remove_myself();
4914 if (!in
->caps_dirty())
4923 void Client::handle_cap_flushsnap_ack(MetaSession
*session
, Inode
*in
, MClientCaps
*m
)
4925 mds_rank_t mds
= session
->mds_num
;
4926 assert(in
->caps
[mds
]);
4927 snapid_t follows
= m
->get_snap_follows();
4929 if (in
->cap_snaps
.count(follows
)) {
4930 CapSnap
&capsnap
= in
->cap_snaps
.at(follows
);
4931 if (m
->get_client_tid() != capsnap
.flush_tid
) {
4932 ldout(cct
, 10) << " tid " << m
->get_client_tid() << " != " << capsnap
.flush_tid
<< dendl
;
4934 ldout(cct
, 5) << "handle_cap_flushedsnap mds." << mds
<< " flushed snap follows " << follows
4935 << " on " << *in
<< dendl
;
4937 if (in
->get_num_ref() == 1)
4938 tmp_ref
= in
; // make sure inode not get freed while erasing item from in->cap_snaps
4939 if (in
->flushing_caps
== 0 && in
->cap_snaps
.empty())
4940 in
->flushing_cap_item
.remove_myself();
4941 session
->flushing_caps_tids
.erase(capsnap
.flush_tid
);
4942 in
->cap_snaps
.erase(follows
);
4945 ldout(cct
, 5) << "handle_cap_flushedsnap DUP(?) mds." << mds
<< " flushed snap follows " << follows
4946 << " on " << *in
<< dendl
;
4947 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
4953 class C_Client_DentryInvalidate
: public Context
{
4960 C_Client_DentryInvalidate(Client
*c
, Dentry
*dn
, bool del
) :
4961 client(c
), name(dn
->name
) {
4962 if (client
->use_faked_inos()) {
4963 dirino
.ino
= dn
->dir
->parent_inode
->faked_ino
;
4965 ino
.ino
= dn
->inode
->faked_ino
;
4967 dirino
= dn
->dir
->parent_inode
->vino();
4969 ino
= dn
->inode
->vino();
4972 ino
.ino
= inodeno_t();
4974 void finish(int r
) override
{
4975 // _async_dentry_invalidate is responsible for its own locking
4976 assert(!client
->client_lock
.is_locked_by_me());
4977 client
->_async_dentry_invalidate(dirino
, ino
, name
);
4981 void Client::_async_dentry_invalidate(vinodeno_t dirino
, vinodeno_t ino
, string
& name
)
4985 ldout(cct
, 10) << "_async_dentry_invalidate '" << name
<< "' ino " << ino
4986 << " in dir " << dirino
<< dendl
;
4987 dentry_invalidate_cb(callback_handle
, dirino
, ino
, name
);
4990 void Client::_schedule_invalidate_dentry_callback(Dentry
*dn
, bool del
)
4992 if (dentry_invalidate_cb
&& dn
->inode
->ll_ref
> 0)
4993 async_dentry_invalidator
.queue(new C_Client_DentryInvalidate(this, dn
, del
));
4996 void Client::_try_to_trim_inode(Inode
*in
, bool sched_inval
)
4998 int ref
= in
->get_num_ref();
5000 if (in
->dir
&& !in
->dir
->dentries
.empty()) {
5001 for (auto p
= in
->dir
->dentries
.begin();
5002 p
!= in
->dir
->dentries
.end(); ) {
5003 Dentry
*dn
= p
->second
;
5005 /* rmsnap removes whole subtree, need trim inodes recursively.
5006 * we don't need to invalidate dentries recursively. because
5007 * invalidating a directory dentry effectively invalidate
5009 if (in
->snapid
!= CEPH_NOSNAP
&& dn
->inode
&& dn
->inode
->is_dir())
5010 _try_to_trim_inode(dn
->inode
.get(), false);
5012 if (dn
->lru_is_expireable())
5013 unlink(dn
, true, false); // keep dir, drop dentry
5015 if (in
->dir
->dentries
.empty()) {
5021 if (ref
> 0 && (in
->flags
& I_SNAPDIR_OPEN
)) {
5022 InodeRef snapdir
= open_snapdir(in
);
5023 _try_to_trim_inode(snapdir
.get(), false);
5027 if (ref
> 0 && in
->ll_ref
> 0 && sched_inval
) {
5028 set
<Dentry
*>::iterator q
= in
->dn_set
.begin();
5029 while (q
!= in
->dn_set
.end()) {
5031 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
5032 // so in->dn_set doesn't always reflect the state of kernel's dcache.
5033 _schedule_invalidate_dentry_callback(dn
, true);
5034 unlink(dn
, true, true);
5039 void Client::handle_cap_grant(MetaSession
*session
, Inode
*in
, Cap
*cap
, MClientCaps
*m
)
5041 mds_rank_t mds
= session
->mds_num
;
5042 int used
= get_caps_used(in
);
5043 int wanted
= in
->caps_wanted();
5045 const int old_caps
= cap
->issued
;
5046 const int new_caps
= m
->get_caps();
5047 ldout(cct
, 5) << "handle_cap_grant on in " << m
->get_ino()
5048 << " mds." << mds
<< " seq " << m
->get_seq()
5049 << " caps now " << ccap_string(new_caps
)
5050 << " was " << ccap_string(old_caps
) << dendl
;
5051 cap
->seq
= m
->get_seq();
5052 cap
->gen
= session
->cap_gen
;
5056 in
->caps_issued(&issued
);
5057 issued
|= in
->caps_dirty();
5059 if ((new_caps
& CEPH_CAP_AUTH_SHARED
) &&
5060 !(issued
& CEPH_CAP_AUTH_EXCL
)) {
5061 in
->mode
= m
->head
.mode
;
5062 in
->uid
= m
->head
.uid
;
5063 in
->gid
= m
->head
.gid
;
5064 in
->btime
= m
->btime
;
5066 bool deleted_inode
= false;
5067 if ((new_caps
& CEPH_CAP_LINK_SHARED
) &&
5068 !(issued
& CEPH_CAP_LINK_EXCL
)) {
5069 in
->nlink
= m
->head
.nlink
;
5070 if (in
->nlink
== 0 &&
5071 (new_caps
& (CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
)))
5072 deleted_inode
= true;
5074 if (!(issued
& CEPH_CAP_XATTR_EXCL
) &&
5075 m
->xattrbl
.length() &&
5076 m
->head
.xattr_version
> in
->xattr_version
) {
5077 bufferlist::iterator p
= m
->xattrbl
.begin();
5078 ::decode(in
->xattrs
, p
);
5079 in
->xattr_version
= m
->head
.xattr_version
;
5082 if ((new_caps
& CEPH_CAP_FILE_SHARED
) && m
->dirstat_is_valid()) {
5083 in
->dirstat
.nfiles
= m
->get_nfiles();
5084 in
->dirstat
.nsubdirs
= m
->get_nsubdirs();
5087 if (new_caps
& CEPH_CAP_ANY_RD
) {
5088 update_inode_file_time(in
, issued
, m
->get_time_warp_seq(),
5089 m
->get_ctime(), m
->get_mtime(), m
->get_atime());
5092 if (new_caps
& (CEPH_CAP_ANY_FILE_RD
| CEPH_CAP_ANY_FILE_WR
)) {
5093 in
->layout
= m
->get_layout();
5094 update_inode_file_size(in
, issued
, m
->get_size(),
5095 m
->get_truncate_seq(), m
->get_truncate_size());
5098 if (m
->inline_version
> in
->inline_version
) {
5099 in
->inline_data
= m
->inline_data
;
5100 in
->inline_version
= m
->inline_version
;
5103 /* always take a newer change attr */
5104 if (m
->get_change_attr() > in
->change_attr
)
5105 in
->change_attr
= m
->get_change_attr();
5108 if (cap
== in
->auth_cap
&&
5109 (new_caps
& CEPH_CAP_ANY_FILE_WR
) &&
5110 (m
->get_max_size() != in
->max_size
)) {
5111 ldout(cct
, 10) << "max_size " << in
->max_size
<< " -> " << m
->get_max_size() << dendl
;
5112 in
->max_size
= m
->get_max_size();
5113 if (in
->max_size
> in
->wanted_max_size
) {
5114 in
->wanted_max_size
= 0;
5115 in
->requested_max_size
= 0;
5120 if (m
->get_op() == CEPH_CAP_OP_IMPORT
&& m
->get_wanted() != wanted
)
5123 check_cap_issue(in
, cap
, new_caps
);
5126 int revoked
= old_caps
& ~new_caps
;
5128 ldout(cct
, 10) << " revocation of " << ccap_string(revoked
) << dendl
;
5129 cap
->issued
= new_caps
;
5130 cap
->implemented
|= new_caps
;
5132 // recall delegations if we're losing caps necessary for them
5133 if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_RD
))
5134 in
->recall_deleg(false);
5135 else if (revoked
& ceph_deleg_caps_for_type(CEPH_DELEGATION_WR
))
5136 in
->recall_deleg(true);
5138 if ((used
& revoked
& CEPH_CAP_FILE_BUFFER
) &&
5139 !_flush(in
, new C_Client_FlushComplete(this, in
))) {
5140 // waitin' for flush
5141 } else if (revoked
& CEPH_CAP_FILE_CACHE
) {
5145 cap
->wanted
= 0; // don't let check_caps skip sending a response to MDS
5148 } else if (old_caps
== new_caps
) {
5149 ldout(cct
, 10) << " caps unchanged at " << ccap_string(old_caps
) << dendl
;
5151 ldout(cct
, 10) << " grant, new caps are " << ccap_string(new_caps
& ~old_caps
) << dendl
;
5152 cap
->issued
= new_caps
;
5153 cap
->implemented
|= new_caps
;
5155 if (cap
== in
->auth_cap
) {
5156 // non-auth MDS is revoking the newly grant caps ?
5157 for (map
<mds_rank_t
, Cap
*>::iterator it
= in
->caps
.begin(); it
!= in
->caps
.end(); ++it
) {
5158 if (it
->second
== cap
)
5160 if (it
->second
->implemented
& ~it
->second
->issued
& new_caps
) {
5173 signal_cond_list(in
->waitfor_caps
);
5175 // may drop inode's last ref
5177 _try_to_trim_inode(in
, true);
5182 int Client::inode_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
5184 if (perms
.uid() == 0)
5187 if (perms
.uid() != in
->uid
&& (in
->mode
& S_IRWXG
)) {
5188 int ret
= _posix_acl_permission(in
, perms
, want
);
5193 // check permissions before doing anything else
5194 if (!in
->check_mode(perms
, want
))
5199 int Client::xattr_permission(Inode
*in
, const char *name
, unsigned want
,
5200 const UserPerm
& perms
)
5202 int r
= _getattr_for_perm(in
, perms
);
5207 if (strncmp(name
, "system.", 7) == 0) {
5208 if ((want
& MAY_WRITE
) && (perms
.uid() != 0 && perms
.uid() != in
->uid
))
5211 r
= inode_permission(in
, perms
, want
);
5214 ldout(cct
, 5) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5218 ostream
& operator<<(ostream
&out
, const UserPerm
& perm
) {
5219 out
<< "UserPerm(uid: " << perm
.uid() << ", gid: " << perm
.gid() << ")";
5223 int Client::may_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
5224 const UserPerm
& perms
)
5226 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5227 int r
= _getattr_for_perm(in
, perms
);
5231 if (mask
& CEPH_SETATTR_SIZE
) {
5232 r
= inode_permission(in
, perms
, MAY_WRITE
);
5238 if (mask
& CEPH_SETATTR_UID
) {
5239 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
|| stx
->stx_uid
!= in
->uid
))
5242 if (mask
& CEPH_SETATTR_GID
) {
5243 if (perms
.uid() != 0 && (perms
.uid() != in
->uid
||
5244 (!perms
.gid_in_groups(stx
->stx_gid
) && stx
->stx_gid
!= in
->gid
)))
5248 if (mask
& CEPH_SETATTR_MODE
) {
5249 if (perms
.uid() != 0 && perms
.uid() != in
->uid
)
5252 gid_t i_gid
= (mask
& CEPH_SETATTR_GID
) ? stx
->stx_gid
: in
->gid
;
5253 if (perms
.uid() != 0 && !perms
.gid_in_groups(i_gid
))
5254 stx
->stx_mode
&= ~S_ISGID
;
5257 if (mask
& (CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
|
5258 CEPH_SETATTR_MTIME
| CEPH_SETATTR_ATIME
)) {
5259 if (perms
.uid() != 0 && perms
.uid() != in
->uid
) {
5260 int check_mask
= CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
;
5261 if (!(mask
& CEPH_SETATTR_MTIME_NOW
))
5262 check_mask
|= CEPH_SETATTR_MTIME
;
5263 if (!(mask
& CEPH_SETATTR_ATIME_NOW
))
5264 check_mask
|= CEPH_SETATTR_ATIME
;
5265 if (check_mask
& mask
) {
5268 r
= inode_permission(in
, perms
, MAY_WRITE
);
5276 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5280 int Client::may_open(Inode
*in
, int flags
, const UserPerm
& perms
)
5282 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5285 if ((flags
& O_ACCMODE
) == O_WRONLY
)
5287 else if ((flags
& O_ACCMODE
) == O_RDWR
)
5288 want
= MAY_READ
| MAY_WRITE
;
5289 else if ((flags
& O_ACCMODE
) == O_RDONLY
)
5291 if (flags
& O_TRUNC
)
5295 switch (in
->mode
& S_IFMT
) {
5300 if (want
& MAY_WRITE
) {
5307 r
= _getattr_for_perm(in
, perms
);
5311 r
= inode_permission(in
, perms
, want
);
5313 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5317 int Client::may_lookup(Inode
*dir
, const UserPerm
& perms
)
5319 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5320 int r
= _getattr_for_perm(dir
, perms
);
5324 r
= inode_permission(dir
, perms
, MAY_EXEC
);
5326 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5330 int Client::may_create(Inode
*dir
, const UserPerm
& perms
)
5332 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << perms
<< dendl
;
5333 int r
= _getattr_for_perm(dir
, perms
);
5337 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5339 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5343 int Client::may_delete(Inode
*dir
, const char *name
, const UserPerm
& perms
)
5345 ldout(cct
, 20) << __func__
<< " " << *dir
<< "; " << "; name " << name
<< "; " << perms
<< dendl
;
5346 int r
= _getattr_for_perm(dir
, perms
);
5350 r
= inode_permission(dir
, perms
, MAY_EXEC
| MAY_WRITE
);
5354 /* 'name == NULL' means rmsnap */
5355 if (perms
.uid() != 0 && name
&& (dir
->mode
& S_ISVTX
)) {
5357 r
= _lookup(dir
, name
, CEPH_CAP_AUTH_SHARED
, &otherin
, perms
);
5360 if (dir
->uid
!= perms
.uid() && otherin
->uid
!= perms
.uid())
5364 ldout(cct
, 3) << __func__
<< " " << dir
<< " = " << r
<< dendl
;
5368 int Client::may_hardlink(Inode
*in
, const UserPerm
& perms
)
5370 ldout(cct
, 20) << __func__
<< " " << *in
<< "; " << perms
<< dendl
;
5371 int r
= _getattr_for_perm(in
, perms
);
5375 if (perms
.uid() == 0 || perms
.uid() == in
->uid
) {
5381 if (!S_ISREG(in
->mode
))
5384 if (in
->mode
& S_ISUID
)
5387 if ((in
->mode
& (S_ISGID
| S_IXGRP
)) == (S_ISGID
| S_IXGRP
))
5390 r
= inode_permission(in
, perms
, MAY_READ
| MAY_WRITE
);
5392 ldout(cct
, 3) << __func__
<< " " << in
<< " = " << r
<< dendl
;
5396 int Client::_getattr_for_perm(Inode
*in
, const UserPerm
& perms
)
5398 int mask
= CEPH_STAT_CAP_MODE
;
5400 if (acl_type
!= NO_ACL
) {
5401 mask
|= CEPH_STAT_CAP_XATTR
;
5402 force
= in
->xattr_version
== 0;
5404 return _getattr(in
, mask
, perms
, force
);
5407 vinodeno_t
Client::_get_vino(Inode
*in
)
5409 /* The caller must hold the client lock */
5410 return vinodeno_t(in
->ino
, in
->snapid
);
5413 inodeno_t
Client::_get_inodeno(Inode
*in
)
5415 /* The caller must hold the client lock */
5421 * Resolve an MDS spec to a list of MDS daemon GIDs.
5423 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5424 * It may be '*' in which case it matches all GIDs.
5426 * If no error is returned, the `targets` vector will be populated with at least
5429 int Client::resolve_mds(
5430 const std::string
&mds_spec
,
5431 std::vector
<mds_gid_t
> *targets
)
5434 assert(targets
!= nullptr);
5437 std::stringstream ss
;
5438 int role_r
= fsmap
->parse_role(mds_spec
, &role
, ss
);
5440 // We got a role, resolve it to a GID
5441 ldout(cct
, 10) << __func__
<< ": resolved '" << mds_spec
<< "' to role '"
5442 << role
<< "'" << dendl
;
5444 fsmap
->get_filesystem(role
.fscid
)->mds_map
.get_info(role
.rank
).global_id
);
5448 std::string strtol_err
;
5449 long long rank_or_gid
= strict_strtoll(mds_spec
.c_str(), 10, &strtol_err
);
5450 if (strtol_err
.empty()) {
5451 // It is a possible GID
5452 const mds_gid_t mds_gid
= mds_gid_t(rank_or_gid
);
5453 if (fsmap
->gid_exists(mds_gid
)) {
5454 ldout(cct
, 10) << __func__
<< ": validated GID " << mds_gid
<< dendl
;
5455 targets
->push_back(mds_gid
);
5457 lderr(cct
) << __func__
<< ": GID " << mds_gid
<< " not in MDS map"
5461 } else if (mds_spec
== "*") {
5462 // It is a wildcard: use all MDSs
5463 const auto mds_info
= fsmap
->get_mds_info();
5465 if (mds_info
.empty()) {
5466 lderr(cct
) << __func__
<< ": * passed but no MDS daemons found" << dendl
;
5470 for (const auto i
: mds_info
) {
5471 targets
->push_back(i
.first
);
5474 // It did not parse as an integer, it is not a wildcard, it must be a name
5475 const mds_gid_t mds_gid
= fsmap
->find_mds_gid_by_name(mds_spec
);
5477 lderr(cct
) << "MDS ID '" << mds_spec
<< "' not found" << dendl
;
5479 lderr(cct
) << "FSMap: " << *fsmap
<< dendl
;
5483 ldout(cct
, 10) << __func__
<< ": resolved ID '" << mds_spec
5484 << "' to GID " << mds_gid
<< dendl
;
5485 targets
->push_back(mds_gid
);
5494 * Authenticate with mon and establish global ID
5496 int Client::authenticate()
5498 assert(client_lock
.is_locked_by_me());
5500 if (monclient
->is_authenticated()) {
5504 client_lock
.Unlock();
5505 int r
= monclient
->authenticate(cct
->_conf
->client_mount_timeout
);
5511 whoami
= monclient
->get_global_id();
5512 messenger
->set_myname(entity_name_t::CLIENT(whoami
.v
));
5517 int Client::fetch_fsmap(bool user
)
5520 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5521 // rather than MDSMap because no one MDSMap contains all the daemons, and
5522 // a `tell` can address any daemon.
5523 version_t fsmap_latest
;
5526 monclient
->get_version("fsmap", &fsmap_latest
, NULL
, &cond
);
5527 client_lock
.Unlock();
5530 } while (r
== -EAGAIN
);
5533 lderr(cct
) << "Failed to learn FSMap version: " << cpp_strerror(r
) << dendl
;
5537 ldout(cct
, 10) << __func__
<< " learned FSMap version " << fsmap_latest
<< dendl
;
5540 if (!fsmap_user
|| fsmap_user
->get_epoch() < fsmap_latest
) {
5541 monclient
->sub_want("fsmap.user", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
5542 monclient
->renew_subs();
5543 wait_on_list(waiting_for_fsmap
);
5546 assert(fsmap_user
->get_epoch() >= fsmap_latest
);
5548 if (!fsmap
|| fsmap
->get_epoch() < fsmap_latest
) {
5549 monclient
->sub_want("fsmap", fsmap_latest
, CEPH_SUBSCRIBE_ONETIME
);
5550 monclient
->renew_subs();
5551 wait_on_list(waiting_for_fsmap
);
5554 assert(fsmap
->get_epoch() >= fsmap_latest
);
5556 ldout(cct
, 10) << __func__
<< " finished waiting for FSMap version "
5557 << fsmap_latest
<< dendl
;
5563 * @mds_spec one of ID, rank, GID, "*"
5566 int Client::mds_command(
5567 const std::string
&mds_spec
,
5568 const vector
<string
>& cmd
,
5569 const bufferlist
& inbl
,
5574 Mutex::Locker
lock(client_lock
);
5585 r
= fetch_fsmap(false);
5590 // Look up MDS target(s) of the command
5591 std::vector
<mds_gid_t
> targets
;
5592 r
= resolve_mds(mds_spec
, &targets
);
5597 // If daemons are laggy, we won't send them commands. If all
5598 // are laggy then we fail.
5599 std::vector
<mds_gid_t
> non_laggy
;
5600 for (const auto gid
: targets
) {
5601 const auto info
= fsmap
->get_info_gid(gid
);
5602 if (!info
.laggy()) {
5603 non_laggy
.push_back(gid
);
5606 if (non_laggy
.size() == 0) {
5607 *outs
= "All targeted MDS daemons are laggy";
5611 if (metadata
.empty()) {
5612 // We are called on an unmounted client, so metadata
5613 // won't be initialized yet.
5614 populate_metadata("");
5617 // Send commands to targets
5618 C_GatherBuilder
gather(cct
, onfinish
);
5619 for (const auto target_gid
: non_laggy
) {
5620 const auto info
= fsmap
->get_info_gid(target_gid
);
5622 // Open a connection to the target MDS
5623 entity_inst_t inst
= info
.get_inst();
5624 ConnectionRef conn
= messenger
->get_connection(inst
);
5626 // Generate MDSCommandOp state
5627 auto &op
= command_table
.start_command();
5629 op
.on_finish
= gather
.new_sub();
5634 op
.mds_gid
= target_gid
;
5637 ldout(cct
, 4) << __func__
<< ": new command op to " << target_gid
5638 << " tid=" << op
.tid
<< cmd
<< dendl
;
5640 // Construct and send MCommand
5641 MCommand
*m
= op
.get_message(monclient
->get_fsid());
5642 conn
->send_message(m
);
5649 void Client::handle_command_reply(MCommandReply
*m
)
5651 ceph_tid_t
const tid
= m
->get_tid();
5653 ldout(cct
, 10) << __func__
<< ": tid=" << m
->get_tid() << dendl
;
5655 if (!command_table
.exists(tid
)) {
5656 ldout(cct
, 1) << __func__
<< ": unknown tid " << tid
<< ", dropping" << dendl
;
5661 auto &op
= command_table
.get_command(tid
);
5663 op
.outbl
->claim(m
->get_data());
5670 op
.on_finish
->complete(m
->r
);
5673 command_table
.erase(tid
);
5678 // -------------------
5681 int Client::mount(const std::string
&mount_root
, const UserPerm
& perms
,
5684 Mutex::Locker
lock(client_lock
);
5687 ldout(cct
, 5) << "already mounted" << dendl
;
5693 int r
= authenticate();
5695 lderr(cct
) << "authentication failed: " << cpp_strerror(r
) << dendl
;
5699 std::string want
= "mdsmap";
5700 const auto &mds_ns
= cct
->_conf
->client_mds_namespace
;
5701 if (!mds_ns
.empty()) {
5702 r
= fetch_fsmap(true);
5705 fs_cluster_id_t cid
= fsmap_user
->get_fs_cid(mds_ns
);
5706 if (cid
== FS_CLUSTER_ID_NONE
)
5709 std::ostringstream oss
;
5710 oss
<< want
<< "." << cid
;
5713 ldout(cct
, 10) << "Subscribing to map '" << want
<< "'" << dendl
;
5715 monclient
->sub_want(want
, 0, 0);
5716 monclient
->renew_subs();
5718 tick(); // start tick
5722 auto availability
= mdsmap
->is_cluster_available();
5723 if (availability
== MDSMap::STUCK_UNAVAILABLE
) {
5725 ldout(cct
, 10) << "mds cluster unavailable: epoch=" << mdsmap
->get_epoch() << dendl
;
5726 return CEPH_FUSE_NO_MDS_UP
;
5727 } else if (availability
== MDSMap::AVAILABLE
) {
5728 // Continue to mount
5730 } else if (availability
== MDSMap::TRANSIENT_UNAVAILABLE
) {
5731 // Else, wait. MDSMonitor will update the map to bring
5732 // us to a conclusion eventually.
5733 wait_on_list(waiting_for_mdsmap
);
5735 // Unexpected value!
5741 populate_metadata(mount_root
.empty() ? "/" : mount_root
);
5743 filepath
fp(CEPH_INO_ROOT
);
5744 if (!mount_root
.empty()) {
5745 fp
= filepath(mount_root
.c_str());
5748 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
5749 req
->set_filepath(fp
);
5750 req
->head
.args
.getattr
.mask
= CEPH_STAT_CAP_INODE_ALL
;
5751 int res
= make_request(req
, perms
);
5753 if (res
== -EACCES
&& root
) {
5754 ldout(cct
, 1) << __func__
<< " EACCES on parent of mount point; quotas may not work" << dendl
;
5772 if (!cct
->_conf
->client_trace
.empty()) {
5773 traceout
.open(cct
->_conf
->client_trace
.c_str());
5774 if (traceout
.is_open()) {
5775 ldout(cct
, 1) << "opened trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
5777 ldout(cct
, 1) << "FAILED to open trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
5782 ldout(cct, 3) << "op: // client trace data structs" << dendl;
5783 ldout(cct, 3) << "op: struct stat st;" << dendl;
5784 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
5785 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
5786 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
5787 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
5788 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
5789 ldout(cct, 3) << "op: int fd;" << dendl;
5796 void Client::_close_sessions()
5798 while (!mds_sessions
.empty()) {
5799 // send session closes!
5800 for (map
<mds_rank_t
,MetaSession
*>::iterator p
= mds_sessions
.begin();
5801 p
!= mds_sessions
.end();
5803 if (p
->second
->state
!= MetaSession::STATE_CLOSING
) {
5804 _close_mds_session(p
->second
);
5808 // wait for sessions to close
5809 ldout(cct
, 2) << "waiting for " << mds_sessions
.size() << " mds sessions to close" << dendl
;
5810 mount_cond
.Wait(client_lock
);
5814 void Client::flush_mdlog_sync()
5816 if (mds_requests
.empty())
5818 for (map
<mds_rank_t
,MetaSession
*>::iterator p
= mds_sessions
.begin();
5819 p
!= mds_sessions
.end();
5821 MetaSession
*s
= p
->second
;
5826 void Client::flush_mdlog(MetaSession
*session
)
5828 // Only send this to Luminous or newer MDS daemons, older daemons
5829 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5830 const uint64_t features
= session
->con
->get_features();
5831 if (HAVE_FEATURE(features
, SERVER_LUMINOUS
)) {
5832 MClientSession
*m
= new MClientSession(CEPH_SESSION_REQUEST_FLUSH_MDLOG
);
5833 session
->con
->send_message(m
);
5838 void Client::_unmount()
5843 ldout(cct
, 2) << "unmounting" << dendl
;
5848 flush_mdlog_sync(); // flush the mdlog for pending requests, if any
5849 while (!mds_requests
.empty()) {
5850 ldout(cct
, 10) << "waiting on " << mds_requests
.size() << " requests" << dendl
;
5851 mount_cond
.Wait(client_lock
);
5855 timer
.cancel_event(tick_event
);
5860 // clean up any unclosed files
5861 while (!fd_map
.empty()) {
5862 Fh
*fh
= fd_map
.begin()->second
;
5863 fd_map
.erase(fd_map
.begin());
5864 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *fh
->inode
<< dendl
;
5868 while (!ll_unclosed_fh_set
.empty()) {
5869 set
<Fh
*>::iterator it
= ll_unclosed_fh_set
.begin();
5871 ll_unclosed_fh_set
.erase(fh
);
5872 ldout(cct
, 0) << " destroyed lost open file " << fh
<< " on " << *(fh
->inode
) << dendl
;
5876 while (!opened_dirs
.empty()) {
5877 dir_result_t
*dirp
= *opened_dirs
.begin();
5878 ldout(cct
, 0) << " destroyed lost open dir " << dirp
<< " on " << *dirp
->inode
<< dendl
;
5885 ldout(cct
, 0) << " skipping clean shutdown, we are blacklisted" << dendl
;
5887 if (cct
->_conf
->client_oc
) {
5888 // Purge all cached data so that ObjectCacher doesn't get hung up
5889 // trying to flush it. ObjectCacher's behaviour on EBLACKLISTED
5890 // is to just leave things marked dirty
5891 // (http://tracker.ceph.com/issues/9105)
5892 for (const auto &i
: inode_map
) {
5893 objectcacher
->purge_set(&(i
.second
->oset
));
5901 while (unsafe_sync_write
> 0) {
5902 ldout(cct
, 0) << unsafe_sync_write
<< " unsafe_sync_writes, waiting" << dendl
;
5903 mount_cond
.Wait(client_lock
);
5906 if (cct
->_conf
->client_oc
) {
5907 // flush/release all buffered data
5908 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator next
;
5909 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator p
= inode_map
.begin();
5910 p
!= inode_map
.end();
5914 Inode
*in
= p
->second
;
5916 ldout(cct
, 0) << "null inode_map entry ino " << p
->first
<< dendl
;
5919 if (!in
->caps
.empty()) {
5920 InodeRef
tmp_ref(in
);
5922 _flush(in
, new C_Client_FlushComplete(this, in
));
5928 wait_sync_caps(last_flush_tid
);
5933 while (lru
.lru_get_size() > 0 ||
5934 !inode_map
.empty()) {
5935 ldout(cct
, 2) << "cache still has " << lru
.lru_get_size()
5936 << "+" << inode_map
.size() << " items"
5937 << ", waiting (for caps to release?)"
5939 utime_t until
= ceph_clock_now() + utime_t(5, 0);
5940 int r
= mount_cond
.WaitUntil(client_lock
, until
);
5941 if (r
== ETIMEDOUT
) {
5945 assert(lru
.lru_get_size() == 0);
5946 assert(inode_map
.empty());
5949 if (!cct
->_conf
->client_trace
.empty()) {
5950 ldout(cct
, 1) << "closing trace file '" << cct
->_conf
->client_trace
<< "'" << dendl
;
5958 ldout(cct
, 2) << "unmounted." << dendl
;
5961 void Client::unmount()
5963 Mutex::Locker
lock(client_lock
);
5967 void Client::flush_cap_releases()
5969 // send any cap releases
5970 for (map
<mds_rank_t
,MetaSession
*>::iterator p
= mds_sessions
.begin();
5971 p
!= mds_sessions
.end();
5973 if (p
->second
->release
&& mdsmap
->is_clientreplay_or_active_or_stopping(
5975 if (cct
->_conf
->client_inject_release_failure
) {
5976 ldout(cct
, 20) << __func__
<< " injecting failure to send cap release message" << dendl
;
5977 p
->second
->release
->put();
5979 p
->second
->con
->send_message(p
->second
->release
);
5981 p
->second
->release
= 0;
5988 if (cct
->_conf
->client_debug_inject_tick_delay
> 0) {
5989 sleep(cct
->_conf
->client_debug_inject_tick_delay
);
5990 assert(0 == cct
->_conf
->set_val("client_debug_inject_tick_delay", "0"));
5991 cct
->_conf
->apply_changes(NULL
);
5994 ldout(cct
, 21) << "tick" << dendl
;
5995 tick_event
= timer
.add_event_after(
5996 cct
->_conf
->client_tick_interval
,
5997 new FunctionContext([this](int) {
5998 // Called back via Timer, which takes client_lock for us
5999 assert(client_lock
.is_locked_by_me());
6002 utime_t now
= ceph_clock_now();
6004 if (!mounted
&& !mds_requests
.empty()) {
6005 MetaRequest
*req
= mds_requests
.begin()->second
;
6006 if (req
->op_stamp
+ cct
->_conf
->client_mount_timeout
< now
) {
6007 req
->abort(-ETIMEDOUT
);
6008 if (req
->caller_cond
) {
6010 req
->caller_cond
->Signal();
6012 signal_cond_list(waiting_for_mdsmap
);
6013 for (map
<mds_rank_t
,MetaSession
*>::iterator p
= mds_sessions
.begin();
6014 p
!= mds_sessions
.end();
6016 signal_context_list(p
->second
->waiting_for_open
);
6020 if (mdsmap
->get_epoch()) {
6022 utime_t el
= now
- last_cap_renew
;
6023 if (el
> mdsmap
->get_session_timeout() / 3.0)
6026 flush_cap_releases();
6030 xlist
<Inode
*>::iterator p
= delayed_list
.begin();
6034 if (in
->hold_caps_until
> now
)
6036 delayed_list
.pop_front();
6037 check_caps(in
, CHECK_CAPS_NODELAY
);
6043 void Client::renew_caps()
6045 ldout(cct
, 10) << "renew_caps()" << dendl
;
6046 last_cap_renew
= ceph_clock_now();
6048 for (map
<mds_rank_t
,MetaSession
*>::iterator p
= mds_sessions
.begin();
6049 p
!= mds_sessions
.end();
6051 ldout(cct
, 15) << "renew_caps requesting from mds." << p
->first
<< dendl
;
6052 if (mdsmap
->get_state(p
->first
) >= MDSMap::STATE_REJOIN
)
6053 renew_caps(p
->second
);
6057 void Client::renew_caps(MetaSession
*session
)
6059 ldout(cct
, 10) << "renew_caps mds." << session
->mds_num
<< dendl
;
6060 session
->last_cap_renew_request
= ceph_clock_now();
6061 uint64_t seq
= ++session
->cap_renew_seq
;
6062 session
->con
->send_message(new MClientSession(CEPH_SESSION_REQUEST_RENEWCAPS
, seq
));
6066 // ===============================================================
6067 // high level (POSIXy) interface
6069 int Client::_do_lookup(Inode
*dir
, const string
& name
, int mask
,
6070 InodeRef
*target
, const UserPerm
& perms
)
6072 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_LOOKUPSNAP
: CEPH_MDS_OP_LOOKUP
;
6073 MetaRequest
*req
= new MetaRequest(op
);
6075 dir
->make_nosnap_relative_path(path
);
6076 path
.push_dentry(name
);
6077 req
->set_filepath(path
);
6078 req
->set_inode(dir
);
6079 if (cct
->_conf
->client_debug_getattr_caps
&& op
== CEPH_MDS_OP_LOOKUP
)
6080 mask
|= DEBUG_GETATTR_CAPS
;
6081 req
->head
.args
.getattr
.mask
= mask
;
6083 ldout(cct
, 10) << "_do_lookup on " << path
<< dendl
;
6085 int r
= make_request(req
, perms
, target
);
6086 ldout(cct
, 10) << "_do_lookup res is " << r
<< dendl
;
6090 int Client::_lookup(Inode
*dir
, const string
& dname
, int mask
, InodeRef
*target
,
6091 const UserPerm
& perms
)
6096 if (!dir
->is_dir()) {
6101 if (dname
== "..") {
6102 if (dir
->dn_set
.empty())
6105 *target
= dir
->get_first_parent()->dir
->parent_inode
; //dirs can't be hard-linked
6114 if (dname
.length() > NAME_MAX
) {
6119 if (dname
== cct
->_conf
->client_snapdir
&&
6120 dir
->snapid
== CEPH_NOSNAP
) {
6121 *target
= open_snapdir(dir
);
6126 dir
->dir
->dentries
.count(dname
)) {
6127 dn
= dir
->dir
->dentries
[dname
];
6129 ldout(cct
, 20) << "_lookup have dn " << dname
<< " mds." << dn
->lease_mds
<< " ttl " << dn
->lease_ttl
6130 << " seq " << dn
->lease_seq
6133 if (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)) {
6134 // is dn lease valid?
6135 utime_t now
= ceph_clock_now();
6136 if (dn
->lease_mds
>= 0 &&
6137 dn
->lease_ttl
> now
&&
6138 mds_sessions
.count(dn
->lease_mds
)) {
6139 MetaSession
*s
= mds_sessions
[dn
->lease_mds
];
6140 if (s
->cap_ttl
> now
&&
6141 s
->cap_gen
== dn
->lease_gen
) {
6142 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6143 // make trim_caps() behave.
6144 dir
->try_touch_cap(dn
->lease_mds
);
6147 ldout(cct
, 20) << " bad lease, cap_ttl " << s
->cap_ttl
<< ", cap_gen " << s
->cap_gen
6148 << " vs lease_gen " << dn
->lease_gen
<< dendl
;
6151 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
6152 if (dn
->cap_shared_gen
== dir
->shared_gen
&&
6153 (!dn
->inode
|| dn
->inode
->caps_issued_mask(mask
, true)))
6155 if (!dn
->inode
&& (dir
->flags
& I_COMPLETE
)) {
6156 ldout(cct
, 10) << "_lookup concluded ENOENT locally for "
6157 << *dir
<< " dn '" << dname
<< "'" << dendl
;
6162 ldout(cct
, 20) << " no cap on " << dn
->inode
->vino() << dendl
;
6165 // can we conclude ENOENT locally?
6166 if (dir
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true) &&
6167 (dir
->flags
& I_COMPLETE
)) {
6168 ldout(cct
, 10) << "_lookup concluded ENOENT locally for " << *dir
<< " dn '" << dname
<< "'" << dendl
;
6173 r
= _do_lookup(dir
, dname
, mask
, target
, perms
);
6178 *target
= dn
->inode
;
6186 ldout(cct
, 10) << "_lookup " << *dir
<< " " << dname
<< " = " << r
<< dendl
;
6188 ldout(cct
, 10) << "_lookup " << *dir
<< " " << dname
<< " = " << **target
<< dendl
;
6192 int Client::get_or_create(Inode
*dir
, const char* name
,
6193 Dentry
**pdn
, bool expect_null
)
6196 ldout(cct
, 20) << "get_or_create " << *dir
<< " name " << name
<< dendl
;
6198 if (dir
->dir
->dentries
.count(name
)) {
6199 Dentry
*dn
= dir
->dir
->dentries
[name
];
6201 // is dn lease valid?
6202 utime_t now
= ceph_clock_now();
6204 dn
->lease_mds
>= 0 &&
6205 dn
->lease_ttl
> now
&&
6206 mds_sessions
.count(dn
->lease_mds
)) {
6207 MetaSession
*s
= mds_sessions
[dn
->lease_mds
];
6208 if (s
->cap_ttl
> now
&&
6209 s
->cap_gen
== dn
->lease_gen
) {
6216 // otherwise link up a new one
6217 *pdn
= link(dir
->dir
, name
, NULL
, NULL
);
6224 int Client::path_walk(const filepath
& origpath
, InodeRef
*end
,
6225 const UserPerm
& perms
, bool followsym
, int mask
)
6227 filepath path
= origpath
;
6229 if (origpath
.absolute())
6235 ldout(cct
, 10) << "path_walk " << path
<< dendl
;
6240 while (i
< path
.depth() && cur
) {
6242 const string
&dname
= path
[i
];
6243 ldout(cct
, 10) << " " << i
<< " " << *cur
<< " " << dname
<< dendl
;
6244 ldout(cct
, 20) << " (path is " << path
<< ")" << dendl
;
6246 if (cct
->_conf
->client_permissions
) {
6247 int r
= may_lookup(cur
.get(), perms
);
6250 caps
= CEPH_CAP_AUTH_SHARED
;
6253 /* Get extra requested caps on the last component */
6254 if (i
== (path
.depth() - 1))
6256 int r
= _lookup(cur
.get(), dname
, caps
, &next
, perms
);
6259 // only follow trailing symlink if followsym. always follow
6260 // 'directory' symlinks.
6261 if (next
&& next
->is_symlink()) {
6263 ldout(cct
, 20) << " symlink count " << symlinks
<< ", value is '" << next
->symlink
<< "'" << dendl
;
6264 if (symlinks
> MAXSYMLINKS
) {
6268 if (i
< path
.depth() - 1) {
6270 // replace consumed components of path with symlink dir target
6271 filepath
resolved(next
->symlink
.c_str());
6272 resolved
.append(path
.postfixpath(i
+ 1));
6275 if (next
->symlink
[0] == '/') {
6279 } else if (followsym
) {
6280 if (next
->symlink
[0] == '/') {
6281 path
= next
->symlink
.c_str();
6286 filepath
more(next
->symlink
.c_str());
6287 // we need to remove the symlink component from off of the path
6288 // before adding the target that the symlink points to. remain
6289 // at the same position in the path.
6309 int Client::link(const char *relexisting
, const char *relpath
, const UserPerm
& perm
)
6311 Mutex::Locker
lock(client_lock
);
6312 tout(cct
) << "link" << std::endl
;
6313 tout(cct
) << relexisting
<< std::endl
;
6314 tout(cct
) << relpath
<< std::endl
;
6319 filepath
existing(relexisting
);
6322 int r
= path_walk(existing
, &in
, perm
, true);
6325 if (std::string(relpath
) == "/") {
6329 filepath
path(relpath
);
6330 string name
= path
.last_dentry();
6333 r
= path_walk(path
, &dir
, perm
, true);
6336 if (cct
->_conf
->client_permissions
) {
6337 if (S_ISDIR(in
->mode
)) {
6341 r
= may_hardlink(in
.get(), perm
);
6344 r
= may_create(dir
.get(), perm
);
6348 r
= _link(in
.get(), dir
.get(), name
.c_str(), perm
);
6352 int Client::unlink(const char *relpath
, const UserPerm
& perm
)
6354 Mutex::Locker
lock(client_lock
);
6355 tout(cct
) << "unlink" << std::endl
;
6356 tout(cct
) << relpath
<< std::endl
;
6361 if (std::string(relpath
) == "/")
6364 filepath
path(relpath
);
6365 string name
= path
.last_dentry();
6368 int r
= path_walk(path
, &dir
, perm
);
6371 if (cct
->_conf
->client_permissions
) {
6372 r
= may_delete(dir
.get(), name
.c_str(), perm
);
6376 return _unlink(dir
.get(), name
.c_str(), perm
);
6379 int Client::rename(const char *relfrom
, const char *relto
, const UserPerm
& perm
)
6381 Mutex::Locker
lock(client_lock
);
6382 tout(cct
) << "rename" << std::endl
;
6383 tout(cct
) << relfrom
<< std::endl
;
6384 tout(cct
) << relto
<< std::endl
;
6389 if (std::string(relfrom
) == "/" || std::string(relto
) == "/")
6392 filepath
from(relfrom
);
6394 string fromname
= from
.last_dentry();
6396 string toname
= to
.last_dentry();
6399 InodeRef fromdir
, todir
;
6400 int r
= path_walk(from
, &fromdir
, perm
);
6403 r
= path_walk(to
, &todir
, perm
);
6407 if (cct
->_conf
->client_permissions
) {
6408 int r
= may_delete(fromdir
.get(), fromname
.c_str(), perm
);
6411 r
= may_delete(todir
.get(), toname
.c_str(), perm
);
6412 if (r
< 0 && r
!= -ENOENT
)
6415 r
= _rename(fromdir
.get(), fromname
.c_str(), todir
.get(), toname
.c_str(), perm
);
6422 int Client::mkdir(const char *relpath
, mode_t mode
, const UserPerm
& perm
)
6424 Mutex::Locker
lock(client_lock
);
6425 tout(cct
) << "mkdir" << std::endl
;
6426 tout(cct
) << relpath
<< std::endl
;
6427 tout(cct
) << mode
<< std::endl
;
6428 ldout(cct
, 10) << "mkdir: " << relpath
<< dendl
;
6433 if (std::string(relpath
) == "/")
6436 filepath
path(relpath
);
6437 string name
= path
.last_dentry();
6440 int r
= path_walk(path
, &dir
, perm
);
6443 if (cct
->_conf
->client_permissions
) {
6444 r
= may_create(dir
.get(), perm
);
6448 return _mkdir(dir
.get(), name
.c_str(), mode
, perm
);
6451 int Client::mkdirs(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
6453 Mutex::Locker
lock(client_lock
);
6454 ldout(cct
, 10) << "Client::mkdirs " << relpath
<< dendl
;
6455 tout(cct
) << "mkdirs" << std::endl
;
6456 tout(cct
) << relpath
<< std::endl
;
6457 tout(cct
) << mode
<< std::endl
;
6462 //get through existing parts of path
6463 filepath
path(relpath
);
6465 int r
= 0, caps
= 0;
6468 for (i
=0; i
<path
.depth(); ++i
) {
6469 if (cct
->_conf
->client_permissions
) {
6470 r
= may_lookup(cur
.get(), perms
);
6473 caps
= CEPH_CAP_AUTH_SHARED
;
6475 r
= _lookup(cur
.get(), path
[i
].c_str(), caps
, &next
, perms
);
6480 //check that we have work left to do
6481 if (i
==path
.depth()) return -EEXIST
;
6482 if (r
!=-ENOENT
) return r
;
6483 ldout(cct
, 20) << "mkdirs got through " << i
<< " directories on path " << relpath
<< dendl
;
6484 //make new directory at each level
6485 for (; i
<path
.depth(); ++i
) {
6486 if (cct
->_conf
->client_permissions
) {
6487 r
= may_create(cur
.get(), perms
);
6492 r
= _mkdir(cur
.get(), path
[i
].c_str(), mode
, perms
, &next
);
6494 //check proper creation/existence
6495 if(-EEXIST
== r
&& i
< path
.depth() - 1) {
6496 r
= _lookup(cur
.get(), path
[i
].c_str(), CEPH_CAP_AUTH_SHARED
, &next
, perms
);
6500 //move to new dir and continue
6502 ldout(cct
, 20) << "mkdirs: successfully created directory "
6503 << filepath(cur
->ino
).get_path() << dendl
;
6508 int Client::rmdir(const char *relpath
, const UserPerm
& perms
)
6510 Mutex::Locker
lock(client_lock
);
6511 tout(cct
) << "rmdir" << std::endl
;
6512 tout(cct
) << relpath
<< std::endl
;
6517 if (std::string(relpath
) == "/")
6520 filepath
path(relpath
);
6521 string name
= path
.last_dentry();
6524 int r
= path_walk(path
, &dir
, perms
);
6527 if (cct
->_conf
->client_permissions
) {
6528 int r
= may_delete(dir
.get(), name
.c_str(), perms
);
6532 return _rmdir(dir
.get(), name
.c_str(), perms
);
6535 int Client::mknod(const char *relpath
, mode_t mode
, const UserPerm
& perms
, dev_t rdev
)
6537 Mutex::Locker
lock(client_lock
);
6538 tout(cct
) << "mknod" << std::endl
;
6539 tout(cct
) << relpath
<< std::endl
;
6540 tout(cct
) << mode
<< std::endl
;
6541 tout(cct
) << rdev
<< std::endl
;
6546 if (std::string(relpath
) == "/")
6549 filepath
path(relpath
);
6550 string name
= path
.last_dentry();
6553 int r
= path_walk(path
, &dir
, perms
);
6556 if (cct
->_conf
->client_permissions
) {
6557 int r
= may_create(dir
.get(), perms
);
6561 return _mknod(dir
.get(), name
.c_str(), mode
, rdev
, perms
);
6566 int Client::symlink(const char *target
, const char *relpath
, const UserPerm
& perms
)
6568 Mutex::Locker
lock(client_lock
);
6569 tout(cct
) << "symlink" << std::endl
;
6570 tout(cct
) << target
<< std::endl
;
6571 tout(cct
) << relpath
<< std::endl
;
6576 if (std::string(relpath
) == "/")
6579 filepath
path(relpath
);
6580 string name
= path
.last_dentry();
6583 int r
= path_walk(path
, &dir
, perms
);
6586 if (cct
->_conf
->client_permissions
) {
6587 int r
= may_create(dir
.get(), perms
);
6591 return _symlink(dir
.get(), name
.c_str(), target
, perms
);
6594 int Client::readlink(const char *relpath
, char *buf
, loff_t size
, const UserPerm
& perms
)
6596 Mutex::Locker
lock(client_lock
);
6597 tout(cct
) << "readlink" << std::endl
;
6598 tout(cct
) << relpath
<< std::endl
;
6603 filepath
path(relpath
);
6605 int r
= path_walk(path
, &in
, perms
, false);
6609 return _readlink(in
.get(), buf
, size
);
6612 int Client::_readlink(Inode
*in
, char *buf
, size_t size
)
6614 if (!in
->is_symlink())
6617 // copy into buf (at most size bytes)
6618 int r
= in
->symlink
.length();
6621 memcpy(buf
, in
->symlink
.c_str(), r
);
6628 int Client::_getattr(Inode
*in
, int mask
, const UserPerm
& perms
, bool force
)
6630 bool yes
= in
->caps_issued_mask(mask
, true);
6632 ldout(cct
, 10) << "_getattr mask " << ccap_string(mask
) << " issued=" << yes
<< dendl
;
6636 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_GETATTR
);
6638 in
->make_nosnap_relative_path(path
);
6639 req
->set_filepath(path
);
6641 req
->head
.args
.getattr
.mask
= mask
;
6643 int res
= make_request(req
, perms
);
6644 ldout(cct
, 10) << "_getattr result=" << res
<< dendl
;
6648 int Client::_do_setattr(Inode
*in
, struct ceph_statx
*stx
, int mask
,
6649 const UserPerm
& perms
, InodeRef
*inp
)
6651 int issued
= in
->caps_issued();
6653 ldout(cct
, 10) << "_setattr mask " << mask
<< " issued " <<
6654 ccap_string(issued
) << dendl
;
6656 if (in
->snapid
!= CEPH_NOSNAP
) {
6659 if ((mask
& CEPH_SETATTR_SIZE
) &&
6660 (unsigned long)stx
->stx_size
> in
->size
&&
6661 is_quota_bytes_exceeded(in
, (unsigned long)stx
->stx_size
- in
->size
,
6666 // make the change locally?
6667 if ((in
->cap_dirtier_uid
>= 0 && perms
.uid() != in
->cap_dirtier_uid
) ||
6668 (in
->cap_dirtier_gid
>= 0 && perms
.gid() != in
->cap_dirtier_gid
)) {
6669 ldout(cct
, 10) << __func__
<< " caller " << perms
.uid() << ":" << perms
.gid()
6670 << " != cap dirtier " << in
->cap_dirtier_uid
<< ":"
6671 << in
->cap_dirtier_gid
<< ", forcing sync setattr"
6674 * This works because we implicitly flush the caps as part of the
6675 * request, so the cap update check will happen with the writeback
6676 * cap context, and then the setattr check will happen with the
6679 * In reality this pattern is likely pretty rare (different users
6680 * setattr'ing the same file). If that turns out not to be the
6681 * case later, we can build a more complex pipelined cap writeback
6685 mask
|= CEPH_SETATTR_CTIME
;
6690 // caller just needs us to bump the ctime
6691 in
->ctime
= ceph_clock_now();
6692 in
->cap_dirtier_uid
= perms
.uid();
6693 in
->cap_dirtier_gid
= perms
.gid();
6694 if (issued
& CEPH_CAP_AUTH_EXCL
)
6695 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6696 else if (issued
& CEPH_CAP_FILE_EXCL
)
6697 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
6698 else if (issued
& CEPH_CAP_XATTR_EXCL
)
6699 in
->mark_caps_dirty(CEPH_CAP_XATTR_EXCL
);
6701 mask
|= CEPH_SETATTR_CTIME
;
6704 if (in
->caps_issued_mask(CEPH_CAP_AUTH_EXCL
)) {
6705 bool kill_sguid
= mask
& (CEPH_SETATTR_SIZE
|CEPH_SETATTR_KILL_SGUID
);
6707 mask
&= ~CEPH_SETATTR_KILL_SGUID
;
6709 if (mask
& CEPH_SETATTR_UID
) {
6710 in
->ctime
= ceph_clock_now();
6711 in
->cap_dirtier_uid
= perms
.uid();
6712 in
->cap_dirtier_gid
= perms
.gid();
6713 in
->uid
= stx
->stx_uid
;
6714 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6715 mask
&= ~CEPH_SETATTR_UID
;
6717 ldout(cct
,10) << "changing uid to " << stx
->stx_uid
<< dendl
;
6719 if (mask
& CEPH_SETATTR_GID
) {
6720 in
->ctime
= ceph_clock_now();
6721 in
->cap_dirtier_uid
= perms
.uid();
6722 in
->cap_dirtier_gid
= perms
.gid();
6723 in
->gid
= stx
->stx_gid
;
6724 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6725 mask
&= ~CEPH_SETATTR_GID
;
6727 ldout(cct
,10) << "changing gid to " << stx
->stx_gid
<< dendl
;
6730 if (mask
& CEPH_SETATTR_MODE
) {
6731 in
->ctime
= ceph_clock_now();
6732 in
->cap_dirtier_uid
= perms
.uid();
6733 in
->cap_dirtier_gid
= perms
.gid();
6734 in
->mode
= (in
->mode
& ~07777) | (stx
->stx_mode
& 07777);
6735 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6736 mask
&= ~CEPH_SETATTR_MODE
;
6737 ldout(cct
,10) << "changing mode to " << stx
->stx_mode
<< dendl
;
6738 } else if (kill_sguid
&& S_ISREG(in
->mode
) && (in
->mode
& (S_IXUSR
|S_IXGRP
|S_IXOTH
))) {
6739 /* Must squash the any setuid/setgid bits with an ownership change */
6740 in
->mode
&= ~(S_ISUID
|S_ISGID
);
6741 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6744 if (mask
& CEPH_SETATTR_BTIME
) {
6745 in
->ctime
= ceph_clock_now();
6746 in
->cap_dirtier_uid
= perms
.uid();
6747 in
->cap_dirtier_gid
= perms
.gid();
6748 in
->btime
= utime_t(stx
->stx_btime
);
6749 in
->mark_caps_dirty(CEPH_CAP_AUTH_EXCL
);
6750 mask
&= ~CEPH_SETATTR_BTIME
;
6751 ldout(cct
,10) << "changing btime to " << in
->btime
<< dendl
;
6753 } else if (mask
& CEPH_SETATTR_SIZE
) {
6754 /* If we don't have Ax, then we must ask the server to clear them on truncate */
6755 mask
|= CEPH_SETATTR_KILL_SGUID
;
6758 if (in
->caps_issued_mask(CEPH_CAP_FILE_EXCL
)) {
6759 if (mask
& (CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
)) {
6760 if (mask
& CEPH_SETATTR_MTIME
)
6761 in
->mtime
= utime_t(stx
->stx_mtime
);
6762 if (mask
& CEPH_SETATTR_ATIME
)
6763 in
->atime
= utime_t(stx
->stx_atime
);
6764 in
->ctime
= ceph_clock_now();
6765 in
->cap_dirtier_uid
= perms
.uid();
6766 in
->cap_dirtier_gid
= perms
.gid();
6767 in
->time_warp_seq
++;
6768 in
->mark_caps_dirty(CEPH_CAP_FILE_EXCL
);
6769 mask
&= ~(CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
);
6778 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETATTR
);
6782 in
->make_nosnap_relative_path(path
);
6783 req
->set_filepath(path
);
6786 if (mask
& CEPH_SETATTR_KILL_SGUID
) {
6787 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
6789 if (mask
& CEPH_SETATTR_MODE
) {
6790 req
->head
.args
.setattr
.mode
= stx
->stx_mode
;
6791 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
6792 ldout(cct
,10) << "changing mode to " << stx
->stx_mode
<< dendl
;
6794 if (mask
& CEPH_SETATTR_UID
) {
6795 req
->head
.args
.setattr
.uid
= stx
->stx_uid
;
6796 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
6797 ldout(cct
,10) << "changing uid to " << stx
->stx_uid
<< dendl
;
6799 if (mask
& CEPH_SETATTR_GID
) {
6800 req
->head
.args
.setattr
.gid
= stx
->stx_gid
;
6801 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
6802 ldout(cct
,10) << "changing gid to " << stx
->stx_gid
<< dendl
;
6804 if (mask
& CEPH_SETATTR_BTIME
) {
6805 req
->head
.args
.setattr
.btime
= utime_t(stx
->stx_btime
);
6806 req
->inode_drop
|= CEPH_CAP_AUTH_SHARED
;
6808 if (mask
& CEPH_SETATTR_MTIME
) {
6809 req
->head
.args
.setattr
.mtime
= utime_t(stx
->stx_mtime
);
6810 req
->inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
6813 if (mask
& CEPH_SETATTR_ATIME
) {
6814 req
->head
.args
.setattr
.atime
= utime_t(stx
->stx_atime
);
6815 req
->inode_drop
|= CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_RD
|
6818 if (mask
& CEPH_SETATTR_SIZE
) {
6819 if ((unsigned long)stx
->stx_size
< mdsmap
->get_max_filesize()) {
6820 req
->head
.args
.setattr
.size
= stx
->stx_size
;
6821 ldout(cct
,10) << "changing size to " << stx
->stx_size
<< dendl
;
6824 ldout(cct
,10) << "unable to set size to " << stx
->stx_size
<< ". Too large!" << dendl
;
6827 req
->inode_drop
|= CEPH_CAP_FILE_SHARED
| CEPH_CAP_FILE_RD
|
6830 req
->head
.args
.setattr
.mask
= mask
;
6832 req
->regetattr_mask
= mask
;
6834 int res
= make_request(req
, perms
, inp
);
6835 ldout(cct
, 10) << "_setattr result=" << res
<< dendl
;
6839 /* Note that we only care about attrs that setattr cares about */
6840 void Client::stat_to_statx(struct stat
*st
, struct ceph_statx
*stx
)
6842 stx
->stx_size
= st
->st_size
;
6843 stx
->stx_mode
= st
->st_mode
;
6844 stx
->stx_uid
= st
->st_uid
;
6845 stx
->stx_gid
= st
->st_gid
;
6846 stx
->stx_mtime
= st
->st_mtim
;
6847 stx
->stx_atime
= st
->st_atim
;
6850 int Client::__setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
6851 const UserPerm
& perms
, InodeRef
*inp
)
6853 int ret
= _do_setattr(in
, stx
, mask
, perms
, inp
);
6856 if (mask
& CEPH_SETATTR_MODE
)
6857 ret
= _posix_acl_chmod(in
, stx
->stx_mode
, perms
);
6861 int Client::_setattrx(InodeRef
&in
, struct ceph_statx
*stx
, int mask
,
6862 const UserPerm
& perms
)
6864 mask
&= (CEPH_SETATTR_MODE
| CEPH_SETATTR_UID
|
6865 CEPH_SETATTR_GID
| CEPH_SETATTR_MTIME
|
6866 CEPH_SETATTR_ATIME
| CEPH_SETATTR_SIZE
|
6867 CEPH_SETATTR_CTIME
| CEPH_SETATTR_BTIME
);
6868 if (cct
->_conf
->client_permissions
) {
6869 int r
= may_setattr(in
.get(), stx
, mask
, perms
);
6873 return __setattrx(in
.get(), stx
, mask
, perms
);
6876 int Client::_setattr(InodeRef
&in
, struct stat
*attr
, int mask
,
6877 const UserPerm
& perms
)
6879 struct ceph_statx stx
;
6881 stat_to_statx(attr
, &stx
);
6882 mask
&= ~CEPH_SETATTR_BTIME
;
6884 if ((mask
& CEPH_SETATTR_UID
) && attr
->st_uid
== static_cast<uid_t
>(-1)) {
6885 mask
&= ~CEPH_SETATTR_UID
;
6887 if ((mask
& CEPH_SETATTR_GID
) && attr
->st_gid
== static_cast<uid_t
>(-1)) {
6888 mask
&= ~CEPH_SETATTR_GID
;
6891 return _setattrx(in
, &stx
, mask
, perms
);
6894 int Client::setattr(const char *relpath
, struct stat
*attr
, int mask
,
6895 const UserPerm
& perms
)
6897 Mutex::Locker
lock(client_lock
);
6898 tout(cct
) << "setattr" << std::endl
;
6899 tout(cct
) << relpath
<< std::endl
;
6900 tout(cct
) << mask
<< std::endl
;
6905 filepath
path(relpath
);
6907 int r
= path_walk(path
, &in
, perms
);
6910 return _setattr(in
, attr
, mask
, perms
);
6913 int Client::setattrx(const char *relpath
, struct ceph_statx
*stx
, int mask
,
6914 const UserPerm
& perms
, int flags
)
6916 Mutex::Locker
lock(client_lock
);
6917 tout(cct
) << "setattrx" << std::endl
;
6918 tout(cct
) << relpath
<< std::endl
;
6919 tout(cct
) << mask
<< std::endl
;
6924 filepath
path(relpath
);
6926 int r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
));
6929 return _setattrx(in
, stx
, mask
, perms
);
6932 int Client::fsetattr(int fd
, struct stat
*attr
, int mask
, const UserPerm
& perms
)
6934 Mutex::Locker
lock(client_lock
);
6935 tout(cct
) << "fsetattr" << std::endl
;
6936 tout(cct
) << fd
<< std::endl
;
6937 tout(cct
) << mask
<< std::endl
;
6942 Fh
*f
= get_filehandle(fd
);
6945 #if defined(__linux__) && defined(O_PATH)
6946 if (f
->flags
& O_PATH
)
6949 return _setattr(f
->inode
, attr
, mask
, perms
);
6952 int Client::fsetattrx(int fd
, struct ceph_statx
*stx
, int mask
, const UserPerm
& perms
)
6954 Mutex::Locker
lock(client_lock
);
6955 tout(cct
) << "fsetattr" << std::endl
;
6956 tout(cct
) << fd
<< std::endl
;
6957 tout(cct
) << mask
<< std::endl
;
6962 Fh
*f
= get_filehandle(fd
);
6965 #if defined(__linux__) && defined(O_PATH)
6966 if (f
->flags
& O_PATH
)
6969 return _setattrx(f
->inode
, stx
, mask
, perms
);
6972 int Client::stat(const char *relpath
, struct stat
*stbuf
, const UserPerm
& perms
,
6973 frag_info_t
*dirstat
, int mask
)
6975 ldout(cct
, 3) << "stat enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
6976 Mutex::Locker
lock(client_lock
);
6977 tout(cct
) << "stat" << std::endl
;
6978 tout(cct
) << relpath
<< std::endl
;
6983 filepath
path(relpath
);
6985 int r
= path_walk(path
, &in
, perms
, true, mask
);
6988 r
= _getattr(in
, mask
, perms
);
6990 ldout(cct
, 3) << "stat exit on error!" << dendl
;
6993 fill_stat(in
, stbuf
, dirstat
);
6994 ldout(cct
, 3) << "stat exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
6998 unsigned Client::statx_to_mask(unsigned int flags
, unsigned int want
)
7002 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7003 if (flags
& AT_NO_ATTR_SYNC
)
7006 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7007 mask
|= CEPH_CAP_PIN
;
7008 if (want
& (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
7009 mask
|= CEPH_CAP_AUTH_SHARED
;
7010 if (want
& (CEPH_STATX_NLINK
|CEPH_STATX_CTIME
|CEPH_STATX_VERSION
))
7011 mask
|= CEPH_CAP_LINK_SHARED
;
7012 if (want
& (CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|CEPH_STATX_CTIME
|CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
|CEPH_STATX_VERSION
))
7013 mask
|= CEPH_CAP_FILE_SHARED
;
7014 if (want
& (CEPH_STATX_VERSION
|CEPH_STATX_CTIME
))
7015 mask
|= CEPH_CAP_XATTR_SHARED
;
7020 int Client::statx(const char *relpath
, struct ceph_statx
*stx
,
7021 const UserPerm
& perms
,
7022 unsigned int want
, unsigned int flags
)
7024 ldout(cct
, 3) << "statx enter (relpath " << relpath
<< " want " << want
<< ")" << dendl
;
7025 Mutex::Locker
lock(client_lock
);
7026 tout(cct
) << "statx" << std::endl
;
7027 tout(cct
) << relpath
<< std::endl
;
7032 filepath
path(relpath
);
7035 unsigned mask
= statx_to_mask(flags
, want
);
7037 int r
= path_walk(path
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
);
7041 r
= _getattr(in
, mask
, perms
);
7043 ldout(cct
, 3) << "statx exit on error!" << dendl
;
7047 fill_statx(in
, mask
, stx
);
7048 ldout(cct
, 3) << "statx exit (relpath " << relpath
<< " mask " << stx
->stx_mask
<< ")" << dendl
;
7052 int Client::lstat(const char *relpath
, struct stat
*stbuf
,
7053 const UserPerm
& perms
, frag_info_t
*dirstat
, int mask
)
7055 ldout(cct
, 3) << "lstat enter (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7056 Mutex::Locker
lock(client_lock
);
7057 tout(cct
) << "lstat" << std::endl
;
7058 tout(cct
) << relpath
<< std::endl
;
7063 filepath
path(relpath
);
7065 // don't follow symlinks
7066 int r
= path_walk(path
, &in
, perms
, false, mask
);
7069 r
= _getattr(in
, mask
, perms
);
7071 ldout(cct
, 3) << "lstat exit on error!" << dendl
;
7074 fill_stat(in
, stbuf
, dirstat
);
7075 ldout(cct
, 3) << "lstat exit (relpath " << relpath
<< " mask " << mask
<< ")" << dendl
;
7079 int Client::fill_stat(Inode
*in
, struct stat
*st
, frag_info_t
*dirstat
, nest_info_t
*rstat
)
7081 ldout(cct
, 10) << "fill_stat on " << in
->ino
<< " snap/dev" << in
->snapid
7082 << " mode 0" << oct
<< in
->mode
<< dec
7083 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
7084 memset(st
, 0, sizeof(struct stat
));
7085 if (use_faked_inos())
7086 st
->st_ino
= in
->faked_ino
;
7088 st
->st_ino
= in
->ino
;
7089 st
->st_dev
= in
->snapid
;
7090 st
->st_mode
= in
->mode
;
7091 st
->st_rdev
= in
->rdev
;
7093 switch (in
->nlink
) {
7095 st
->st_nlink
= 0; /* dir is unlinked */
7098 st
->st_nlink
= 1 /* parent dentry */
7100 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
7106 st
->st_nlink
= in
->nlink
;
7108 st
->st_uid
= in
->uid
;
7109 st
->st_gid
= in
->gid
;
7110 if (in
->ctime
> in
->mtime
) {
7111 stat_set_ctime_sec(st
, in
->ctime
.sec());
7112 stat_set_ctime_nsec(st
, in
->ctime
.nsec());
7114 stat_set_ctime_sec(st
, in
->mtime
.sec());
7115 stat_set_ctime_nsec(st
, in
->mtime
.nsec());
7117 stat_set_atime_sec(st
, in
->atime
.sec());
7118 stat_set_atime_nsec(st
, in
->atime
.nsec());
7119 stat_set_mtime_sec(st
, in
->mtime
.sec());
7120 stat_set_mtime_nsec(st
, in
->mtime
.nsec());
7122 if (cct
->_conf
->client_dirsize_rbytes
)
7123 st
->st_size
= in
->rstat
.rbytes
;
7125 st
->st_size
= in
->dirstat
.size();
7128 st
->st_size
= in
->size
;
7129 st
->st_blocks
= (in
->size
+ 511) >> 9;
7131 st
->st_blksize
= MAX(in
->layout
.stripe_unit
, 4096);
7134 *dirstat
= in
->dirstat
;
7138 return in
->caps_issued();
7141 void Client::fill_statx(Inode
*in
, unsigned int mask
, struct ceph_statx
*stx
)
7143 ldout(cct
, 10) << "fill_statx on " << in
->ino
<< " snap/dev" << in
->snapid
7144 << " mode 0" << oct
<< in
->mode
<< dec
7145 << " mtime " << in
->mtime
<< " ctime " << in
->ctime
<< dendl
;
7146 memset(stx
, 0, sizeof(struct ceph_statx
));
7149 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7150 * so that all bits are set.
7155 /* These are always considered to be available */
7156 stx
->stx_dev
= in
->snapid
;
7157 stx
->stx_blksize
= MAX(in
->layout
.stripe_unit
, 4096);
7159 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7160 stx
->stx_mode
= S_IFMT
& in
->mode
;
7161 stx
->stx_ino
= use_faked_inos() ? in
->faked_ino
: (ino_t
)in
->ino
;
7162 stx
->stx_rdev
= in
->rdev
;
7163 stx
->stx_mask
|= (CEPH_STATX_INO
|CEPH_STATX_RDEV
);
7165 if (mask
& CEPH_CAP_AUTH_SHARED
) {
7166 stx
->stx_uid
= in
->uid
;
7167 stx
->stx_gid
= in
->gid
;
7168 stx
->stx_mode
= in
->mode
;
7169 in
->btime
.to_timespec(&stx
->stx_btime
);
7170 stx
->stx_mask
|= (CEPH_STATX_MODE
|CEPH_STATX_UID
|CEPH_STATX_GID
|CEPH_STATX_BTIME
);
7173 if (mask
& CEPH_CAP_LINK_SHARED
) {
7175 switch (in
->nlink
) {
7177 stx
->stx_nlink
= 0; /* dir is unlinked */
7180 stx
->stx_nlink
= 1 /* parent dentry */
7182 + in
->dirstat
.nsubdirs
; /* include <dir>/. self-reference */
7188 stx
->stx_nlink
= in
->nlink
;
7190 stx
->stx_mask
|= CEPH_STATX_NLINK
;
7193 if (mask
& CEPH_CAP_FILE_SHARED
) {
7195 in
->atime
.to_timespec(&stx
->stx_atime
);
7196 in
->mtime
.to_timespec(&stx
->stx_mtime
);
7199 if (cct
->_conf
->client_dirsize_rbytes
)
7200 stx
->stx_size
= in
->rstat
.rbytes
;
7202 stx
->stx_size
= in
->dirstat
.size();
7203 stx
->stx_blocks
= 1;
7205 stx
->stx_size
= in
->size
;
7206 stx
->stx_blocks
= (in
->size
+ 511) >> 9;
7208 stx
->stx_mask
|= (CEPH_STATX_ATIME
|CEPH_STATX_MTIME
|
7209 CEPH_STATX_SIZE
|CEPH_STATX_BLOCKS
);
7212 /* Change time and change_attr both require all shared caps to view */
7213 if ((mask
& CEPH_STAT_CAP_INODE_ALL
) == CEPH_STAT_CAP_INODE_ALL
) {
7214 stx
->stx_version
= in
->change_attr
;
7215 if (in
->ctime
> in
->mtime
)
7216 in
->ctime
.to_timespec(&stx
->stx_ctime
);
7218 in
->mtime
.to_timespec(&stx
->stx_ctime
);
7219 stx
->stx_mask
|= (CEPH_STATX_CTIME
|CEPH_STATX_VERSION
);
7224 void Client::touch_dn(Dentry
*dn
)
7229 int Client::chmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
7231 Mutex::Locker
lock(client_lock
);
7232 tout(cct
) << "chmod" << std::endl
;
7233 tout(cct
) << relpath
<< std::endl
;
7234 tout(cct
) << mode
<< std::endl
;
7239 filepath
path(relpath
);
7241 int r
= path_walk(path
, &in
, perms
);
7245 attr
.st_mode
= mode
;
7246 return _setattr(in
, &attr
, CEPH_SETATTR_MODE
, perms
);
7249 int Client::fchmod(int fd
, mode_t mode
, const UserPerm
& perms
)
7251 Mutex::Locker
lock(client_lock
);
7252 tout(cct
) << "fchmod" << std::endl
;
7253 tout(cct
) << fd
<< std::endl
;
7254 tout(cct
) << mode
<< std::endl
;
7259 Fh
*f
= get_filehandle(fd
);
7262 #if defined(__linux__) && defined(O_PATH)
7263 if (f
->flags
& O_PATH
)
7267 attr
.st_mode
= mode
;
7268 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_MODE
, perms
);
7271 int Client::lchmod(const char *relpath
, mode_t mode
, const UserPerm
& perms
)
7273 Mutex::Locker
lock(client_lock
);
7274 tout(cct
) << "lchmod" << std::endl
;
7275 tout(cct
) << relpath
<< std::endl
;
7276 tout(cct
) << mode
<< std::endl
;
7281 filepath
path(relpath
);
7283 // don't follow symlinks
7284 int r
= path_walk(path
, &in
, perms
, false);
7288 attr
.st_mode
= mode
;
7289 return _setattr(in
, &attr
, CEPH_SETATTR_MODE
, perms
);
7292 int Client::chown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
7293 const UserPerm
& perms
)
7295 Mutex::Locker
lock(client_lock
);
7296 tout(cct
) << "chown" << std::endl
;
7297 tout(cct
) << relpath
<< std::endl
;
7298 tout(cct
) << new_uid
<< std::endl
;
7299 tout(cct
) << new_gid
<< std::endl
;
7304 filepath
path(relpath
);
7306 int r
= path_walk(path
, &in
, perms
);
7310 attr
.st_uid
= new_uid
;
7311 attr
.st_gid
= new_gid
;
7312 return _setattr(in
, &attr
, CEPH_SETATTR_UID
|CEPH_SETATTR_GID
, perms
);
7315 int Client::fchown(int fd
, uid_t new_uid
, gid_t new_gid
, const UserPerm
& perms
)
7317 Mutex::Locker
lock(client_lock
);
7318 tout(cct
) << "fchown" << std::endl
;
7319 tout(cct
) << fd
<< std::endl
;
7320 tout(cct
) << new_uid
<< std::endl
;
7321 tout(cct
) << new_gid
<< std::endl
;
7326 Fh
*f
= get_filehandle(fd
);
7329 #if defined(__linux__) && defined(O_PATH)
7330 if (f
->flags
& O_PATH
)
7334 attr
.st_uid
= new_uid
;
7335 attr
.st_gid
= new_gid
;
7337 if (new_uid
!= static_cast<uid_t
>(-1)) mask
|= CEPH_SETATTR_UID
;
7338 if (new_gid
!= static_cast<gid_t
>(-1)) mask
|= CEPH_SETATTR_GID
;
7339 return _setattr(f
->inode
, &attr
, mask
, perms
);
7342 int Client::lchown(const char *relpath
, uid_t new_uid
, gid_t new_gid
,
7343 const UserPerm
& perms
)
7345 Mutex::Locker
lock(client_lock
);
7346 tout(cct
) << "lchown" << std::endl
;
7347 tout(cct
) << relpath
<< std::endl
;
7348 tout(cct
) << new_uid
<< std::endl
;
7349 tout(cct
) << new_gid
<< std::endl
;
7354 filepath
path(relpath
);
7356 // don't follow symlinks
7357 int r
= path_walk(path
, &in
, perms
, false);
7361 attr
.st_uid
= new_uid
;
7362 attr
.st_gid
= new_gid
;
7364 if (new_uid
!= static_cast<uid_t
>(-1)) mask
|= CEPH_SETATTR_UID
;
7365 if (new_gid
!= static_cast<gid_t
>(-1)) mask
|= CEPH_SETATTR_GID
;
7366 return _setattr(in
, &attr
, mask
, perms
);
7369 int Client::utime(const char *relpath
, struct utimbuf
*buf
,
7370 const UserPerm
& perms
)
7372 Mutex::Locker
lock(client_lock
);
7373 tout(cct
) << "utime" << std::endl
;
7374 tout(cct
) << relpath
<< std::endl
;
7375 tout(cct
) << buf
->modtime
<< std::endl
;
7376 tout(cct
) << buf
->actime
<< std::endl
;
7381 filepath
path(relpath
);
7383 int r
= path_walk(path
, &in
, perms
);
7387 stat_set_mtime_sec(&attr
, buf
->modtime
);
7388 stat_set_mtime_nsec(&attr
, 0);
7389 stat_set_atime_sec(&attr
, buf
->actime
);
7390 stat_set_atime_nsec(&attr
, 0);
7391 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
7394 int Client::lutime(const char *relpath
, struct utimbuf
*buf
,
7395 const UserPerm
& perms
)
7397 Mutex::Locker
lock(client_lock
);
7398 tout(cct
) << "lutime" << std::endl
;
7399 tout(cct
) << relpath
<< std::endl
;
7400 tout(cct
) << buf
->modtime
<< std::endl
;
7401 tout(cct
) << buf
->actime
<< std::endl
;
7406 filepath
path(relpath
);
7408 // don't follow symlinks
7409 int r
= path_walk(path
, &in
, perms
, false);
7413 stat_set_mtime_sec(&attr
, buf
->modtime
);
7414 stat_set_mtime_nsec(&attr
, 0);
7415 stat_set_atime_sec(&attr
, buf
->actime
);
7416 stat_set_atime_nsec(&attr
, 0);
7417 return _setattr(in
, &attr
, CEPH_SETATTR_MTIME
|CEPH_SETATTR_ATIME
, perms
);
7420 int Client::flock(int fd
, int operation
, uint64_t owner
)
7422 Mutex::Locker
lock(client_lock
);
7423 tout(cct
) << "flock" << std::endl
;
7424 tout(cct
) << fd
<< std::endl
;
7425 tout(cct
) << operation
<< std::endl
;
7426 tout(cct
) << owner
<< std::endl
;
7431 Fh
*f
= get_filehandle(fd
);
7435 return _flock(f
, operation
, owner
);
7438 int Client::opendir(const char *relpath
, dir_result_t
**dirpp
, const UserPerm
& perms
)
7440 Mutex::Locker
lock(client_lock
);
7441 tout(cct
) << "opendir" << std::endl
;
7442 tout(cct
) << relpath
<< std::endl
;
7447 filepath
path(relpath
);
7449 int r
= path_walk(path
, &in
, perms
, true);
7452 if (cct
->_conf
->client_permissions
) {
7453 int r
= may_open(in
.get(), O_RDONLY
, perms
);
7457 r
= _opendir(in
.get(), dirpp
, perms
);
7458 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7460 tout(cct
) << (unsigned long)*dirpp
<< std::endl
;
7464 int Client::_opendir(Inode
*in
, dir_result_t
**dirpp
, const UserPerm
& perms
)
7468 *dirpp
= new dir_result_t(in
, perms
);
7469 opened_dirs
.insert(*dirpp
);
7470 ldout(cct
, 8) << "_opendir(" << in
->ino
<< ") = " << 0 << " (" << *dirpp
<< ")" << dendl
;
7475 int Client::closedir(dir_result_t
*dir
)
7477 Mutex::Locker
lock(client_lock
);
7478 tout(cct
) << "closedir" << std::endl
;
7479 tout(cct
) << (unsigned long)dir
<< std::endl
;
7481 ldout(cct
, 3) << "closedir(" << dir
<< ") = 0" << dendl
;
7486 void Client::_closedir(dir_result_t
*dirp
)
7488 ldout(cct
, 10) << "_closedir(" << dirp
<< ")" << dendl
;
7490 ldout(cct
, 10) << "_closedir detaching inode " << dirp
->inode
<< dendl
;
7491 dirp
->inode
.reset();
7493 _readdir_drop_dirp_buffer(dirp
);
7494 opened_dirs
.erase(dirp
);
7498 void Client::rewinddir(dir_result_t
*dirp
)
7500 Mutex::Locker
lock(client_lock
);
7501 ldout(cct
, 3) << "rewinddir(" << dirp
<< ")" << dendl
;
7506 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
7507 _readdir_drop_dirp_buffer(d
);
7511 loff_t
Client::telldir(dir_result_t
*dirp
)
7513 dir_result_t
*d
= static_cast<dir_result_t
*>(dirp
);
7514 ldout(cct
, 3) << "telldir(" << dirp
<< ") = " << d
->offset
<< dendl
;
7518 void Client::seekdir(dir_result_t
*dirp
, loff_t offset
)
7520 Mutex::Locker
lock(client_lock
);
7522 ldout(cct
, 3) << "seekdir(" << dirp
<< ", " << offset
<< ")" << dendl
;
7527 if (offset
== dirp
->offset
)
7530 if (offset
> dirp
->offset
)
7531 dirp
->release_count
= 0; // bump if we do a forward seek
7533 dirp
->ordered_count
= 0; // disable filling readdir cache
7535 if (dirp
->hash_order()) {
7536 if (dirp
->offset
> offset
) {
7537 _readdir_drop_dirp_buffer(dirp
);
7542 dirp
->buffer_frag
!= frag_t(dir_result_t::fpos_high(offset
)) ||
7543 dirp
->offset_low() > dir_result_t::fpos_low(offset
)) {
7544 _readdir_drop_dirp_buffer(dirp
);
7549 dirp
->offset
= offset
;
7554 // ino_t d_ino; /* inode number */
7555 // off_t d_off; /* offset to the next dirent */
7556 // unsigned short d_reclen; /* length of this record */
7557 // unsigned char d_type; /* type of file */
7558 // char d_name[256]; /* filename */
7560 void Client::fill_dirent(struct dirent
*de
, const char *name
, int type
, uint64_t ino
, loff_t next_off
)
7562 strncpy(de
->d_name
, name
, 255);
7563 de
->d_name
[255] = '\0';
7566 #if !defined(DARWIN) && !defined(__FreeBSD__)
7567 de
->d_off
= next_off
;
7570 de
->d_type
= IFTODT(type
);
7571 ldout(cct
, 10) << "fill_dirent '" << de
->d_name
<< "' -> " << inodeno_t(de
->d_ino
)
7572 << " type " << (int)de
->d_type
<< " w/ next_off " << hex
<< next_off
<< dec
<< dendl
;
7576 void Client::_readdir_next_frag(dir_result_t
*dirp
)
7578 frag_t fg
= dirp
->buffer_frag
;
7580 if (fg
.is_rightmost()) {
7581 ldout(cct
, 10) << "_readdir_next_frag advance from " << fg
<< " to END" << dendl
;
7588 ldout(cct
, 10) << "_readdir_next_frag advance from " << dirp
->buffer_frag
<< " to " << fg
<< dendl
;
7590 if (dirp
->hash_order()) {
7592 int64_t new_offset
= dir_result_t::make_fpos(fg
.value(), 2, true);
7593 if (dirp
->offset
< new_offset
) // don't decrease offset
7594 dirp
->offset
= new_offset
;
7596 dirp
->last_name
.clear();
7597 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
7598 _readdir_rechoose_frag(dirp
);
7602 void Client::_readdir_rechoose_frag(dir_result_t
*dirp
)
7604 assert(dirp
->inode
);
7606 if (dirp
->hash_order())
7609 frag_t cur
= frag_t(dirp
->offset_high());
7610 frag_t fg
= dirp
->inode
->dirfragtree
[cur
.value()];
7612 ldout(cct
, 10) << "_readdir_rechoose_frag frag " << cur
<< " maps to " << fg
<< dendl
;
7613 dirp
->offset
= dir_result_t::make_fpos(fg
, 2, false);
7614 dirp
->last_name
.clear();
7615 dirp
->next_offset
= 2;
7619 void Client::_readdir_drop_dirp_buffer(dir_result_t
*dirp
)
7621 ldout(cct
, 10) << "_readdir_drop_dirp_buffer " << dirp
<< dendl
;
7622 dirp
->buffer
.clear();
7625 int Client::_readdir_get_frag(dir_result_t
*dirp
)
7628 assert(dirp
->inode
);
7630 // get the current frag.
7632 if (dirp
->hash_order())
7633 fg
= dirp
->inode
->dirfragtree
[dirp
->offset_high()];
7635 fg
= frag_t(dirp
->offset_high());
7637 ldout(cct
, 10) << "_readdir_get_frag " << dirp
<< " on " << dirp
->inode
->ino
<< " fg " << fg
7638 << " offset " << hex
<< dirp
->offset
<< dec
<< dendl
;
7640 int op
= CEPH_MDS_OP_READDIR
;
7641 if (dirp
->inode
&& dirp
->inode
->snapid
== CEPH_SNAPDIR
)
7642 op
= CEPH_MDS_OP_LSSNAP
;
7644 InodeRef
& diri
= dirp
->inode
;
7646 MetaRequest
*req
= new MetaRequest(op
);
7648 diri
->make_nosnap_relative_path(path
);
7649 req
->set_filepath(path
);
7650 req
->set_inode(diri
.get());
7651 req
->head
.args
.readdir
.frag
= fg
;
7652 req
->head
.args
.readdir
.flags
= CEPH_READDIR_REPLY_BITFLAGS
;
7653 if (dirp
->last_name
.length()) {
7654 req
->path2
.set_path(dirp
->last_name
);
7655 } else if (dirp
->hash_order()) {
7656 req
->head
.args
.readdir
.offset_hash
= dirp
->offset_high();
7661 int res
= make_request(req
, dirp
->perms
, NULL
, NULL
, -1, &dirbl
);
7663 if (res
== -EAGAIN
) {
7664 ldout(cct
, 10) << "_readdir_get_frag got EAGAIN, retrying" << dendl
;
7665 _readdir_rechoose_frag(dirp
);
7666 return _readdir_get_frag(dirp
);
7670 ldout(cct
, 10) << "_readdir_get_frag " << dirp
<< " got frag " << dirp
->buffer_frag
7671 << " size " << dirp
->buffer
.size() << dendl
;
7673 ldout(cct
, 10) << "_readdir_get_frag got error " << res
<< ", setting end flag" << dendl
;
7680 struct dentry_off_lt
{
7681 bool operator()(const Dentry
* dn
, int64_t off
) const {
7682 return dir_result_t::fpos_cmp(dn
->offset
, off
) < 0;
7686 int Client::_readdir_cache_cb(dir_result_t
*dirp
, add_dirent_cb_t cb
, void *p
,
7687 int caps
, bool getref
)
7689 assert(client_lock
.is_locked());
7690 ldout(cct
, 10) << "_readdir_cache_cb " << dirp
<< " on " << dirp
->inode
->ino
7691 << " last_name " << dirp
->last_name
<< " offset " << hex
<< dirp
->offset
<< dec
7693 Dir
*dir
= dirp
->inode
->dir
;
7696 ldout(cct
, 10) << " dir is empty" << dendl
;
7701 vector
<Dentry
*>::iterator pd
= std::lower_bound(dir
->readdir_cache
.begin(),
7702 dir
->readdir_cache
.end(),
7703 dirp
->offset
, dentry_off_lt());
7707 if (!dirp
->inode
->is_complete_and_ordered())
7709 if (pd
== dir
->readdir_cache
.end())
7712 if (dn
->inode
== NULL
) {
7713 ldout(cct
, 15) << " skipping null '" << dn
->name
<< "'" << dendl
;
7717 if (dn
->cap_shared_gen
!= dir
->parent_inode
->shared_gen
) {
7718 ldout(cct
, 15) << " skipping mismatch shared gen '" << dn
->name
<< "'" << dendl
;
7723 int r
= _getattr(dn
->inode
, caps
, dirp
->perms
);
7727 struct ceph_statx stx
;
7729 fill_statx(dn
->inode
, caps
, &stx
);
7731 uint64_t next_off
= dn
->offset
+ 1;
7733 if (pd
== dir
->readdir_cache
.end())
7734 next_off
= dir_result_t::END
;
7737 fill_dirent(&de
, dn
->name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
7739 in
= dn
->inode
.get();
7743 dn_name
= dn
->name
; // fill in name while we have lock
7745 client_lock
.Unlock();
7746 r
= cb(p
, &de
, &stx
, next_off
, in
); // _next_ offset
7748 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< dn
->offset
<< dec
7749 << " = " << r
<< dendl
;
7754 dirp
->offset
= next_off
;
7756 dirp
->next_offset
= 2;
7758 dirp
->next_offset
= dirp
->offset_low();
7759 dirp
->last_name
= dn_name
; // we successfully returned this one; update!
7760 dirp
->release_count
= 0; // last_name no longer match cache index
7765 ldout(cct
, 10) << "_readdir_cache_cb " << dirp
<< " on " << dirp
->inode
->ino
<< " at end" << dendl
;
7770 int Client::readdir_r_cb(dir_result_t
*d
, add_dirent_cb_t cb
, void *p
,
7771 unsigned want
, unsigned flags
, bool getref
)
7773 int caps
= statx_to_mask(flags
, want
);
7775 Mutex::Locker
lock(client_lock
);
7780 dir_result_t
*dirp
= static_cast<dir_result_t
*>(d
);
7782 ldout(cct
, 10) << "readdir_r_cb " << *dirp
->inode
<< " offset " << hex
<< dirp
->offset
7783 << dec
<< " at_end=" << dirp
->at_end()
7784 << " hash_order=" << dirp
->hash_order() << dendl
;
7787 struct ceph_statx stx
;
7788 memset(&de
, 0, sizeof(de
));
7789 memset(&stx
, 0, sizeof(stx
));
7791 InodeRef
& diri
= dirp
->inode
;
7796 if (dirp
->offset
== 0) {
7797 ldout(cct
, 15) << " including ." << dendl
;
7798 assert(diri
->dn_set
.size() < 2); // can't have multiple hard-links to a dir
7799 uint64_t next_off
= 1;
7802 r
= _getattr(diri
, caps
, dirp
->perms
);
7806 fill_statx(diri
, caps
, &stx
);
7807 fill_dirent(&de
, ".", S_IFDIR
, stx
.stx_ino
, next_off
);
7809 Inode
*inode
= NULL
;
7815 client_lock
.Unlock();
7816 r
= cb(p
, &de
, &stx
, next_off
, inode
);
7821 dirp
->offset
= next_off
;
7825 if (dirp
->offset
== 1) {
7826 ldout(cct
, 15) << " including .." << dendl
;
7827 uint64_t next_off
= 2;
7829 if (diri
->dn_set
.empty())
7832 in
= diri
->get_first_parent()->dir
->parent_inode
;
7835 r
= _getattr(in
, caps
, dirp
->perms
);
7839 fill_statx(in
, caps
, &stx
);
7840 fill_dirent(&de
, "..", S_IFDIR
, stx
.stx_ino
, next_off
);
7842 Inode
*inode
= NULL
;
7848 client_lock
.Unlock();
7849 r
= cb(p
, &de
, &stx
, next_off
, inode
);
7854 dirp
->offset
= next_off
;
7859 // can we read from our cache?
7860 ldout(cct
, 10) << "offset " << hex
<< dirp
->offset
<< dec
7861 << " snapid " << dirp
->inode
->snapid
<< " (complete && ordered) "
7862 << dirp
->inode
->is_complete_and_ordered()
7863 << " issued " << ccap_string(dirp
->inode
->caps_issued())
7865 if (dirp
->inode
->snapid
!= CEPH_SNAPDIR
&&
7866 dirp
->inode
->is_complete_and_ordered() &&
7867 dirp
->inode
->caps_issued_mask(CEPH_CAP_FILE_SHARED
, true)) {
7868 int err
= _readdir_cache_cb(dirp
, cb
, p
, caps
, getref
);
7877 bool check_caps
= true;
7878 if (!dirp
->is_cached()) {
7879 int r
= _readdir_get_frag(dirp
);
7882 // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
7883 // different than the requested one. (our dirfragtree was outdated)
7886 frag_t fg
= dirp
->buffer_frag
;
7888 ldout(cct
, 10) << "frag " << fg
<< " buffer size " << dirp
->buffer
.size()
7889 << " offset " << hex
<< dirp
->offset
<< dendl
;
7891 for (auto it
= std::lower_bound(dirp
->buffer
.begin(), dirp
->buffer
.end(),
7892 dirp
->offset
, dir_result_t::dentry_off_lt());
7893 it
!= dirp
->buffer
.end();
7895 dir_result_t::dentry
&entry
= *it
;
7897 uint64_t next_off
= entry
.offset
+ 1;
7901 r
= _getattr(entry
.inode
, caps
, dirp
->perms
);
7906 fill_statx(entry
.inode
, caps
, &stx
);
7907 fill_dirent(&de
, entry
.name
.c_str(), stx
.stx_mode
, stx
.stx_ino
, next_off
);
7909 Inode
*inode
= NULL
;
7911 inode
= entry
.inode
.get();
7915 client_lock
.Unlock();
7916 r
= cb(p
, &de
, &stx
, next_off
, inode
); // _next_ offset
7919 ldout(cct
, 15) << " de " << de
.d_name
<< " off " << hex
<< next_off
- 1 << dec
7920 << " = " << r
<< dendl
;
7924 dirp
->offset
= next_off
;
7929 if (dirp
->next_offset
> 2) {
7930 ldout(cct
, 10) << " fetching next chunk of this frag" << dendl
;
7931 _readdir_drop_dirp_buffer(dirp
);
7935 if (!fg
.is_rightmost()) {
7937 _readdir_next_frag(dirp
);
7941 if (diri
->shared_gen
== dirp
->start_shared_gen
&&
7942 diri
->dir_release_count
== dirp
->release_count
) {
7943 if (diri
->dir_ordered_count
== dirp
->ordered_count
) {
7944 ldout(cct
, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri
<< dendl
;
7946 assert(diri
->dir
->readdir_cache
.size() >= dirp
->cache_index
);
7947 diri
->dir
->readdir_cache
.resize(dirp
->cache_index
);
7949 diri
->flags
|= I_COMPLETE
| I_DIR_ORDERED
;
7951 ldout(cct
, 10) << " marking I_COMPLETE on " << *diri
<< dendl
;
7952 diri
->flags
|= I_COMPLETE
;
7964 int Client::readdir_r(dir_result_t
*d
, struct dirent
*de
)
7966 return readdirplus_r(d
, de
, 0, 0, 0, NULL
);
7973 * 1 if we got a dirent
7974 * 0 for end of directory
7978 struct single_readdir
{
7980 struct ceph_statx
*stx
;
7985 static int _readdir_single_dirent_cb(void *p
, struct dirent
*de
,
7986 struct ceph_statx
*stx
, off_t off
,
7989 single_readdir
*c
= static_cast<single_readdir
*>(p
);
7992 return -1; // already filled this dirent
8002 struct dirent
*Client::readdir(dir_result_t
*d
)
8005 static struct dirent de
;
8012 // our callback fills the dirent and sets sr.full=true on first
8013 // call, and returns -1 the second time around.
8014 ret
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
);
8016 errno
= -ret
; // this sucks.
8017 return (dirent
*) NULL
;
8022 return (dirent
*) NULL
;
8025 int Client::readdirplus_r(dir_result_t
*d
, struct dirent
*de
,
8026 struct ceph_statx
*stx
, unsigned want
,
8027 unsigned flags
, Inode
**out
)
8035 // our callback fills the dirent and sets sr.full=true on first
8036 // call, and returns -1 the second time around.
8037 int r
= readdir_r_cb(d
, _readdir_single_dirent_cb
, (void *)&sr
, want
, flags
, out
);
8049 struct getdents_result
{
8056 static int _readdir_getdent_cb(void *p
, struct dirent
*de
,
8057 struct ceph_statx
*stx
, off_t off
, Inode
*in
)
8059 struct getdents_result
*c
= static_cast<getdents_result
*>(p
);
8065 dlen
= strlen(de
->d_name
) + 1;
8067 if (c
->pos
+ dlen
> c
->buflen
)
8068 return -1; // doesn't fit
8071 memcpy(c
->buf
+ c
->pos
, de
, sizeof(*de
));
8073 memcpy(c
->buf
+ c
->pos
, de
->d_name
, dlen
);
8079 int Client::_getdents(dir_result_t
*dir
, char *buf
, int buflen
, bool fullent
)
8084 gr
.fullent
= fullent
;
8087 int r
= readdir_r_cb(dir
, _readdir_getdent_cb
, (void *)&gr
);
8089 if (r
< 0) { // some error
8090 if (r
== -1) { // buffer ran out of space
8091 if (gr
.pos
) { // but we got some entries already!
8093 } // or we need a larger buffer
8095 } else { // actual error, return it
8104 struct getdir_result
{
8105 list
<string
> *contents
;
8109 static int _getdir_cb(void *p
, struct dirent
*de
, struct ceph_statx
*stx
, off_t off
, Inode
*in
)
8111 getdir_result
*r
= static_cast<getdir_result
*>(p
);
8113 r
->contents
->push_back(de
->d_name
);
8118 int Client::getdir(const char *relpath
, list
<string
>& contents
,
8119 const UserPerm
& perms
)
8121 ldout(cct
, 3) << "getdir(" << relpath
<< ")" << dendl
;
8123 Mutex::Locker
lock(client_lock
);
8124 tout(cct
) << "getdir" << std::endl
;
8125 tout(cct
) << relpath
<< std::endl
;
8129 int r
= opendir(relpath
, &d
, perms
);
8134 gr
.contents
= &contents
;
8136 r
= readdir_r_cb(d
, _getdir_cb
, (void *)&gr
);
8146 /****** file i/o **********/
8147 int Client::open(const char *relpath
, int flags
, const UserPerm
& perms
,
8148 mode_t mode
, int stripe_unit
, int stripe_count
,
8149 int object_size
, const char *data_pool
)
8151 ldout(cct
, 3) << "open enter(" << relpath
<< ", " << ceph_flags_sys2wire(flags
) << "," << mode
<< ")" << dendl
;
8152 Mutex::Locker
lock(client_lock
);
8153 tout(cct
) << "open" << std::endl
;
8154 tout(cct
) << relpath
<< std::endl
;
8155 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
8162 #if defined(__linux__) && defined(O_PATH)
8163 /* When the O_PATH is being specified, others flags than O_DIRECTORY
8164 * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
8165 * in kernel (fs/open.c). */
8167 flags
&= O_DIRECTORY
| O_NOFOLLOW
| O_PATH
;
8170 filepath
path(relpath
);
8172 bool created
= false;
8173 /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
8174 bool followsym
= !((flags
& O_NOFOLLOW
) || ((flags
& O_CREAT
) && (flags
& O_EXCL
)));
8175 int r
= path_walk(path
, &in
, perms
, followsym
, ceph_caps_for_mode(mode
));
8177 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
8180 #if defined(__linux__) && defined(O_PATH)
8181 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
) && !(flags
& O_PATH
))
8183 if (r
== 0 && in
->is_symlink() && (flags
& O_NOFOLLOW
))
8187 if (r
== -ENOENT
&& (flags
& O_CREAT
)) {
8188 filepath dirpath
= path
;
8189 string dname
= dirpath
.last_dentry();
8190 dirpath
.pop_dentry();
8192 r
= path_walk(dirpath
, &dir
, perms
, true,
8193 cct
->_conf
->client_permissions
? CEPH_CAP_AUTH_SHARED
: 0);
8196 if (cct
->_conf
->client_permissions
) {
8197 r
= may_create(dir
.get(), perms
);
8201 r
= _create(dir
.get(), dname
.c_str(), flags
, mode
, &in
, &fh
, stripe_unit
,
8202 stripe_count
, object_size
, data_pool
, &created
, perms
);
8208 // posix says we can only check permissions of existing files
8209 if (cct
->_conf
->client_permissions
) {
8210 r
= may_open(in
.get(), flags
, perms
);
8217 r
= _open(in
.get(), flags
, mode
, &fh
, perms
);
8219 // allocate a integer file descriptor
8222 assert(fd_map
.count(r
) == 0);
8227 tout(cct
) << r
<< std::endl
;
8228 ldout(cct
, 3) << "open exit(" << path
<< ", " << ceph_flags_sys2wire(flags
) << ") = " << r
<< dendl
;
8232 int Client::open(const char *relpath
, int flags
, const UserPerm
& perms
, mode_t mode
)
8234 /* Use default file striping parameters */
8235 return open(relpath
, flags
, perms
, mode
, 0, 0, 0, NULL
);
8238 int Client::lookup_hash(inodeno_t ino
, inodeno_t dirino
, const char *name
,
8239 const UserPerm
& perms
)
8241 Mutex::Locker
lock(client_lock
);
8242 ldout(cct
, 3) << "lookup_hash enter(" << ino
<< ", #" << dirino
<< "/" << name
<< ")" << dendl
;
8247 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPHASH
);
8249 req
->set_filepath(path
);
8251 uint32_t h
= ceph_str_hash(CEPH_STR_HASH_RJENKINS
, name
, strlen(name
));
8253 sprintf(f
, "%u", h
);
8254 filepath
path2(dirino
);
8255 path2
.push_dentry(string(f
));
8256 req
->set_filepath2(path2
);
8258 int r
= make_request(req
, perms
, NULL
, NULL
,
8259 rand() % mdsmap
->get_num_in_mds());
8260 ldout(cct
, 3) << "lookup_hash exit(" << ino
<< ", #" << dirino
<< "/" << name
<< ") = " << r
<< dendl
;
8266 * Load inode into local cache.
8268 * If inode pointer is non-NULL, and take a reference on
8269 * the resulting Inode object in one operation, so that caller
8270 * can safely assume inode will still be there after return.
8272 int Client::_lookup_ino(inodeno_t ino
, const UserPerm
& perms
, Inode
**inode
)
8274 ldout(cct
, 8) << "lookup_ino enter(" << ino
<< ")" << dendl
;
8279 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPINO
);
8281 req
->set_filepath(path
);
8283 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
8284 if (r
== 0 && inode
!= NULL
) {
8285 vinodeno_t
vino(ino
, CEPH_NOSNAP
);
8286 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
8287 assert(p
!= inode_map
.end());
8291 ldout(cct
, 8) << "lookup_ino exit(" << ino
<< ") = " << r
<< dendl
;
8295 int Client::lookup_ino(inodeno_t ino
, const UserPerm
& perms
, Inode
**inode
)
8297 Mutex::Locker
lock(client_lock
);
8298 return _lookup_ino(ino
, perms
, inode
);
8302 * Find the parent inode of `ino` and insert it into
8303 * our cache. Conditionally also set `parent` to a referenced
8304 * Inode* if caller provides non-NULL value.
8306 int Client::_lookup_parent(Inode
*ino
, const UserPerm
& perms
, Inode
**parent
)
8308 ldout(cct
, 8) << "lookup_parent enter(" << ino
->ino
<< ")" << dendl
;
8313 if (!ino
->dn_set
.empty()) {
8314 // if we exposed the parent here, we'd need to check permissions,
8315 // but right now we just rely on the MDS doing so in make_request
8316 ldout(cct
, 8) << "lookup_parent dentry already present" << dendl
;
8320 if (ino
->is_root()) {
8322 ldout(cct
, 8) << "ino is root, no parent" << dendl
;
8326 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT
);
8327 filepath
path(ino
->ino
);
8328 req
->set_filepath(path
);
8331 int r
= make_request(req
, perms
, &target
, NULL
, rand() % mdsmap
->get_num_in_mds());
8332 // Give caller a reference to the parent ino if they provided a pointer.
8333 if (parent
!= NULL
) {
8335 *parent
= target
.get();
8337 ldout(cct
, 8) << "lookup_parent found parent " << (*parent
)->ino
<< dendl
;
8342 ldout(cct
, 8) << "lookup_parent exit(" << ino
->ino
<< ") = " << r
<< dendl
;
8346 int Client::lookup_parent(Inode
*ino
, const UserPerm
& perms
, Inode
**parent
)
8348 Mutex::Locker
lock(client_lock
);
8349 return _lookup_parent(ino
, perms
, parent
);
8353 * Populate the parent dentry for `ino`, provided it is
8354 * a child of `parent`.
8356 int Client::_lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
8358 assert(parent
->is_dir());
8359 ldout(cct
, 3) << "lookup_name enter(" << ino
->ino
<< ")" << dendl
;
8364 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
8365 req
->set_filepath2(filepath(parent
->ino
));
8366 req
->set_filepath(filepath(ino
->ino
));
8367 req
->set_inode(ino
);
8369 int r
= make_request(req
, perms
, NULL
, NULL
, rand() % mdsmap
->get_num_in_mds());
8370 ldout(cct
, 3) << "lookup_name exit(" << ino
->ino
<< ") = " << r
<< dendl
;
8374 int Client::lookup_name(Inode
*ino
, Inode
*parent
, const UserPerm
& perms
)
8376 Mutex::Locker
lock(client_lock
);
8377 return _lookup_name(ino
, parent
, perms
);
8380 Fh
*Client::_create_fh(Inode
*in
, int flags
, int cmode
, const UserPerm
& perms
)
8388 f
->actor_perms
= perms
;
8390 ldout(cct
, 10) << "_create_fh " << in
->ino
<< " mode " << cmode
<< dendl
;
8392 if (in
->snapid
!= CEPH_NOSNAP
) {
8393 in
->snap_cap_refs
++;
8394 ldout(cct
, 5) << "open success, fh is " << f
<< " combined IMMUTABLE SNAP caps "
8395 << ccap_string(in
->caps_issued()) << dendl
;
8398 const md_config_t
*conf
= cct
->_conf
;
8399 f
->readahead
.set_trigger_requests(1);
8400 f
->readahead
.set_min_readahead_size(conf
->client_readahead_min
);
8401 uint64_t max_readahead
= Readahead::NO_LIMIT
;
8402 if (conf
->client_readahead_max_bytes
) {
8403 max_readahead
= MIN(max_readahead
, (uint64_t)conf
->client_readahead_max_bytes
);
8405 if (conf
->client_readahead_max_periods
) {
8406 max_readahead
= MIN(max_readahead
, in
->layout
.get_period()*(uint64_t)conf
->client_readahead_max_periods
);
8408 f
->readahead
.set_max_readahead_size(max_readahead
);
8409 vector
<uint64_t> alignments
;
8410 alignments
.push_back(in
->layout
.get_period());
8411 alignments
.push_back(in
->layout
.stripe_unit
);
8412 f
->readahead
.set_alignments(alignments
);
8417 int Client::_release_fh(Fh
*f
)
8419 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
8420 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
8421 Inode
*in
= f
->inode
.get();
8422 ldout(cct
, 8) << "_release_fh " << f
<< " mode " << f
->mode
<< " on " << *in
<< dendl
;
8426 if (in
->snapid
== CEPH_NOSNAP
) {
8427 if (in
->put_open_ref(f
->mode
)) {
8428 _flush(in
, new C_Client_FlushComplete(this, in
));
8432 assert(in
->snap_cap_refs
> 0);
8433 in
->snap_cap_refs
--;
8436 _release_filelocks(f
);
8438 // Finally, read any async err (i.e. from flushes)
8439 int err
= f
->take_async_err();
8441 ldout(cct
, 1) << "_release_fh " << f
<< " on inode " << *in
<< " caught async_err = "
8442 << cpp_strerror(err
) << dendl
;
8444 ldout(cct
, 10) << "_release_fh " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
8452 void Client::_put_fh(Fh
*f
)
8454 int left
= f
->put();
8460 int Client::_open(Inode
*in
, int flags
, mode_t mode
, Fh
**fhp
,
8461 const UserPerm
& perms
)
8463 if (in
->snapid
!= CEPH_NOSNAP
&&
8464 (flags
& (O_WRONLY
| O_RDWR
| O_CREAT
| O_TRUNC
| O_APPEND
))) {
8468 // use normalized flags to generate cmode
8469 int cmode
= ceph_flags_to_mode(ceph_flags_sys2wire(flags
));
8472 int want
= ceph_caps_for_mode(cmode
);
8475 in
->get_open_ref(cmode
); // make note of pending open, since it effects _wanted_ caps.
8477 if ((flags
& O_TRUNC
) == 0 && in
->caps_issued_mask(want
)) {
8479 check_caps(in
, CHECK_CAPS_NODELAY
);
8482 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
8484 in
->make_nosnap_relative_path(path
);
8485 req
->set_filepath(path
);
8486 req
->head
.args
.open
.flags
= ceph_flags_sys2wire(flags
& ~O_CREAT
);
8487 req
->head
.args
.open
.mode
= mode
;
8488 req
->head
.args
.open
.pool
= -1;
8489 if (cct
->_conf
->client_debug_getattr_caps
)
8490 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
8492 req
->head
.args
.open
.mask
= 0;
8493 req
->head
.args
.open
.old_size
= in
->size
; // for O_TRUNC
8495 result
= make_request(req
, perms
);
8498 * NFS expects that delegations will be broken on a conflicting open,
8499 * not just when there is actual conflicting access to the file. SMB leases
8500 * and oplocks also have similar semantics.
8502 * Ensure that clients that have delegations enabled will wait on minimal
8503 * caps during open, just to ensure that other clients holding delegations
8504 * return theirs first.
8506 if (deleg_timeout
&& result
== 0) {
8509 if (cmode
& CEPH_FILE_MODE_WR
)
8510 need
|= CEPH_CAP_FILE_WR
;
8511 if (cmode
& CEPH_FILE_MODE_RD
)
8512 need
|= CEPH_CAP_FILE_RD
;
8514 result
= get_caps(in
, need
, want
, &have
, -1);
8516 ldout(cct
, 8) << "Unable to get caps after open of inode " << *in
<<
8517 " . Denying open: " <<
8518 cpp_strerror(result
) << dendl
;
8519 in
->put_open_ref(cmode
);
8521 put_cap_ref(in
, need
);
8529 *fhp
= _create_fh(in
, flags
, cmode
, perms
);
8531 in
->put_open_ref(cmode
);
8539 int Client::_renew_caps(Inode
*in
)
8541 int wanted
= in
->caps_file_wanted();
8542 if (in
->is_any_caps() &&
8543 ((wanted
& CEPH_CAP_ANY_WR
) == 0 || in
->auth_cap
)) {
8544 check_caps(in
, CHECK_CAPS_NODELAY
);
8549 if ((wanted
& CEPH_CAP_FILE_RD
) && (wanted
& CEPH_CAP_FILE_WR
))
8551 else if (wanted
& CEPH_CAP_FILE_RD
)
8553 else if (wanted
& CEPH_CAP_FILE_WR
)
8556 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_OPEN
);
8558 in
->make_nosnap_relative_path(path
);
8559 req
->set_filepath(path
);
8560 req
->head
.args
.open
.flags
= flags
;
8561 req
->head
.args
.open
.pool
= -1;
8562 if (cct
->_conf
->client_debug_getattr_caps
)
8563 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
8565 req
->head
.args
.open
.mask
= 0;
8568 // duplicate in case Cap goes away; not sure if that race is a concern?
8569 const UserPerm
*pperm
= in
->get_best_perms();
8573 int ret
= make_request(req
, perms
);
8577 int Client::close(int fd
)
8579 ldout(cct
, 3) << "close enter(" << fd
<< ")" << dendl
;
8580 Mutex::Locker
lock(client_lock
);
8581 tout(cct
) << "close" << std::endl
;
8582 tout(cct
) << fd
<< std::endl
;
8587 Fh
*fh
= get_filehandle(fd
);
8590 int err
= _release_fh(fh
);
8593 ldout(cct
, 3) << "close exit(" << fd
<< ")" << dendl
;
8601 loff_t
Client::lseek(int fd
, loff_t offset
, int whence
)
8603 Mutex::Locker
lock(client_lock
);
8604 tout(cct
) << "lseek" << std::endl
;
8605 tout(cct
) << fd
<< std::endl
;
8606 tout(cct
) << offset
<< std::endl
;
8607 tout(cct
) << whence
<< std::endl
;
8612 Fh
*f
= get_filehandle(fd
);
8615 #if defined(__linux__) && defined(O_PATH)
8616 if (f
->flags
& O_PATH
)
8619 return _lseek(f
, offset
, whence
);
8622 loff_t
Client::_lseek(Fh
*f
, loff_t offset
, int whence
)
8624 Inode
*in
= f
->inode
.get();
8637 r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
8640 f
->pos
= in
->size
+ offset
;
8647 ldout(cct
, 8) << "_lseek(" << f
<< ", " << offset
<< ", " << whence
<< ") = " << f
->pos
<< dendl
;
8652 void Client::lock_fh_pos(Fh
*f
)
8654 ldout(cct
, 10) << "lock_fh_pos " << f
<< dendl
;
8656 if (f
->pos_locked
|| !f
->pos_waiters
.empty()) {
8658 f
->pos_waiters
.push_back(&cond
);
8659 ldout(cct
, 10) << "lock_fh_pos BLOCKING on " << f
<< dendl
;
8660 while (f
->pos_locked
|| f
->pos_waiters
.front() != &cond
)
8661 cond
.Wait(client_lock
);
8662 ldout(cct
, 10) << "lock_fh_pos UNBLOCKING on " << f
<< dendl
;
8663 assert(f
->pos_waiters
.front() == &cond
);
8664 f
->pos_waiters
.pop_front();
8667 f
->pos_locked
= true;
8670 void Client::unlock_fh_pos(Fh
*f
)
8672 ldout(cct
, 10) << "unlock_fh_pos " << f
<< dendl
;
8673 f
->pos_locked
= false;
8676 int Client::uninline_data(Inode
*in
, Context
*onfinish
)
8678 if (!in
->inline_data
.length()) {
8679 onfinish
->complete(0);
8684 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (long long unsigned)in
->ino
);
8685 object_t oid
= oid_buf
;
8687 ObjectOperation create_ops
;
8688 create_ops
.create(false);
8690 objecter
->mutate(oid
,
8691 OSDMap::file_to_object_locator(in
->layout
),
8693 in
->snaprealm
->get_snap_context(),
8694 ceph::real_clock::now(),
8698 bufferlist inline_version_bl
;
8699 ::encode(in
->inline_version
, inline_version_bl
);
8701 ObjectOperation uninline_ops
;
8702 uninline_ops
.cmpxattr("inline_version",
8703 CEPH_OSD_CMPXATTR_OP_GT
,
8704 CEPH_OSD_CMPXATTR_MODE_U64
,
8706 bufferlist inline_data
= in
->inline_data
;
8707 uninline_ops
.write(0, inline_data
, in
->truncate_size
, in
->truncate_seq
);
8708 uninline_ops
.setxattr("inline_version", stringify(in
->inline_version
));
8710 objecter
->mutate(oid
,
8711 OSDMap::file_to_object_locator(in
->layout
),
8713 in
->snaprealm
->get_snap_context(),
8714 ceph::real_clock::now(),
8723 // blocking osd interface
8725 int Client::read(int fd
, char *buf
, loff_t size
, loff_t offset
)
8727 Mutex::Locker
lock(client_lock
);
8728 tout(cct
) << "read" << std::endl
;
8729 tout(cct
) << fd
<< std::endl
;
8730 tout(cct
) << size
<< std::endl
;
8731 tout(cct
) << offset
<< std::endl
;
8736 Fh
*f
= get_filehandle(fd
);
8739 #if defined(__linux__) && defined(O_PATH)
8740 if (f
->flags
& O_PATH
)
8744 int r
= _read(f
, offset
, size
, &bl
);
8745 ldout(cct
, 3) << "read(" << fd
<< ", " << (void*)buf
<< ", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
8747 bl
.copy(0, bl
.length(), buf
);
8753 int Client::preadv(int fd
, const struct iovec
*iov
, int iovcnt
, loff_t offset
)
8757 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, false);
8760 int Client::_read(Fh
*f
, int64_t offset
, uint64_t size
, bufferlist
*bl
)
8762 const md_config_t
*conf
= cct
->_conf
;
8763 Inode
*in
= f
->inode
.get();
8765 if ((f
->mode
& CEPH_FILE_MODE_RD
) == 0)
8767 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
8769 bool movepos
= false;
8775 loff_t start_pos
= offset
;
8777 if (in
->inline_version
== 0) {
8778 int r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
8784 assert(in
->inline_version
> 0);
8789 int r
= get_caps(in
, CEPH_CAP_FILE_RD
, CEPH_CAP_FILE_CACHE
, &have
, -1);
8795 if (f
->flags
& O_DIRECT
)
8796 have
&= ~CEPH_CAP_FILE_CACHE
;
8798 Mutex
uninline_flock("Client::_read_uninline_data flock");
8800 bool uninline_done
= false;
8801 int uninline_ret
= 0;
8802 Context
*onuninline
= NULL
;
8804 if (in
->inline_version
< CEPH_INLINE_NONE
) {
8805 if (!(have
& CEPH_CAP_FILE_CACHE
)) {
8806 onuninline
= new C_SafeCond(&uninline_flock
,
8810 uninline_data(in
, onuninline
);
8812 uint32_t len
= in
->inline_data
.length();
8814 uint64_t endoff
= offset
+ size
;
8815 if (endoff
> in
->size
)
8819 if (endoff
<= len
) {
8820 bl
->substr_of(in
->inline_data
, offset
, endoff
- offset
);
8822 bl
->substr_of(in
->inline_data
, offset
, len
- offset
);
8823 bl
->append_zero(endoff
- len
);
8825 } else if ((uint64_t)offset
< endoff
) {
8826 bl
->append_zero(endoff
- offset
);
8833 if (!conf
->client_debug_force_sync_read
&&
8834 (conf
->client_oc
&& (have
& CEPH_CAP_FILE_CACHE
))) {
8836 if (f
->flags
& O_RSYNC
) {
8837 _flush_range(in
, offset
, size
);
8839 r
= _read_async(f
, offset
, size
, bl
);
8843 if (f
->flags
& O_DIRECT
)
8844 _flush_range(in
, offset
, size
);
8846 bool checkeof
= false;
8847 r
= _read_sync(f
, offset
, size
, bl
, &checkeof
);
8854 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
8857 r
= _getattr(in
, CEPH_STAT_CAP_SIZE
, f
->actor_perms
);
8862 if ((uint64_t)offset
< in
->size
)
8870 f
->pos
= start_pos
+ bl
->length();
8878 client_lock
.Unlock();
8879 uninline_flock
.Lock();
8880 while (!uninline_done
)
8881 uninline_cond
.Wait(uninline_flock
);
8882 uninline_flock
.Unlock();
8885 if (uninline_ret
>= 0 || uninline_ret
== -ECANCELED
) {
8886 in
->inline_data
.clear();
8887 in
->inline_version
= CEPH_INLINE_NONE
;
8888 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
8895 put_cap_ref(in
, CEPH_CAP_FILE_RD
);
8901 return bl
->length();
8904 Client::C_Readahead::C_Readahead(Client
*c
, Fh
*f
) :
8907 f
->readahead
.inc_pending();
8910 Client::C_Readahead::~C_Readahead() {
8911 f
->readahead
.dec_pending();
8915 void Client::C_Readahead::finish(int r
) {
8916 lgeneric_subdout(client
->cct
, client
, 20) << "client." << client
->get_nodeid() << " " << "C_Readahead on " << f
->inode
<< dendl
;
8917 client
->put_cap_ref(f
->inode
.get(), CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
8920 int Client::_read_async(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
)
8922 const md_config_t
*conf
= cct
->_conf
;
8923 Inode
*in
= f
->inode
.get();
8925 ldout(cct
, 10) << "_read_async " << *in
<< " " << off
<< "~" << len
<< dendl
;
8927 // trim read based on file size?
8928 if (off
>= in
->size
)
8932 if (off
+ len
> in
->size
) {
8933 len
= in
->size
- off
;
8936 ldout(cct
, 10) << " min_bytes=" << f
->readahead
.get_min_readahead_size()
8937 << " max_bytes=" << f
->readahead
.get_max_readahead_size()
8938 << " max_periods=" << conf
->client_readahead_max_periods
<< dendl
;
8940 // read (and possibly block)
8942 Mutex
flock("Client::_read_async flock");
8945 Context
*onfinish
= new C_SafeCond(&flock
, &cond
, &done
, &rvalue
);
8946 r
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
8947 off
, len
, bl
, 0, onfinish
);
8949 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
8950 client_lock
.Unlock();
8956 put_cap_ref(in
, CEPH_CAP_FILE_CACHE
);
8963 if(f
->readahead
.get_min_readahead_size() > 0) {
8964 pair
<uint64_t, uint64_t> readahead_extent
= f
->readahead
.update(off
, len
, in
->size
);
8965 if (readahead_extent
.second
> 0) {
8966 ldout(cct
, 20) << "readahead " << readahead_extent
.first
<< "~" << readahead_extent
.second
8967 << " (caller wants " << off
<< "~" << len
<< ")" << dendl
;
8968 Context
*onfinish2
= new C_Readahead(this, f
);
8969 int r2
= objectcacher
->file_read(&in
->oset
, &in
->layout
, in
->snapid
,
8970 readahead_extent
.first
, readahead_extent
.second
,
8971 NULL
, 0, onfinish2
);
8973 ldout(cct
, 20) << "readahead initiated, c " << onfinish2
<< dendl
;
8974 get_cap_ref(in
, CEPH_CAP_FILE_RD
| CEPH_CAP_FILE_CACHE
);
8976 ldout(cct
, 20) << "readahead was no-op, already cached" << dendl
;
8985 int Client::_read_sync(Fh
*f
, uint64_t off
, uint64_t len
, bufferlist
*bl
,
8988 Inode
*in
= f
->inode
.get();
8993 ldout(cct
, 10) << "_read_sync " << *in
<< " " << off
<< "~" << len
<< dendl
;
8995 Mutex
flock("Client::_read_sync flock");
9000 Context
*onfinish
= new C_SafeCond(&flock
, &cond
, &done
, &r
);
9004 filer
->read_trunc(in
->ino
, &in
->layout
, in
->snapid
,
9006 in
->truncate_size
, in
->truncate_seq
,
9008 client_lock
.Unlock();
9015 // if we get ENOENT from OSD, assume 0 bytes returned
9026 bl
->claim_append(tbl
);
9029 if (r
>= 0 && r
< wanted
) {
9030 if (pos
< in
->size
) {
9031 // zero up to known EOF
9032 int64_t some
= in
->size
- pos
;
9054 * we keep count of uncommitted sync writes on the inode, so that
9057 void Client::_sync_write_commit(Inode
*in
)
9059 assert(unsafe_sync_write
> 0);
9060 unsafe_sync_write
--;
9062 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9064 ldout(cct
, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write
<< dendl
;
9065 if (unsafe_sync_write
== 0 && unmounting
) {
9066 ldout(cct
, 10) << "sync_write_commit -- no more unsafe writes, unmount can proceed" << dendl
;
9067 mount_cond
.Signal();
9071 int Client::write(int fd
, const char *buf
, loff_t size
, loff_t offset
)
9073 Mutex::Locker
lock(client_lock
);
9074 tout(cct
) << "write" << std::endl
;
9075 tout(cct
) << fd
<< std::endl
;
9076 tout(cct
) << size
<< std::endl
;
9077 tout(cct
) << offset
<< std::endl
;
9082 Fh
*fh
= get_filehandle(fd
);
9085 #if defined(__linux__) && defined(O_PATH)
9086 if (fh
->flags
& O_PATH
)
9089 int r
= _write(fh
, offset
, size
, buf
, NULL
, 0);
9090 ldout(cct
, 3) << "write(" << fd
<< ", \"...\", " << size
<< ", " << offset
<< ") = " << r
<< dendl
;
9094 int Client::pwritev(int fd
, const struct iovec
*iov
, int iovcnt
, int64_t offset
)
9098 return _preadv_pwritev(fd
, iov
, iovcnt
, offset
, true);
9101 int Client::_preadv_pwritev(int fd
, const struct iovec
*iov
, unsigned iovcnt
, int64_t offset
, bool write
)
9103 Mutex::Locker
lock(client_lock
);
9104 tout(cct
) << fd
<< std::endl
;
9105 tout(cct
) << offset
<< std::endl
;
9110 Fh
*fh
= get_filehandle(fd
);
9113 #if defined(__linux__) && defined(O_PATH)
9114 if (fh
->flags
& O_PATH
)
9117 loff_t totallen
= 0;
9118 for (unsigned i
= 0; i
< iovcnt
; i
++) {
9119 totallen
+= iov
[i
].iov_len
;
9122 int w
= _write(fh
, offset
, totallen
, NULL
, iov
, iovcnt
);
9123 ldout(cct
, 3) << "pwritev(" << fd
<< ", \"...\", " << totallen
<< ", " << offset
<< ") = " << w
<< dendl
;
9127 int r
= _read(fh
, offset
, totallen
, &bl
);
9128 ldout(cct
, 3) << "preadv(" << fd
<< ", " << offset
<< ") = " << r
<< dendl
;
9133 for (unsigned j
= 0, resid
= r
; j
< iovcnt
&& resid
> 0; j
++) {
9135 * This piece of code aims to handle the case that bufferlist does not have enough data
9136 * to fill in the iov
9138 if (resid
< iov
[j
].iov_len
) {
9139 bl
.copy(bufoff
, resid
, (char *)iov
[j
].iov_base
);
9142 bl
.copy(bufoff
, iov
[j
].iov_len
, (char *)iov
[j
].iov_base
);
9144 resid
-= iov
[j
].iov_len
;
9145 bufoff
+= iov
[j
].iov_len
;
9151 int Client::_write(Fh
*f
, int64_t offset
, uint64_t size
, const char *buf
,
9152 const struct iovec
*iov
, int iovcnt
)
9154 if ((uint64_t)(offset
+size
) > mdsmap
->get_max_filesize()) //too large!
9157 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
9158 Inode
*in
= f
->inode
.get();
9160 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
)) {
9164 assert(in
->snapid
== CEPH_NOSNAP
);
9166 // was Fh opened as writeable?
9167 if ((f
->mode
& CEPH_FILE_MODE_WR
) == 0)
9171 uint64_t endoff
= offset
+ size
;
9172 std::list
<InodeRef
> quota_roots
;
9173 if (endoff
> in
->size
&&
9174 is_quota_bytes_exceeded(in
, endoff
- in
->size
, f
->actor_perms
, "a_roots
)) {
9178 // use/adjust fd pos?
9182 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
9183 * change out from under us.
9185 if (f
->flags
& O_APPEND
) {
9186 int r
= _lseek(f
, 0, SEEK_END
);
9193 f
->pos
= offset
+size
;
9197 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9199 ldout(cct
, 10) << "cur file size is " << in
->size
<< dendl
;
9202 utime_t start
= ceph_clock_now();
9204 if (in
->inline_version
== 0) {
9205 int r
= _getattr(in
, CEPH_STAT_CAP_INLINE_DATA
, f
->actor_perms
, true);
9208 assert(in
->inline_version
> 0);
9211 // copy into fresh buffer (since our write may be resub, async)
9215 bl
.append(buf
, size
);
9217 for (int i
= 0; i
< iovcnt
; i
++) {
9218 if (iov
[i
].iov_len
> 0) {
9219 bl
.append((const char *)iov
[i
].iov_base
, iov
[i
].iov_len
);
9225 uint64_t totalwritten
;
9227 int r
= get_caps(in
, CEPH_CAP_FILE_WR
|CEPH_CAP_AUTH_SHARED
,
9228 CEPH_CAP_FILE_BUFFER
, &have
, endoff
);
9232 /* clear the setuid/setgid bits, if any */
9233 if (unlikely(in
->mode
& (S_ISUID
|S_ISGID
)) && size
> 0) {
9234 struct ceph_statx stx
= { 0 };
9236 put_cap_ref(in
, CEPH_CAP_AUTH_SHARED
);
9237 r
= __setattrx(in
, &stx
, CEPH_SETATTR_KILL_SGUID
, f
->actor_perms
);
9241 put_cap_ref(in
, CEPH_CAP_AUTH_SHARED
);
9244 if (f
->flags
& O_DIRECT
)
9245 have
&= ~CEPH_CAP_FILE_BUFFER
;
9247 ldout(cct
, 10) << " snaprealm " << *in
->snaprealm
<< dendl
;
9249 Mutex
uninline_flock("Client::_write_uninline_data flock");
9251 bool uninline_done
= false;
9252 int uninline_ret
= 0;
9253 Context
*onuninline
= NULL
;
9255 if (in
->inline_version
< CEPH_INLINE_NONE
) {
9256 if (endoff
> cct
->_conf
->client_max_inline_size
||
9257 endoff
> CEPH_INLINE_MAX_SIZE
||
9258 !(have
& CEPH_CAP_FILE_BUFFER
)) {
9259 onuninline
= new C_SafeCond(&uninline_flock
,
9263 uninline_data(in
, onuninline
);
9265 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9267 uint32_t len
= in
->inline_data
.length();
9270 in
->inline_data
.copy(endoff
, len
- endoff
, bl
);
9273 in
->inline_data
.splice(offset
, len
- offset
);
9274 else if (offset
> len
)
9275 in
->inline_data
.append_zero(offset
- len
);
9277 in
->inline_data
.append(bl
);
9278 in
->inline_version
++;
9280 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9286 if (cct
->_conf
->client_oc
&& (have
& CEPH_CAP_FILE_BUFFER
)) {
9287 // do buffered write
9288 if (!in
->oset
.dirty_or_tx
)
9289 get_cap_ref(in
, CEPH_CAP_FILE_CACHE
| CEPH_CAP_FILE_BUFFER
);
9291 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9293 // async, caching, non-blocking.
9294 r
= objectcacher
->file_write(&in
->oset
, &in
->layout
,
9295 in
->snaprealm
->get_snap_context(),
9296 offset
, size
, bl
, ceph::real_clock::now(),
9298 put_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
9303 // flush cached write if O_SYNC is set on file fh
9304 // O_DSYNC == O_SYNC on linux < 2.6.33
9305 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
9306 if ((f
->flags
& O_SYNC
) || (f
->flags
& O_DSYNC
)) {
9307 _flush_range(in
, offset
, size
);
9310 if (f
->flags
& O_DIRECT
)
9311 _flush_range(in
, offset
, size
);
9313 // simple, non-atomic sync write
9314 Mutex
flock("Client::_write flock");
9317 Context
*onfinish
= new C_SafeCond(&flock
, &cond
, &done
);
9319 unsafe_sync_write
++;
9320 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
); // released by onsafe callback
9322 filer
->write_trunc(in
->ino
, &in
->layout
, in
->snaprealm
->get_snap_context(),
9323 offset
, size
, bl
, ceph::real_clock::now(), 0,
9324 in
->truncate_size
, in
->truncate_seq
,
9326 client_lock
.Unlock();
9333 _sync_write_commit(in
);
9336 // if we get here, write was successful, update client metadata
9339 lat
= ceph_clock_now();
9341 logger
->tinc(l_c_wrlat
, lat
);
9343 totalwritten
= size
;
9344 r
= (int)totalwritten
;
9347 if (totalwritten
+ offset
> in
->size
) {
9348 in
->size
= totalwritten
+ offset
;
9349 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9351 if (is_quota_bytes_approaching(in
, quota_roots
)) {
9352 check_caps(in
, CHECK_CAPS_NODELAY
);
9353 } else if (is_max_size_approaching(in
)) {
9357 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", extending file size" << dendl
;
9359 ldout(cct
, 7) << "wrote to " << totalwritten
+offset
<< ", leaving file size at " << in
->size
<< dendl
;
9363 in
->mtime
= ceph_clock_now();
9365 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9370 client_lock
.Unlock();
9371 uninline_flock
.Lock();
9372 while (!uninline_done
)
9373 uninline_cond
.Wait(uninline_flock
);
9374 uninline_flock
.Unlock();
9377 if (uninline_ret
>= 0 || uninline_ret
== -ECANCELED
) {
9378 in
->inline_data
.clear();
9379 in
->inline_version
= CEPH_INLINE_NONE
;
9380 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
9386 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
9390 int Client::_flush(Fh
*f
)
9392 Inode
*in
= f
->inode
.get();
9393 int err
= f
->take_async_err();
9395 ldout(cct
, 1) << __func__
<< ": " << f
<< " on inode " << *in
<< " caught async_err = "
9396 << cpp_strerror(err
) << dendl
;
9398 ldout(cct
, 10) << __func__
<< ": " << f
<< " on inode " << *in
<< " no async_err state" << dendl
;
9404 int Client::truncate(const char *relpath
, loff_t length
, const UserPerm
& perms
)
9406 struct ceph_statx stx
;
9407 stx
.stx_size
= length
;
9408 return setattrx(relpath
, &stx
, CEPH_SETATTR_SIZE
, perms
);
9411 int Client::ftruncate(int fd
, loff_t length
, const UserPerm
& perms
)
9413 Mutex::Locker
lock(client_lock
);
9414 tout(cct
) << "ftruncate" << std::endl
;
9415 tout(cct
) << fd
<< std::endl
;
9416 tout(cct
) << length
<< std::endl
;
9421 Fh
*f
= get_filehandle(fd
);
9424 #if defined(__linux__) && defined(O_PATH)
9425 if (f
->flags
& O_PATH
)
9429 attr
.st_size
= length
;
9430 return _setattr(f
->inode
, &attr
, CEPH_SETATTR_SIZE
, perms
);
9433 int Client::fsync(int fd
, bool syncdataonly
)
9435 Mutex::Locker
lock(client_lock
);
9436 tout(cct
) << "fsync" << std::endl
;
9437 tout(cct
) << fd
<< std::endl
;
9438 tout(cct
) << syncdataonly
<< std::endl
;
9443 Fh
*f
= get_filehandle(fd
);
9446 #if defined(__linux__) && defined(O_PATH)
9447 if (f
->flags
& O_PATH
)
9450 int r
= _fsync(f
, syncdataonly
);
9452 // The IOs in this fsync were okay, but maybe something happened
9453 // in the background that we shoudl be reporting?
9454 r
= f
->take_async_err();
9455 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
9456 << ") = 0, async_err = " << r
<< dendl
;
9458 // Assume that an error we encountered during fsync, even reported
9459 // synchronously, would also have applied the error to the Fh, and we
9460 // should clear it here to avoid returning the same error again on next
9462 ldout(cct
, 5) << "fsync(" << fd
<< ", " << syncdataonly
<< ") = "
9464 f
->take_async_err();
9469 int Client::_fsync(Inode
*in
, bool syncdataonly
)
9472 Mutex
lock("Client::_fsync::lock");
9475 C_SafeCond
*object_cacher_completion
= NULL
;
9476 ceph_tid_t flush_tid
= 0;
9479 ldout(cct
, 8) << "_fsync on " << *in
<< " " << (syncdataonly
? "(dataonly)":"(data+metadata)") << dendl
;
9481 if (cct
->_conf
->client_oc
) {
9482 object_cacher_completion
= new C_SafeCond(&lock
, &cond
, &done
, &r
);
9483 tmp_ref
= in
; // take a reference; C_SafeCond doesn't and _flush won't either
9484 _flush(in
, object_cacher_completion
);
9485 ldout(cct
, 15) << "using return-valued form of _fsync" << dendl
;
9488 if (!syncdataonly
&& in
->dirty_caps
) {
9489 check_caps(in
, CHECK_CAPS_NODELAY
|CHECK_CAPS_SYNCHRONOUS
);
9490 if (in
->flushing_caps
)
9491 flush_tid
= last_flush_tid
;
9492 } else ldout(cct
, 10) << "no metadata needs to commit" << dendl
;
9494 if (!syncdataonly
&& !in
->unsafe_ops
.empty()) {
9497 MetaRequest
*req
= in
->unsafe_ops
.back();
9498 ldout(cct
, 15) << "waiting on unsafe requests, last tid " << req
->get_tid() << dendl
;
9501 wait_on_list(req
->waitfor_safe
);
9505 if (object_cacher_completion
) { // wait on a real reply instead of guessing
9506 client_lock
.Unlock();
9508 ldout(cct
, 15) << "waiting on data to flush" << dendl
;
9513 ldout(cct
, 15) << "got " << r
<< " from flush writeback" << dendl
;
9515 // FIXME: this can starve
9516 while (in
->cap_refs
[CEPH_CAP_FILE_BUFFER
] > 0) {
9517 ldout(cct
, 10) << "ino " << in
->ino
<< " has " << in
->cap_refs
[CEPH_CAP_FILE_BUFFER
]
9518 << " uncommitted, waiting" << dendl
;
9519 wait_on_list(in
->waitfor_commit
);
9525 wait_sync_caps(in
, flush_tid
);
9527 ldout(cct
, 10) << "ino " << in
->ino
<< " has no uncommitted writes" << dendl
;
9529 ldout(cct
, 8) << "ino " << in
->ino
<< " failed to commit to disk! "
9530 << cpp_strerror(-r
) << dendl
;
9536 int Client::_fsync(Fh
*f
, bool syncdataonly
)
9538 ldout(cct
, 8) << "_fsync(" << f
<< ", " << (syncdataonly
? "dataonly)":"data+metadata)") << dendl
;
9539 return _fsync(f
->inode
.get(), syncdataonly
);
9542 int Client::fstat(int fd
, struct stat
*stbuf
, const UserPerm
& perms
, int mask
)
9544 Mutex::Locker
lock(client_lock
);
9545 tout(cct
) << "fstat mask " << hex
<< mask
<< dec
<< std::endl
;
9546 tout(cct
) << fd
<< std::endl
;
9551 Fh
*f
= get_filehandle(fd
);
9554 int r
= _getattr(f
->inode
, mask
, perms
);
9557 fill_stat(f
->inode
, stbuf
, NULL
);
9558 ldout(cct
, 5) << "fstat(" << fd
<< ", " << stbuf
<< ") = " << r
<< dendl
;
9562 int Client::fstatx(int fd
, struct ceph_statx
*stx
, const UserPerm
& perms
,
9563 unsigned int want
, unsigned int flags
)
9565 Mutex::Locker
lock(client_lock
);
9566 tout(cct
) << "fstatx flags " << hex
<< flags
<< " want " << want
<< dec
<< std::endl
;
9567 tout(cct
) << fd
<< std::endl
;
9572 Fh
*f
= get_filehandle(fd
);
9576 unsigned mask
= statx_to_mask(flags
, want
);
9579 if (mask
&& !f
->inode
->caps_issued_mask(mask
, true)) {
9580 r
= _getattr(f
->inode
, mask
, perms
);
9582 ldout(cct
, 3) << "fstatx exit on error!" << dendl
;
9587 fill_statx(f
->inode
, mask
, stx
);
9588 ldout(cct
, 3) << "fstatx(" << fd
<< ", " << stx
<< ") = " << r
<< dendl
;
9592 // not written yet, but i want to link!
9594 int Client::chdir(const char *relpath
, std::string
&new_cwd
,
9595 const UserPerm
& perms
)
9597 Mutex::Locker
lock(client_lock
);
9598 tout(cct
) << "chdir" << std::endl
;
9599 tout(cct
) << relpath
<< std::endl
;
9604 filepath
path(relpath
);
9606 int r
= path_walk(path
, &in
, perms
);
9611 ldout(cct
, 3) << "chdir(" << relpath
<< ") cwd now " << cwd
->ino
<< dendl
;
9613 _getcwd(new_cwd
, perms
);
9617 void Client::_getcwd(string
& dir
, const UserPerm
& perms
)
9620 ldout(cct
, 10) << "getcwd " << *cwd
<< dendl
;
9622 Inode
*in
= cwd
.get();
9623 while (in
!= root
) {
9624 assert(in
->dn_set
.size() < 2); // dirs can't be hard-linked
9626 // A cwd or ancester is unlinked
9627 if (in
->dn_set
.empty()) {
9631 Dentry
*dn
= in
->get_first_parent();
9636 ldout(cct
, 10) << "getcwd looking up parent for " << *in
<< dendl
;
9637 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
9638 filepath
path(in
->ino
);
9639 req
->set_filepath(path
);
9641 int res
= make_request(req
, perms
);
9650 path
.push_front_dentry(dn
->name
);
9651 in
= dn
->dir
->parent_inode
;
9654 dir
+= path
.get_path();
9657 void Client::getcwd(string
& dir
, const UserPerm
& perms
)
9659 Mutex::Locker
l(client_lock
);
9661 _getcwd(dir
, perms
);
9664 int Client::statfs(const char *path
, struct statvfs
*stbuf
,
9665 const UserPerm
& perms
)
9667 Mutex::Locker
l(client_lock
);
9668 tout(cct
) << "statfs" << std::endl
;
9676 const vector
<int64_t> &data_pools
= mdsmap
->get_data_pools();
9677 if (data_pools
.size() == 1) {
9678 objecter
->get_fs_stats(stats
, data_pools
[0], &cond
);
9680 objecter
->get_fs_stats(stats
, boost::optional
<int64_t>(), &cond
);
9683 client_lock
.Unlock();
9684 int rval
= cond
.wait();
9688 ldout(cct
, 1) << "underlying call to statfs returned error: "
9689 << cpp_strerror(rval
)
9694 memset(stbuf
, 0, sizeof(*stbuf
));
9697 * we're going to set a block size of 4MB so we can represent larger
9698 * FSes without overflowing. Additionally convert the space
9699 * measurements from KB to bytes while making them in terms of
9700 * blocks. We use 4MB only because it is big enough, and because it
9701 * actually *is* the (ceph) default block size.
9703 const int CEPH_BLOCK_SHIFT
= 22;
9704 stbuf
->f_frsize
= 1 << CEPH_BLOCK_SHIFT
;
9705 stbuf
->f_bsize
= 1 << CEPH_BLOCK_SHIFT
;
9706 stbuf
->f_files
= stats
.num_objects
;
9707 stbuf
->f_ffree
= -1;
9708 stbuf
->f_favail
= -1;
9709 stbuf
->f_fsid
= -1; // ??
9710 stbuf
->f_flag
= 0; // ??
9711 stbuf
->f_namemax
= NAME_MAX
;
9713 // Usually quota_root will == root_ancestor, but if the mount root has no
9714 // quota but we can see a parent of it that does have a quota, we'll
9715 // respect that one instead.
9716 assert(root
!= nullptr);
9717 Inode
*quota_root
= root
->quota
.is_enable() ? root
: get_quota_root(root
, perms
);
9719 // get_quota_root should always give us something
9720 // because client quotas are always enabled
9721 assert(quota_root
!= nullptr);
9723 if (quota_root
&& cct
->_conf
->client_quota_df
&& quota_root
->quota
.max_bytes
) {
9725 // Skip the getattr if any sessions are stale, as we don't want to
9726 // block `df` if this client has e.g. been evicted, or if the MDS cluster
9728 if (!_any_stale_sessions()) {
9729 int r
= _getattr(quota_root
, 0, perms
, true);
9731 // Ignore return value: error getting latest inode metadata is not a good
9732 // reason to break "df".
9733 lderr(cct
) << "Error in getattr on quota root 0x"
9734 << std::hex
<< quota_root
->ino
<< std::dec
9735 << " statfs result may be outdated" << dendl
;
9739 // Special case: if there is a size quota set on the Inode acting
9740 // as the root for this client mount, then report the quota status
9741 // as the filesystem statistics.
9742 const fsblkcnt_t total
= quota_root
->quota
.max_bytes
>> CEPH_BLOCK_SHIFT
;
9743 const fsblkcnt_t used
= quota_root
->rstat
.rbytes
>> CEPH_BLOCK_SHIFT
;
9744 // It is possible for a quota to be exceeded: arithmetic here must
9745 // handle case where used > total.
9746 const fsblkcnt_t free
= total
> used
? total
- used
: 0;
9748 stbuf
->f_blocks
= total
;
9749 stbuf
->f_bfree
= free
;
9750 stbuf
->f_bavail
= free
;
9752 // General case: report the cluster statistics returned from RADOS. Because
9753 // multiple pools may be used without one filesystem namespace via
9754 // layouts, this is the most correct thing we can do.
9755 stbuf
->f_blocks
= stats
.kb
>> (CEPH_BLOCK_SHIFT
- 10);
9756 stbuf
->f_bfree
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
9757 stbuf
->f_bavail
= stats
.kb_avail
>> (CEPH_BLOCK_SHIFT
- 10);
9763 int Client::_do_filelock(Inode
*in
, Fh
*fh
, int lock_type
, int op
, int sleep
,
9764 struct flock
*fl
, uint64_t owner
, bool removing
)
9766 ldout(cct
, 10) << "_do_filelock ino " << in
->ino
9767 << (lock_type
== CEPH_LOCK_FCNTL
? " fcntl" : " flock")
9768 << " type " << fl
->l_type
<< " owner " << owner
9769 << " " << fl
->l_start
<< "~" << fl
->l_len
<< dendl
;
9772 if (F_RDLCK
== fl
->l_type
)
9773 lock_cmd
= CEPH_LOCK_SHARED
;
9774 else if (F_WRLCK
== fl
->l_type
)
9775 lock_cmd
= CEPH_LOCK_EXCL
;
9776 else if (F_UNLCK
== fl
->l_type
)
9777 lock_cmd
= CEPH_LOCK_UNLOCK
;
9781 if (op
!= CEPH_MDS_OP_SETFILELOCK
|| lock_cmd
== CEPH_LOCK_UNLOCK
)
9785 * Set the most significant bit, so that MDS knows the 'owner'
9786 * is sufficient to identify the owner of lock. (old code uses
9787 * both 'owner' and 'pid')
9789 owner
|= (1ULL << 63);
9791 MetaRequest
*req
= new MetaRequest(op
);
9793 in
->make_nosnap_relative_path(path
);
9794 req
->set_filepath(path
);
9797 req
->head
.args
.filelock_change
.rule
= lock_type
;
9798 req
->head
.args
.filelock_change
.type
= lock_cmd
;
9799 req
->head
.args
.filelock_change
.owner
= owner
;
9800 req
->head
.args
.filelock_change
.pid
= fl
->l_pid
;
9801 req
->head
.args
.filelock_change
.start
= fl
->l_start
;
9802 req
->head
.args
.filelock_change
.length
= fl
->l_len
;
9803 req
->head
.args
.filelock_change
.wait
= sleep
;
9808 if (sleep
&& switch_interrupt_cb
) {
9810 switch_interrupt_cb(callback_handle
, req
->get());
9811 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
9812 // disable interrupt
9813 switch_interrupt_cb(callback_handle
, NULL
);
9814 if (ret
== 0 && req
->aborted()) {
9815 // effect of this lock request has been revoked by the 'lock intr' request
9816 ret
= req
->get_abort_code();
9820 ret
= make_request(req
, fh
->actor_perms
, NULL
, NULL
, -1, &bl
);
9824 if (op
== CEPH_MDS_OP_GETFILELOCK
) {
9825 ceph_filelock filelock
;
9826 bufferlist::iterator p
= bl
.begin();
9827 ::decode(filelock
, p
);
9829 if (CEPH_LOCK_SHARED
== filelock
.type
)
9830 fl
->l_type
= F_RDLCK
;
9831 else if (CEPH_LOCK_EXCL
== filelock
.type
)
9832 fl
->l_type
= F_WRLCK
;
9834 fl
->l_type
= F_UNLCK
;
9836 fl
->l_whence
= SEEK_SET
;
9837 fl
->l_start
= filelock
.start
;
9838 fl
->l_len
= filelock
.length
;
9839 fl
->l_pid
= filelock
.pid
;
9840 } else if (op
== CEPH_MDS_OP_SETFILELOCK
) {
9841 ceph_lock_state_t
*lock_state
;
9842 if (lock_type
== CEPH_LOCK_FCNTL
) {
9843 if (!in
->fcntl_locks
)
9844 in
->fcntl_locks
= new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
);
9845 lock_state
= in
->fcntl_locks
;
9846 } else if (lock_type
== CEPH_LOCK_FLOCK
) {
9847 if (!in
->flock_locks
)
9848 in
->flock_locks
= new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
);
9849 lock_state
= in
->flock_locks
;
9854 _update_lock_state(fl
, owner
, lock_state
);
9857 if (lock_type
== CEPH_LOCK_FCNTL
) {
9858 if (!fh
->fcntl_locks
)
9859 fh
->fcntl_locks
= new ceph_lock_state_t(cct
, CEPH_LOCK_FCNTL
);
9860 lock_state
= fh
->fcntl_locks
;
9862 if (!fh
->flock_locks
)
9863 fh
->flock_locks
= new ceph_lock_state_t(cct
, CEPH_LOCK_FLOCK
);
9864 lock_state
= fh
->flock_locks
;
9866 _update_lock_state(fl
, owner
, lock_state
);
9874 int Client::_interrupt_filelock(MetaRequest
*req
)
9876 // Set abort code, but do not kick. The abort code prevents the request
9877 // from being re-sent.
9880 return 0; // haven't sent the request
9882 Inode
*in
= req
->inode();
9885 if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FLOCK
)
9886 lock_type
= CEPH_LOCK_FLOCK_INTR
;
9887 else if (req
->head
.args
.filelock_change
.rule
== CEPH_LOCK_FCNTL
)
9888 lock_type
= CEPH_LOCK_FCNTL_INTR
;
9894 MetaRequest
*intr_req
= new MetaRequest(CEPH_MDS_OP_SETFILELOCK
);
9896 in
->make_nosnap_relative_path(path
);
9897 intr_req
->set_filepath(path
);
9898 intr_req
->set_inode(in
);
9899 intr_req
->head
.args
.filelock_change
= req
->head
.args
.filelock_change
;
9900 intr_req
->head
.args
.filelock_change
.rule
= lock_type
;
9901 intr_req
->head
.args
.filelock_change
.type
= CEPH_LOCK_UNLOCK
;
9903 UserPerm
perms(req
->get_uid(), req
->get_gid());
9904 return make_request(intr_req
, perms
, NULL
, NULL
, -1);
9907 void Client::_encode_filelocks(Inode
*in
, bufferlist
& bl
)
9909 if (!in
->fcntl_locks
&& !in
->flock_locks
)
9912 unsigned nr_fcntl_locks
= in
->fcntl_locks
? in
->fcntl_locks
->held_locks
.size() : 0;
9913 ::encode(nr_fcntl_locks
, bl
);
9914 if (nr_fcntl_locks
) {
9915 ceph_lock_state_t
* lock_state
= in
->fcntl_locks
;
9916 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
9917 p
!= lock_state
->held_locks
.end();
9919 ::encode(p
->second
, bl
);
9922 unsigned nr_flock_locks
= in
->flock_locks
? in
->flock_locks
->held_locks
.size() : 0;
9923 ::encode(nr_flock_locks
, bl
);
9924 if (nr_flock_locks
) {
9925 ceph_lock_state_t
* lock_state
= in
->flock_locks
;
9926 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
9927 p
!= lock_state
->held_locks
.end();
9929 ::encode(p
->second
, bl
);
9932 ldout(cct
, 10) << "_encode_filelocks ino " << in
->ino
<< ", " << nr_fcntl_locks
9933 << " fcntl locks, " << nr_flock_locks
<< " flock locks" << dendl
;
9936 void Client::_release_filelocks(Fh
*fh
)
9938 if (!fh
->fcntl_locks
&& !fh
->flock_locks
)
9941 Inode
*in
= fh
->inode
.get();
9942 ldout(cct
, 10) << "_release_filelocks " << fh
<< " ino " << in
->ino
<< dendl
;
9944 list
<pair
<int, ceph_filelock
> > to_release
;
9946 if (fh
->fcntl_locks
) {
9947 ceph_lock_state_t
* lock_state
= fh
->fcntl_locks
;
9948 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
9949 p
!= lock_state
->held_locks
.end();
9951 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FCNTL
, p
->second
));
9952 delete fh
->fcntl_locks
;
9954 if (fh
->flock_locks
) {
9955 ceph_lock_state_t
* lock_state
= fh
->flock_locks
;
9956 for(multimap
<uint64_t, ceph_filelock
>::iterator p
= lock_state
->held_locks
.begin();
9957 p
!= lock_state
->held_locks
.end();
9959 to_release
.push_back(pair
<int, ceph_filelock
>(CEPH_LOCK_FLOCK
, p
->second
));
9960 delete fh
->flock_locks
;
9963 if (to_release
.empty())
9967 memset(&fl
, 0, sizeof(fl
));
9968 fl
.l_whence
= SEEK_SET
;
9969 fl
.l_type
= F_UNLCK
;
9971 for (list
<pair
<int, ceph_filelock
> >::iterator p
= to_release
.begin();
9972 p
!= to_release
.end();
9974 fl
.l_start
= p
->second
.start
;
9975 fl
.l_len
= p
->second
.length
;
9976 fl
.l_pid
= p
->second
.pid
;
9977 _do_filelock(in
, fh
, p
->first
, CEPH_MDS_OP_SETFILELOCK
, 0, &fl
,
9978 p
->second
.owner
, true);
9982 void Client::_update_lock_state(struct flock
*fl
, uint64_t owner
,
9983 ceph_lock_state_t
*lock_state
)
9986 if (F_RDLCK
== fl
->l_type
)
9987 lock_cmd
= CEPH_LOCK_SHARED
;
9988 else if (F_WRLCK
== fl
->l_type
)
9989 lock_cmd
= CEPH_LOCK_EXCL
;
9991 lock_cmd
= CEPH_LOCK_UNLOCK
;;
9993 ceph_filelock filelock
;
9994 filelock
.start
= fl
->l_start
;
9995 filelock
.length
= fl
->l_len
;
9996 filelock
.client
= 0;
9997 // see comment in _do_filelock()
9998 filelock
.owner
= owner
| (1ULL << 63);
9999 filelock
.pid
= fl
->l_pid
;
10000 filelock
.type
= lock_cmd
;
10002 if (filelock
.type
== CEPH_LOCK_UNLOCK
) {
10003 list
<ceph_filelock
> activated_locks
;
10004 lock_state
->remove_lock(filelock
, activated_locks
);
10006 bool r
= lock_state
->add_lock(filelock
, false, false, NULL
);
10011 int Client::_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
10013 Inode
*in
= fh
->inode
.get();
10014 ldout(cct
, 10) << "_getlk " << fh
<< " ino " << in
->ino
<< dendl
;
10015 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_GETFILELOCK
, 0, fl
, owner
);
10019 int Client::_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
10021 Inode
*in
= fh
->inode
.get();
10022 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< dendl
;
10023 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FCNTL
, CEPH_MDS_OP_SETFILELOCK
, sleep
, fl
, owner
);
10024 ldout(cct
, 10) << "_setlk " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
10028 int Client::_flock(Fh
*fh
, int cmd
, uint64_t owner
)
10030 Inode
*in
= fh
->inode
.get();
10031 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< dendl
;
10033 int sleep
= !(cmd
& LOCK_NB
);
10052 memset(&fl
, 0, sizeof(fl
));
10054 fl
.l_whence
= SEEK_SET
;
10056 int ret
= _do_filelock(in
, fh
, CEPH_LOCK_FLOCK
, CEPH_MDS_OP_SETFILELOCK
, sleep
, &fl
, owner
);
10057 ldout(cct
, 10) << "_flock " << fh
<< " ino " << in
->ino
<< " result=" << ret
<< dendl
;
10061 int Client::ll_statfs(Inode
*in
, struct statvfs
*stbuf
, const UserPerm
& perms
)
10063 /* Since the only thing this does is wrap a call to statfs, and
10064 statfs takes a lock, it doesn't seem we have a need to split it
10066 return statfs(0, stbuf
, perms
);
10069 void Client::ll_register_callbacks(struct client_callback_args
*args
)
10073 Mutex::Locker
l(client_lock
);
10074 ldout(cct
, 10) << "ll_register_callbacks cb " << args
->handle
10075 << " invalidate_ino_cb " << args
->ino_cb
10076 << " invalidate_dentry_cb " << args
->dentry_cb
10077 << " switch_interrupt_cb " << args
->switch_intr_cb
10078 << " remount_cb " << args
->remount_cb
10080 callback_handle
= args
->handle
;
10081 if (args
->ino_cb
) {
10082 ino_invalidate_cb
= args
->ino_cb
;
10083 async_ino_invalidator
.start();
10085 if (args
->dentry_cb
) {
10086 dentry_invalidate_cb
= args
->dentry_cb
;
10087 async_dentry_invalidator
.start();
10089 if (args
->switch_intr_cb
) {
10090 switch_interrupt_cb
= args
->switch_intr_cb
;
10091 interrupt_finisher
.start();
10093 if (args
->remount_cb
) {
10094 remount_cb
= args
->remount_cb
;
10095 remount_finisher
.start();
10097 umask_cb
= args
->umask_cb
;
10100 int Client::test_dentry_handling(bool can_invalidate
)
10104 can_invalidate_dentries
= can_invalidate
;
10106 if (can_invalidate_dentries
) {
10107 assert(dentry_invalidate_cb
);
10108 ldout(cct
, 1) << "using dentry_invalidate_cb" << dendl
;
10110 } else if (remount_cb
) {
10111 ldout(cct
, 1) << "using remount_cb" << dendl
;
10115 bool should_abort
= cct
->_conf
->get_val
<bool>("client_die_on_failed_dentry_invalidate");
10116 if (should_abort
) {
10117 lderr(cct
) << "no method to invalidate kernel dentry cache; quitting!" << dendl
;
10120 lderr(cct
) << "no method to invalidate kernel dentry cache; expect issues!" << dendl
;
10126 int Client::_sync_fs()
10128 ldout(cct
, 10) << "_sync_fs" << dendl
;
10131 Mutex
lock("Client::_fsync::lock");
10133 bool flush_done
= false;
10134 if (cct
->_conf
->client_oc
)
10135 objectcacher
->flush_all(new C_SafeCond(&lock
, &cond
, &flush_done
));
10141 ceph_tid_t flush_tid
= last_flush_tid
;
10143 // wait for unsafe mds requests
10144 wait_unsafe_requests();
10146 wait_sync_caps(flush_tid
);
10149 client_lock
.Unlock();
10151 ldout(cct
, 15) << "waiting on data to flush" << dendl
;
10152 while (!flush_done
)
10155 client_lock
.Lock();
10161 int Client::sync_fs()
10163 Mutex::Locker
l(client_lock
);
10171 int64_t Client::drop_caches()
10173 Mutex::Locker
l(client_lock
);
10174 return objectcacher
->release_all();
10178 int Client::lazyio_propogate(int fd
, loff_t offset
, size_t count
)
10180 Mutex::Locker
l(client_lock
);
10181 ldout(cct
, 3) << "op: client->lazyio_propogate(" << fd
10182 << ", " << offset
<< ", " << count
<< ")" << dendl
;
10184 Fh
*f
= get_filehandle(fd
);
10194 int Client::lazyio_synchronize(int fd
, loff_t offset
, size_t count
)
10196 Mutex::Locker
l(client_lock
);
10197 ldout(cct
, 3) << "op: client->lazyio_synchronize(" << fd
10198 << ", " << offset
<< ", " << count
<< ")" << dendl
;
10200 Fh
*f
= get_filehandle(fd
);
10203 Inode
*in
= f
->inode
.get();
10212 // =============================
10215 int Client::mksnap(const char *relpath
, const char *name
, const UserPerm
& perm
)
10217 Mutex::Locker
l(client_lock
);
10222 filepath
path(relpath
);
10224 int r
= path_walk(path
, &in
, perm
);
10227 if (cct
->_conf
->client_permissions
) {
10228 r
= may_create(in
.get(), perm
);
10232 Inode
*snapdir
= open_snapdir(in
.get());
10233 return _mkdir(snapdir
, name
, 0, perm
);
10236 int Client::rmsnap(const char *relpath
, const char *name
, const UserPerm
& perms
)
10238 Mutex::Locker
l(client_lock
);
10243 filepath
path(relpath
);
10245 int r
= path_walk(path
, &in
, perms
);
10248 if (cct
->_conf
->client_permissions
) {
10249 r
= may_delete(in
.get(), NULL
, perms
);
10253 Inode
*snapdir
= open_snapdir(in
.get());
10254 return _rmdir(snapdir
, name
, perms
);
10257 // =============================
10260 int Client::get_caps_issued(int fd
) {
10262 Mutex::Locker
lock(client_lock
);
10267 Fh
*f
= get_filehandle(fd
);
10271 return f
->inode
->caps_issued();
10274 int Client::get_caps_issued(const char *path
, const UserPerm
& perms
)
10276 Mutex::Locker
lock(client_lock
);
10283 int r
= path_walk(p
, &in
, perms
, true);
10286 return in
->caps_issued();
10289 // =========================================
10292 Inode
*Client::open_snapdir(Inode
*diri
)
10295 vinodeno_t
vino(diri
->ino
, CEPH_SNAPDIR
);
10296 if (!inode_map
.count(vino
)) {
10297 in
= new Inode(this, vino
, &diri
->layout
);
10299 in
->ino
= diri
->ino
;
10300 in
->snapid
= CEPH_SNAPDIR
;
10301 in
->mode
= diri
->mode
;
10302 in
->uid
= diri
->uid
;
10303 in
->gid
= diri
->gid
;
10304 in
->mtime
= diri
->mtime
;
10305 in
->ctime
= diri
->ctime
;
10306 in
->btime
= diri
->btime
;
10307 in
->size
= diri
->size
;
10308 in
->change_attr
= diri
->change_attr
;
10310 in
->dirfragtree
.clear();
10311 in
->snapdir_parent
= diri
;
10312 diri
->flags
|= I_SNAPDIR_OPEN
;
10313 inode_map
[vino
] = in
;
10314 if (use_faked_inos())
10315 _assign_faked_ino(in
);
10316 ldout(cct
, 10) << "open_snapdir created snapshot inode " << *in
<< dendl
;
10318 in
= inode_map
[vino
];
10319 ldout(cct
, 10) << "open_snapdir had snapshot inode " << *in
<< dendl
;
10324 int Client::ll_lookup(Inode
*parent
, const char *name
, struct stat
*attr
,
10325 Inode
**out
, const UserPerm
& perms
)
10327 Mutex::Locker
lock(client_lock
);
10328 vinodeno_t vparent
= _get_vino(parent
);
10329 ldout(cct
, 3) << "ll_lookup " << vparent
<< " " << name
<< dendl
;
10330 tout(cct
) << "ll_lookup" << std::endl
;
10331 tout(cct
) << name
<< std::endl
;
10337 if (!cct
->_conf
->fuse_default_permissions
) {
10338 r
= may_lookup(parent
, perms
);
10343 string
dname(name
);
10346 r
= _lookup(parent
, dname
, CEPH_STAT_CAP_INODE_ALL
, &in
, perms
);
10353 fill_stat(in
, attr
);
10357 ldout(cct
, 3) << "ll_lookup " << vparent
<< " " << name
10358 << " -> " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
10359 tout(cct
) << attr
->st_ino
<< std::endl
;
10364 int Client::ll_lookup_inode(
10365 struct inodeno_t ino
,
10366 const UserPerm
& perms
,
10369 Mutex::Locker
lock(client_lock
);
10370 ldout(cct
, 3) << "ll_lookup_inode " << ino
<< dendl
;
10372 // Num1: get inode and *inode
10373 int r
= _lookup_ino(ino
, perms
, inode
);
10377 assert(inode
!= NULL
);
10378 assert(*inode
!= NULL
);
10380 // Num2: Request the parent inode, so that we can look up the name
10382 r
= _lookup_parent(*inode
, perms
, &parent
);
10383 if (r
&& r
!= -EINVAL
) {
10384 // Unexpected error
10385 _ll_forget(*inode
, 1);
10387 } else if (r
== -EINVAL
) {
10388 // EINVAL indicates node without parents (root), drop out now
10389 // and don't try to look up the non-existent dentry.
10392 // FIXME: I don't think this works; lookup_parent() returns 0 if the parent
10393 // is already in cache
10394 assert(parent
!= NULL
);
10396 // Num3: Finally, get the name (dentry) of the requested inode
10397 r
= _lookup_name(*inode
, parent
, perms
);
10399 // Unexpected error
10400 _ll_forget(parent
, 1);
10401 _ll_forget(*inode
, 1);
10405 _ll_forget(parent
, 1);
10409 int Client::ll_lookupx(Inode
*parent
, const char *name
, Inode
**out
,
10410 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
10411 const UserPerm
& perms
)
10413 Mutex::Locker
lock(client_lock
);
10414 vinodeno_t vparent
= _get_vino(parent
);
10415 ldout(cct
, 3) << "ll_lookupx " << vparent
<< " " << name
<< dendl
;
10416 tout(cct
) << "ll_lookupx" << std::endl
;
10417 tout(cct
) << name
<< std::endl
;
10423 if (!cct
->_conf
->fuse_default_permissions
) {
10424 r
= may_lookup(parent
, perms
);
10429 string
dname(name
);
10432 unsigned mask
= statx_to_mask(flags
, want
);
10433 r
= _lookup(parent
, dname
, mask
, &in
, perms
);
10439 fill_statx(in
, mask
, stx
);
10443 ldout(cct
, 3) << "ll_lookupx " << vparent
<< " " << name
10444 << " -> " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
10445 tout(cct
) << stx
->stx_ino
<< std::endl
;
10450 int Client::ll_walk(const char* name
, Inode
**out
, struct ceph_statx
*stx
,
10451 unsigned int want
, unsigned int flags
, const UserPerm
& perms
)
10453 Mutex::Locker
lock(client_lock
);
10458 filepath
fp(name
, 0);
10461 unsigned mask
= statx_to_mask(flags
, want
);
10463 ldout(cct
, 3) << "ll_walk" << name
<< dendl
;
10464 tout(cct
) << "ll_walk" << std::endl
;
10465 tout(cct
) << name
<< std::endl
;
10467 rc
= path_walk(fp
, &in
, perms
, !(flags
& AT_SYMLINK_NOFOLLOW
), mask
);
10469 /* zero out mask, just in case... */
10476 fill_statx(in
, mask
, stx
);
10483 void Client::_ll_get(Inode
*in
)
10485 if (in
->ll_ref
== 0) {
10487 if (in
->is_dir() && !in
->dn_set
.empty()) {
10488 assert(in
->dn_set
.size() == 1); // dirs can't be hard-linked
10489 in
->get_first_parent()->get(); // pin dentry
10493 ldout(cct
, 20) << "_ll_get " << in
<< " " << in
->ino
<< " -> " << in
->ll_ref
<< dendl
;
10496 int Client::_ll_put(Inode
*in
, int num
)
10499 ldout(cct
, 20) << "_ll_put " << in
<< " " << in
->ino
<< " " << num
<< " -> " << in
->ll_ref
<< dendl
;
10500 if (in
->ll_ref
== 0) {
10501 if (in
->is_dir() && !in
->dn_set
.empty()) {
10502 assert(in
->dn_set
.size() == 1); // dirs can't be hard-linked
10503 in
->get_first_parent()->put(); // unpin dentry
10512 void Client::_ll_drop_pins()
10514 ldout(cct
, 10) << "_ll_drop_pins" << dendl
;
10515 std::set
<InodeRef
> to_be_put
; //this set will be deconstructed item by item when exit
10516 ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator next
;
10517 for (ceph::unordered_map
<vinodeno_t
, Inode
*>::iterator it
= inode_map
.begin();
10518 it
!= inode_map
.end();
10520 Inode
*in
= it
->second
;
10524 to_be_put
.insert(in
);
10525 _ll_put(in
, in
->ll_ref
);
10530 bool Client::_ll_forget(Inode
*in
, int count
)
10532 inodeno_t ino
= _get_inodeno(in
);
10534 ldout(cct
, 8) << "ll_forget " << ino
<< " " << count
<< dendl
;
10535 tout(cct
) << "ll_forget" << std::endl
;
10536 tout(cct
) << ino
.val
<< std::endl
;
10537 tout(cct
) << count
<< std::endl
;
10539 // Ignore forget if we're no longer mounted
10543 if (ino
== 1) return true; // ignore forget on root.
10546 if (in
->ll_ref
< count
) {
10547 ldout(cct
, 1) << "WARNING: ll_forget on " << ino
<< " " << count
10548 << ", which only has ll_ref=" << in
->ll_ref
<< dendl
;
10549 _ll_put(in
, in
->ll_ref
);
10552 if (_ll_put(in
, count
) == 0)
10559 bool Client::ll_forget(Inode
*in
, int count
)
10561 Mutex::Locker
lock(client_lock
);
10562 return _ll_forget(in
, count
);
10565 bool Client::ll_put(Inode
*in
)
10567 /* ll_forget already takes the lock */
10568 return ll_forget(in
, 1);
10571 snapid_t
Client::ll_get_snapid(Inode
*in
)
10573 Mutex::Locker
lock(client_lock
);
10577 Inode
*Client::ll_get_inode(ino_t ino
)
10579 Mutex::Locker
lock(client_lock
);
10584 vinodeno_t vino
= _map_faked_ino(ino
);
10585 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
10586 if (p
== inode_map
.end())
10588 Inode
*in
= p
->second
;
10593 Inode
*Client::ll_get_inode(vinodeno_t vino
)
10595 Mutex::Locker
lock(client_lock
);
10600 unordered_map
<vinodeno_t
,Inode
*>::iterator p
= inode_map
.find(vino
);
10601 if (p
== inode_map
.end())
10603 Inode
*in
= p
->second
;
10608 int Client::_ll_getattr(Inode
*in
, int caps
, const UserPerm
& perms
)
10610 vinodeno_t vino
= _get_vino(in
);
10612 ldout(cct
, 8) << "ll_getattr " << vino
<< dendl
;
10613 tout(cct
) << "ll_getattr" << std::endl
;
10614 tout(cct
) << vino
.ino
.val
<< std::endl
;
10616 if (vino
.snapid
< CEPH_NOSNAP
)
10619 return _getattr(in
, caps
, perms
);
10622 int Client::ll_getattr(Inode
*in
, struct stat
*attr
, const UserPerm
& perms
)
10624 Mutex::Locker
lock(client_lock
);
10629 int res
= _ll_getattr(in
, CEPH_STAT_CAP_INODE_ALL
, perms
);
10632 fill_stat(in
, attr
);
10633 ldout(cct
, 3) << "ll_getattr " << _get_vino(in
) << " = " << res
<< dendl
;
10637 int Client::ll_getattrx(Inode
*in
, struct ceph_statx
*stx
, unsigned int want
,
10638 unsigned int flags
, const UserPerm
& perms
)
10640 Mutex::Locker
lock(client_lock
);
10646 unsigned mask
= statx_to_mask(flags
, want
);
10648 if (mask
&& !in
->caps_issued_mask(mask
, true))
10649 res
= _ll_getattr(in
, mask
, perms
);
10652 fill_statx(in
, mask
, stx
);
10653 ldout(cct
, 3) << "ll_getattrx " << _get_vino(in
) << " = " << res
<< dendl
;
10657 int Client::_ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
10658 const UserPerm
& perms
, InodeRef
*inp
)
10660 vinodeno_t vino
= _get_vino(in
);
10662 ldout(cct
, 8) << "ll_setattrx " << vino
<< " mask " << hex
<< mask
<< dec
10664 tout(cct
) << "ll_setattrx" << std::endl
;
10665 tout(cct
) << vino
.ino
.val
<< std::endl
;
10666 tout(cct
) << stx
->stx_mode
<< std::endl
;
10667 tout(cct
) << stx
->stx_uid
<< std::endl
;
10668 tout(cct
) << stx
->stx_gid
<< std::endl
;
10669 tout(cct
) << stx
->stx_size
<< std::endl
;
10670 tout(cct
) << stx
->stx_mtime
<< std::endl
;
10671 tout(cct
) << stx
->stx_atime
<< std::endl
;
10672 tout(cct
) << stx
->stx_btime
<< std::endl
;
10673 tout(cct
) << mask
<< std::endl
;
10675 if (!cct
->_conf
->fuse_default_permissions
) {
10676 int res
= may_setattr(in
, stx
, mask
, perms
);
10681 mask
&= ~(CEPH_SETATTR_MTIME_NOW
| CEPH_SETATTR_ATIME_NOW
);
10683 return __setattrx(in
, stx
, mask
, perms
, inp
);
10686 int Client::ll_setattrx(Inode
*in
, struct ceph_statx
*stx
, int mask
,
10687 const UserPerm
& perms
)
10689 Mutex::Locker
lock(client_lock
);
10694 InodeRef
target(in
);
10695 int res
= _ll_setattrx(in
, stx
, mask
, perms
, &target
);
10697 assert(in
== target
.get());
10698 fill_statx(in
, in
->caps_issued(), stx
);
10701 ldout(cct
, 3) << "ll_setattrx " << _get_vino(in
) << " = " << res
<< dendl
;
10705 int Client::ll_setattr(Inode
*in
, struct stat
*attr
, int mask
,
10706 const UserPerm
& perms
)
10708 struct ceph_statx stx
;
10709 stat_to_statx(attr
, &stx
);
10711 Mutex::Locker
lock(client_lock
);
10716 InodeRef
target(in
);
10717 int res
= _ll_setattrx(in
, &stx
, mask
, perms
, &target
);
10719 assert(in
== target
.get());
10720 fill_stat(in
, attr
);
10723 ldout(cct
, 3) << "ll_setattr " << _get_vino(in
) << " = " << res
<< dendl
;
10731 int Client::getxattr(const char *path
, const char *name
, void *value
, size_t size
,
10732 const UserPerm
& perms
)
10734 Mutex::Locker
lock(client_lock
);
10740 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
10743 return _getxattr(in
, name
, value
, size
, perms
);
10746 int Client::lgetxattr(const char *path
, const char *name
, void *value
, size_t size
,
10747 const UserPerm
& perms
)
10749 Mutex::Locker
lock(client_lock
);
10755 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
10758 return _getxattr(in
, name
, value
, size
, perms
);
10761 int Client::fgetxattr(int fd
, const char *name
, void *value
, size_t size
,
10762 const UserPerm
& perms
)
10764 Mutex::Locker
lock(client_lock
);
10769 Fh
*f
= get_filehandle(fd
);
10772 return _getxattr(f
->inode
, name
, value
, size
, perms
);
10775 int Client::listxattr(const char *path
, char *list
, size_t size
,
10776 const UserPerm
& perms
)
10778 Mutex::Locker
lock(client_lock
);
10784 int r
= Client::path_walk(path
, &in
, perms
, true, CEPH_STAT_CAP_XATTR
);
10787 return Client::_listxattr(in
.get(), list
, size
, perms
);
10790 int Client::llistxattr(const char *path
, char *list
, size_t size
,
10791 const UserPerm
& perms
)
10793 Mutex::Locker
lock(client_lock
);
10799 int r
= Client::path_walk(path
, &in
, perms
, false, CEPH_STAT_CAP_XATTR
);
10802 return Client::_listxattr(in
.get(), list
, size
, perms
);
10805 int Client::flistxattr(int fd
, char *list
, size_t size
, const UserPerm
& perms
)
10807 Mutex::Locker
lock(client_lock
);
10812 Fh
*f
= get_filehandle(fd
);
10815 return Client::_listxattr(f
->inode
.get(), list
, size
, perms
);
10818 int Client::removexattr(const char *path
, const char *name
,
10819 const UserPerm
& perms
)
10821 Mutex::Locker
lock(client_lock
);
10827 int r
= Client::path_walk(path
, &in
, perms
, true);
10830 return _removexattr(in
, name
, perms
);
10833 int Client::lremovexattr(const char *path
, const char *name
,
10834 const UserPerm
& perms
)
10836 Mutex::Locker
lock(client_lock
);
10842 int r
= Client::path_walk(path
, &in
, perms
, false);
10845 return _removexattr(in
, name
, perms
);
10848 int Client::fremovexattr(int fd
, const char *name
, const UserPerm
& perms
)
10850 Mutex::Locker
lock(client_lock
);
10855 Fh
*f
= get_filehandle(fd
);
10858 return _removexattr(f
->inode
, name
, perms
);
10861 int Client::setxattr(const char *path
, const char *name
, const void *value
,
10862 size_t size
, int flags
, const UserPerm
& perms
)
10864 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
10866 Mutex::Locker
lock(client_lock
);
10872 int r
= Client::path_walk(path
, &in
, perms
, true);
10875 return _setxattr(in
, name
, value
, size
, flags
, perms
);
10878 int Client::lsetxattr(const char *path
, const char *name
, const void *value
,
10879 size_t size
, int flags
, const UserPerm
& perms
)
10881 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
10883 Mutex::Locker
lock(client_lock
);
10889 int r
= Client::path_walk(path
, &in
, perms
, false);
10892 return _setxattr(in
, name
, value
, size
, flags
, perms
);
10895 int Client::fsetxattr(int fd
, const char *name
, const void *value
, size_t size
,
10896 int flags
, const UserPerm
& perms
)
10898 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
10900 Mutex::Locker
lock(client_lock
);
10905 Fh
*f
= get_filehandle(fd
);
10908 return _setxattr(f
->inode
, name
, value
, size
, flags
, perms
);
10911 int Client::_getxattr(Inode
*in
, const char *name
, void *value
, size_t size
,
10912 const UserPerm
& perms
)
10916 const VXattr
*vxattr
= _match_vxattr(in
, name
);
10920 // Do a force getattr to get the latest quota before returning
10921 // a value to userspace.
10923 if (vxattr
->flags
& VXATTR_RSTAT
) {
10924 flags
|= CEPH_STAT_RSTAT
;
10926 r
= _getattr(in
, flags
, perms
, true);
10928 // Error from getattr!
10932 // call pointer-to-member function
10934 if (!(vxattr
->exists_cb
&& !(this->*(vxattr
->exists_cb
))(in
))) {
10935 r
= (this->*(vxattr
->getxattr_cb
))(in
, buf
, sizeof(buf
));
10941 if (r
> (int)size
) {
10943 } else if (r
> 0) {
10944 memcpy(value
, buf
, r
);
10950 if (acl_type
== NO_ACL
&& !strncmp(name
, "system.", 7)) {
10955 r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
10959 if (in
->xattrs
.count(n
)) {
10960 r
= in
->xattrs
[n
].length();
10961 if (r
> 0 && size
!= 0) {
10962 if (size
>= (unsigned)r
)
10963 memcpy(value
, in
->xattrs
[n
].c_str(), r
);
10970 ldout(cct
, 8) << "_getxattr(" << in
->ino
<< ", \"" << name
<< "\", " << size
<< ") = " << r
<< dendl
;
10974 int Client::_getxattr(InodeRef
&in
, const char *name
, void *value
, size_t size
,
10975 const UserPerm
& perms
)
10977 if (cct
->_conf
->client_permissions
) {
10978 int r
= xattr_permission(in
.get(), name
, MAY_READ
, perms
);
10982 return _getxattr(in
.get(), name
, value
, size
, perms
);
10985 int Client::ll_getxattr(Inode
*in
, const char *name
, void *value
,
10986 size_t size
, const UserPerm
& perms
)
10988 Mutex::Locker
lock(client_lock
);
10993 vinodeno_t vino
= _get_vino(in
);
10995 ldout(cct
, 3) << "ll_getxattr " << vino
<< " " << name
<< " size " << size
<< dendl
;
10996 tout(cct
) << "ll_getxattr" << std::endl
;
10997 tout(cct
) << vino
.ino
.val
<< std::endl
;
10998 tout(cct
) << name
<< std::endl
;
11000 if (!cct
->_conf
->fuse_default_permissions
) {
11001 int r
= xattr_permission(in
, name
, MAY_READ
, perms
);
11006 return _getxattr(in
, name
, value
, size
, perms
);
11009 int Client::_listxattr(Inode
*in
, char *name
, size_t size
,
11010 const UserPerm
& perms
)
11012 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
11014 for (map
<string
,bufferptr
>::iterator p
= in
->xattrs
.begin();
11015 p
!= in
->xattrs
.end();
11017 r
+= p
->first
.length() + 1;
11019 const VXattr
*vxattrs
= _get_vxattrs(in
);
11020 r
+= _vxattrs_name_size(vxattrs
);
11023 if (size
>= (unsigned)r
) {
11024 for (map
<string
,bufferptr
>::iterator p
= in
->xattrs
.begin();
11025 p
!= in
->xattrs
.end();
11027 memcpy(name
, p
->first
.c_str(), p
->first
.length());
11028 name
+= p
->first
.length();
11033 for (int i
= 0; !vxattrs
[i
].name
.empty(); i
++) {
11034 const VXattr
& vxattr
= vxattrs
[i
];
11037 // call pointer-to-member function
11038 if(vxattr
.exists_cb
&& !(this->*(vxattr
.exists_cb
))(in
))
11040 memcpy(name
, vxattr
.name
.c_str(), vxattr
.name
.length());
11041 name
+= vxattr
.name
.length();
11050 ldout(cct
, 8) << "_listxattr(" << in
->ino
<< ", " << size
<< ") = " << r
<< dendl
;
11054 int Client::ll_listxattr(Inode
*in
, char *names
, size_t size
,
11055 const UserPerm
& perms
)
11057 Mutex::Locker
lock(client_lock
);
11062 vinodeno_t vino
= _get_vino(in
);
11064 ldout(cct
, 3) << "ll_listxattr " << vino
<< " size " << size
<< dendl
;
11065 tout(cct
) << "ll_listxattr" << std::endl
;
11066 tout(cct
) << vino
.ino
.val
<< std::endl
;
11067 tout(cct
) << size
<< std::endl
;
11069 return _listxattr(in
, names
, size
, perms
);
11072 int Client::_do_setxattr(Inode
*in
, const char *name
, const void *value
,
11073 size_t size
, int flags
, const UserPerm
& perms
)
11076 int xattr_flags
= 0;
11078 xattr_flags
|= CEPH_XATTR_REMOVE
;
11079 if (flags
& XATTR_CREATE
)
11080 xattr_flags
|= CEPH_XATTR_CREATE
;
11081 if (flags
& XATTR_REPLACE
)
11082 xattr_flags
|= CEPH_XATTR_REPLACE
;
11084 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SETXATTR
);
11086 in
->make_nosnap_relative_path(path
);
11087 req
->set_filepath(path
);
11088 req
->set_string2(name
);
11089 req
->set_inode(in
);
11090 req
->head
.args
.setxattr
.flags
= xattr_flags
;
11093 bl
.append((const char*)value
, size
);
11096 int res
= make_request(req
, perms
);
11099 ldout(cct
, 3) << "_setxattr(" << in
->ino
<< ", \"" << name
<< "\") = " <<
11104 int Client::_setxattr(Inode
*in
, const char *name
, const void *value
,
11105 size_t size
, int flags
, const UserPerm
& perms
)
11107 if (in
->snapid
!= CEPH_NOSNAP
) {
11111 bool posix_acl_xattr
= false;
11112 if (acl_type
== POSIX_ACL
)
11113 posix_acl_xattr
= !strncmp(name
, "system.", 7);
11115 if (strncmp(name
, "user.", 5) &&
11116 strncmp(name
, "security.", 9) &&
11117 strncmp(name
, "trusted.", 8) &&
11118 strncmp(name
, "ceph.", 5) &&
11120 return -EOPNOTSUPP
;
11122 if (posix_acl_xattr
) {
11123 if (!strcmp(name
, ACL_EA_ACCESS
)) {
11124 mode_t new_mode
= in
->mode
;
11126 int ret
= posix_acl_equiv_mode(value
, size
, &new_mode
);
11133 if (new_mode
!= in
->mode
) {
11134 struct ceph_statx stx
;
11135 stx
.stx_mode
= new_mode
;
11136 ret
= _do_setattr(in
, &stx
, CEPH_SETATTR_MODE
, perms
, NULL
);
11141 } else if (!strcmp(name
, ACL_EA_DEFAULT
)) {
11143 if (!S_ISDIR(in
->mode
))
11145 int ret
= posix_acl_check(value
, size
);
11154 return -EOPNOTSUPP
;
11157 const VXattr
*vxattr
= _match_vxattr(in
, name
);
11158 if (vxattr
&& vxattr
->readonly
)
11159 return -EOPNOTSUPP
;
11162 return _do_setxattr(in
, name
, value
, size
, flags
, perms
);
11165 int Client::_setxattr(InodeRef
&in
, const char *name
, const void *value
,
11166 size_t size
, int flags
, const UserPerm
& perms
)
11168 if (cct
->_conf
->client_permissions
) {
11169 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
11173 return _setxattr(in
.get(), name
, value
, size
, flags
, perms
);
11176 int Client::_setxattr_check_data_pool(string
& name
, string
& value
, const OSDMap
*osdmap
)
11179 if (name
== "layout") {
11180 string::iterator begin
= value
.begin();
11181 string::iterator end
= value
.end();
11182 keys_and_values
<string::iterator
> p
; // create instance of parser
11183 std::map
<string
, string
> m
; // map to receive results
11184 if (!qi::parse(begin
, end
, p
, m
)) { // returns true if successful
11189 for (map
<string
,string
>::iterator q
= m
.begin(); q
!= m
.end(); ++q
) {
11190 if (q
->first
== "pool") {
11195 } else if (name
== "layout.pool") {
11199 if (tmp
.length()) {
11202 pool
= boost::lexical_cast
<unsigned>(tmp
);
11203 if (!osdmap
->have_pg_pool(pool
))
11205 } catch (boost::bad_lexical_cast
const&) {
11206 pool
= osdmap
->lookup_pg_pool_name(tmp
);
11216 void Client::_setxattr_maybe_wait_for_osdmap(const char *name
, const void *value
, size_t size
)
11218 // For setting pool of layout, MetaRequest need osdmap epoch.
11219 // There is a race which create a new data pool but client and mds both don't have.
11220 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11221 if (strcmp(name
, "ceph.file.layout.pool") == 0 || strcmp(name
, "ceph.dir.layout.pool") == 0 ||
11222 strcmp(name
, "ceph.file.layout") == 0 || strcmp(name
, "ceph.dir.layout") == 0) {
11223 string
rest(strstr(name
, "layout"));
11224 string
v((const char*)value
, size
);
11225 int r
= objecter
->with_osdmap([&](const OSDMap
& o
) {
11226 return _setxattr_check_data_pool(rest
, v
, &o
);
11229 if (r
== -ENOENT
) {
11231 objecter
->wait_for_latest_osdmap(&ctx
);
11237 int Client::ll_setxattr(Inode
*in
, const char *name
, const void *value
,
11238 size_t size
, int flags
, const UserPerm
& perms
)
11240 _setxattr_maybe_wait_for_osdmap(name
, value
, size
);
11242 Mutex::Locker
lock(client_lock
);
11247 vinodeno_t vino
= _get_vino(in
);
11249 ldout(cct
, 3) << "ll_setxattr " << vino
<< " " << name
<< " size " << size
<< dendl
;
11250 tout(cct
) << "ll_setxattr" << std::endl
;
11251 tout(cct
) << vino
.ino
.val
<< std::endl
;
11252 tout(cct
) << name
<< std::endl
;
11254 if (!cct
->_conf
->fuse_default_permissions
) {
11255 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
11259 return _setxattr(in
, name
, value
, size
, flags
, perms
);
11262 int Client::_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
11264 if (in
->snapid
!= CEPH_NOSNAP
) {
11268 // same xattrs supported by kernel client
11269 if (strncmp(name
, "user.", 5) &&
11270 strncmp(name
, "system.", 7) &&
11271 strncmp(name
, "security.", 9) &&
11272 strncmp(name
, "trusted.", 8) &&
11273 strncmp(name
, "ceph.", 5))
11274 return -EOPNOTSUPP
;
11276 const VXattr
*vxattr
= _match_vxattr(in
, name
);
11277 if (vxattr
&& vxattr
->readonly
)
11278 return -EOPNOTSUPP
;
11280 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_RMXATTR
);
11282 in
->make_nosnap_relative_path(path
);
11283 req
->set_filepath(path
);
11284 req
->set_filepath2(name
);
11285 req
->set_inode(in
);
11287 int res
= make_request(req
, perms
);
11290 ldout(cct
, 8) << "_removexattr(" << in
->ino
<< ", \"" << name
<< "\") = " << res
<< dendl
;
11294 int Client::_removexattr(InodeRef
&in
, const char *name
, const UserPerm
& perms
)
11296 if (cct
->_conf
->client_permissions
) {
11297 int r
= xattr_permission(in
.get(), name
, MAY_WRITE
, perms
);
11301 return _removexattr(in
.get(), name
, perms
);
11304 int Client::ll_removexattr(Inode
*in
, const char *name
, const UserPerm
& perms
)
11306 Mutex::Locker
lock(client_lock
);
11311 vinodeno_t vino
= _get_vino(in
);
11313 ldout(cct
, 3) << "ll_removexattr " << vino
<< " " << name
<< dendl
;
11314 tout(cct
) << "ll_removexattr" << std::endl
;
11315 tout(cct
) << vino
.ino
.val
<< std::endl
;
11316 tout(cct
) << name
<< std::endl
;
11318 if (!cct
->_conf
->fuse_default_permissions
) {
11319 int r
= xattr_permission(in
, name
, MAY_WRITE
, perms
);
11324 return _removexattr(in
, name
, perms
);
11327 bool Client::_vxattrcb_quota_exists(Inode
*in
)
11329 return in
->quota
.is_enable();
11331 size_t Client::_vxattrcb_quota(Inode
*in
, char *val
, size_t size
)
11333 return snprintf(val
, size
,
11334 "max_bytes=%lld max_files=%lld",
11335 (long long int)in
->quota
.max_bytes
,
11336 (long long int)in
->quota
.max_files
);
11338 size_t Client::_vxattrcb_quota_max_bytes(Inode
*in
, char *val
, size_t size
)
11340 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_bytes
);
11342 size_t Client::_vxattrcb_quota_max_files(Inode
*in
, char *val
, size_t size
)
11344 return snprintf(val
, size
, "%lld", (long long int)in
->quota
.max_files
);
11347 bool Client::_vxattrcb_layout_exists(Inode
*in
)
11349 return in
->layout
!= file_layout_t();
11351 size_t Client::_vxattrcb_layout(Inode
*in
, char *val
, size_t size
)
11353 int r
= snprintf(val
, size
,
11354 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
11355 (unsigned long long)in
->layout
.stripe_unit
,
11356 (unsigned long long)in
->layout
.stripe_count
,
11357 (unsigned long long)in
->layout
.object_size
);
11358 objecter
->with_osdmap([&](const OSDMap
& o
) {
11359 if (o
.have_pg_pool(in
->layout
.pool_id
))
11360 r
+= snprintf(val
+ r
, size
- r
, "%s",
11361 o
.get_pool_name(in
->layout
.pool_id
).c_str());
11363 r
+= snprintf(val
+ r
, size
- r
, "%" PRIu64
,
11364 (uint64_t)in
->layout
.pool_id
);
11366 if (in
->layout
.pool_ns
.length())
11367 r
+= snprintf(val
+ r
, size
- r
, " pool_namespace=%s",
11368 in
->layout
.pool_ns
.c_str());
11371 size_t Client::_vxattrcb_layout_stripe_unit(Inode
*in
, char *val
, size_t size
)
11373 return snprintf(val
, size
, "%lld", (unsigned long long)in
->layout
.stripe_unit
);
11375 size_t Client::_vxattrcb_layout_stripe_count(Inode
*in
, char *val
, size_t size
)
11377 return snprintf(val
, size
, "%lld", (unsigned long long)in
->layout
.stripe_count
);
11379 size_t Client::_vxattrcb_layout_object_size(Inode
*in
, char *val
, size_t size
)
11381 return snprintf(val
, size
, "%lld", (unsigned long long)in
->layout
.object_size
);
11383 size_t Client::_vxattrcb_layout_pool(Inode
*in
, char *val
, size_t size
)
11386 objecter
->with_osdmap([&](const OSDMap
& o
) {
11387 if (o
.have_pg_pool(in
->layout
.pool_id
))
11388 r
= snprintf(val
, size
, "%s", o
.get_pool_name(
11389 in
->layout
.pool_id
).c_str());
11391 r
= snprintf(val
, size
, "%" PRIu64
, (uint64_t)in
->layout
.pool_id
);
11395 size_t Client::_vxattrcb_layout_pool_namespace(Inode
*in
, char *val
, size_t size
)
11397 return snprintf(val
, size
, "%s", in
->layout
.pool_ns
.c_str());
11399 size_t Client::_vxattrcb_dir_entries(Inode
*in
, char *val
, size_t size
)
11401 return snprintf(val
, size
, "%lld", (unsigned long long)(in
->dirstat
.nfiles
+ in
->dirstat
.nsubdirs
));
11403 size_t Client::_vxattrcb_dir_files(Inode
*in
, char *val
, size_t size
)
11405 return snprintf(val
, size
, "%lld", (unsigned long long)in
->dirstat
.nfiles
);
11407 size_t Client::_vxattrcb_dir_subdirs(Inode
*in
, char *val
, size_t size
)
11409 return snprintf(val
, size
, "%lld", (unsigned long long)in
->dirstat
.nsubdirs
);
11411 size_t Client::_vxattrcb_dir_rentries(Inode
*in
, char *val
, size_t size
)
11413 return snprintf(val
, size
, "%lld", (unsigned long long)(in
->rstat
.rfiles
+ in
->rstat
.rsubdirs
));
11415 size_t Client::_vxattrcb_dir_rfiles(Inode
*in
, char *val
, size_t size
)
11417 return snprintf(val
, size
, "%lld", (unsigned long long)in
->rstat
.rfiles
);
11419 size_t Client::_vxattrcb_dir_rsubdirs(Inode
*in
, char *val
, size_t size
)
11421 return snprintf(val
, size
, "%lld", (unsigned long long)in
->rstat
.rsubdirs
);
11423 size_t Client::_vxattrcb_dir_rbytes(Inode
*in
, char *val
, size_t size
)
11425 return snprintf(val
, size
, "%lld", (unsigned long long)in
->rstat
.rbytes
);
11427 size_t Client::_vxattrcb_dir_rctime(Inode
*in
, char *val
, size_t size
)
11429 return snprintf(val
, size
, "%ld.09%ld", (long)in
->rstat
.rctime
.sec(),
11430 (long)in
->rstat
.rctime
.nsec());
11433 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
11434 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
11436 #define XATTR_NAME_CEPH(_type, _name) \
11438 name: CEPH_XATTR_NAME(_type, _name), \
11439 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11445 #define XATTR_NAME_CEPH2(_type, _name, _flags) \
11447 name: CEPH_XATTR_NAME(_type, _name), \
11448 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11454 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
11456 name: CEPH_XATTR_NAME2(_type, _name, _field), \
11457 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
11460 exists_cb: &Client::_vxattrcb_layout_exists, \
11463 #define XATTR_QUOTA_FIELD(_type, _name) \
11465 name: CEPH_XATTR_NAME(_type, _name), \
11466 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11469 exists_cb: &Client::_vxattrcb_quota_exists, \
11473 const Client::VXattr
Client::_dir_vxattrs
[] = {
11475 name
: "ceph.dir.layout",
11476 getxattr_cb
: &Client::_vxattrcb_layout
,
11479 exists_cb
: &Client::_vxattrcb_layout_exists
,
11482 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_unit
),
11483 XATTR_LAYOUT_FIELD(dir
, layout
, stripe_count
),
11484 XATTR_LAYOUT_FIELD(dir
, layout
, object_size
),
11485 XATTR_LAYOUT_FIELD(dir
, layout
, pool
),
11486 XATTR_LAYOUT_FIELD(dir
, layout
, pool_namespace
),
11487 XATTR_NAME_CEPH(dir
, entries
),
11488 XATTR_NAME_CEPH(dir
, files
),
11489 XATTR_NAME_CEPH(dir
, subdirs
),
11490 XATTR_NAME_CEPH2(dir
, rentries
, VXATTR_RSTAT
),
11491 XATTR_NAME_CEPH2(dir
, rfiles
, VXATTR_RSTAT
),
11492 XATTR_NAME_CEPH2(dir
, rsubdirs
, VXATTR_RSTAT
),
11493 XATTR_NAME_CEPH2(dir
, rbytes
, VXATTR_RSTAT
),
11494 XATTR_NAME_CEPH2(dir
, rctime
, VXATTR_RSTAT
),
11496 name
: "ceph.quota",
11497 getxattr_cb
: &Client::_vxattrcb_quota
,
11500 exists_cb
: &Client::_vxattrcb_quota_exists
,
11503 XATTR_QUOTA_FIELD(quota
, max_bytes
),
11504 XATTR_QUOTA_FIELD(quota
, max_files
),
11505 { name
: "" } /* Required table terminator */
11508 const Client::VXattr
Client::_file_vxattrs
[] = {
11510 name
: "ceph.file.layout",
11511 getxattr_cb
: &Client::_vxattrcb_layout
,
11514 exists_cb
: &Client::_vxattrcb_layout_exists
,
11517 XATTR_LAYOUT_FIELD(file
, layout
, stripe_unit
),
11518 XATTR_LAYOUT_FIELD(file
, layout
, stripe_count
),
11519 XATTR_LAYOUT_FIELD(file
, layout
, object_size
),
11520 XATTR_LAYOUT_FIELD(file
, layout
, pool
),
11521 XATTR_LAYOUT_FIELD(file
, layout
, pool_namespace
),
11522 { name
: "" } /* Required table terminator */
11525 const Client::VXattr
*Client::_get_vxattrs(Inode
*in
)
11528 return _dir_vxattrs
;
11529 else if (in
->is_file())
11530 return _file_vxattrs
;
11534 const Client::VXattr
*Client::_match_vxattr(Inode
*in
, const char *name
)
11536 if (strncmp(name
, "ceph.", 5) == 0) {
11537 const VXattr
*vxattr
= _get_vxattrs(in
);
11539 while (!vxattr
->name
.empty()) {
11540 if (vxattr
->name
== name
)
11549 size_t Client::_vxattrs_calcu_name_size(const VXattr
*vxattr
)
11552 while (!vxattr
->name
.empty()) {
11553 if (!vxattr
->hidden
)
11554 len
+= vxattr
->name
.length() + 1;
11560 int Client::ll_readlink(Inode
*in
, char *buf
, size_t buflen
, const UserPerm
& perms
)
11562 Mutex::Locker
lock(client_lock
);
11567 vinodeno_t vino
= _get_vino(in
);
11569 ldout(cct
, 3) << "ll_readlink " << vino
<< dendl
;
11570 tout(cct
) << "ll_readlink" << std::endl
;
11571 tout(cct
) << vino
.ino
.val
<< std::endl
;
11573 set
<Dentry
*>::iterator dn
= in
->dn_set
.begin();
11574 while (dn
!= in
->dn_set
.end()) {
11579 int r
= _readlink(in
, buf
, buflen
); // FIXME: no permission checking!
11580 ldout(cct
, 3) << "ll_readlink " << vino
<< " = " << r
<< dendl
;
11584 int Client::_mknod(Inode
*dir
, const char *name
, mode_t mode
, dev_t rdev
,
11585 const UserPerm
& perms
, InodeRef
*inp
)
11587 ldout(cct
, 8) << "_mknod(" << dir
->ino
<< " " << name
<< ", 0" << oct
11588 << mode
<< dec
<< ", " << rdev
<< ", uid " << perms
.uid()
11589 << ", gid " << perms
.gid() << ")" << dendl
;
11591 if (strlen(name
) > NAME_MAX
)
11592 return -ENAMETOOLONG
;
11594 if (dir
->snapid
!= CEPH_NOSNAP
) {
11597 if (is_quota_files_exceeded(dir
, perms
)) {
11601 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_MKNOD
);
11604 dir
->make_nosnap_relative_path(path
);
11605 path
.push_dentry(name
);
11606 req
->set_filepath(path
);
11607 req
->set_inode(dir
);
11608 req
->head
.args
.mknod
.rdev
= rdev
;
11609 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
11610 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
11612 bufferlist xattrs_bl
;
11613 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
11616 req
->head
.args
.mknod
.mode
= mode
;
11617 if (xattrs_bl
.length() > 0)
11618 req
->set_data(xattrs_bl
);
11621 res
= get_or_create(dir
, name
, &de
);
11624 req
->set_dentry(de
);
11626 res
= make_request(req
, perms
, inp
);
11630 ldout(cct
, 8) << "mknod(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
11638 int Client::ll_mknod(Inode
*parent
, const char *name
, mode_t mode
,
11639 dev_t rdev
, struct stat
*attr
, Inode
**out
,
11640 const UserPerm
& perms
)
11642 Mutex::Locker
lock(client_lock
);
11647 vinodeno_t vparent
= _get_vino(parent
);
11649 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
<< dendl
;
11650 tout(cct
) << "ll_mknod" << std::endl
;
11651 tout(cct
) << vparent
.ino
.val
<< std::endl
;
11652 tout(cct
) << name
<< std::endl
;
11653 tout(cct
) << mode
<< std::endl
;
11654 tout(cct
) << rdev
<< std::endl
;
11656 if (!cct
->_conf
->fuse_default_permissions
) {
11657 int r
= may_create(parent
, perms
);
11663 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
11665 fill_stat(in
, attr
);
11668 tout(cct
) << attr
->st_ino
<< std::endl
;
11669 ldout(cct
, 3) << "ll_mknod " << vparent
<< " " << name
11670 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
11675 int Client::ll_mknodx(Inode
*parent
, const char *name
, mode_t mode
,
11676 dev_t rdev
, Inode
**out
,
11677 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
11678 const UserPerm
& perms
)
11680 unsigned caps
= statx_to_mask(flags
, want
);
11681 Mutex::Locker
lock(client_lock
);
11686 vinodeno_t vparent
= _get_vino(parent
);
11688 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
<< dendl
;
11689 tout(cct
) << "ll_mknodx" << std::endl
;
11690 tout(cct
) << vparent
.ino
.val
<< std::endl
;
11691 tout(cct
) << name
<< std::endl
;
11692 tout(cct
) << mode
<< std::endl
;
11693 tout(cct
) << rdev
<< std::endl
;
11695 if (!cct
->_conf
->fuse_default_permissions
) {
11696 int r
= may_create(parent
, perms
);
11702 int r
= _mknod(parent
, name
, mode
, rdev
, perms
, &in
);
11704 fill_statx(in
, caps
, stx
);
11707 tout(cct
) << stx
->stx_ino
<< std::endl
;
11708 ldout(cct
, 3) << "ll_mknodx " << vparent
<< " " << name
11709 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
11714 int Client::_create(Inode
*dir
, const char *name
, int flags
, mode_t mode
,
11715 InodeRef
*inp
, Fh
**fhp
, int stripe_unit
, int stripe_count
,
11716 int object_size
, const char *data_pool
, bool *created
,
11717 const UserPerm
& perms
)
11719 ldout(cct
, 8) << "_create(" << dir
->ino
<< " " << name
<< ", 0" << oct
<<
11720 mode
<< dec
<< ")" << dendl
;
11722 if (strlen(name
) > NAME_MAX
)
11723 return -ENAMETOOLONG
;
11724 if (dir
->snapid
!= CEPH_NOSNAP
) {
11727 if (is_quota_files_exceeded(dir
, perms
)) {
11731 // use normalized flags to generate cmode
11732 int cmode
= ceph_flags_to_mode(ceph_flags_sys2wire(flags
));
11736 int64_t pool_id
= -1;
11737 if (data_pool
&& *data_pool
) {
11738 pool_id
= objecter
->with_osdmap(
11739 std::mem_fn(&OSDMap::lookup_pg_pool_name
), data_pool
);
11742 if (pool_id
> 0xffffffffll
)
11743 return -ERANGE
; // bummer!
11746 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_CREATE
);
11749 dir
->make_nosnap_relative_path(path
);
11750 path
.push_dentry(name
);
11751 req
->set_filepath(path
);
11752 req
->set_inode(dir
);
11753 req
->head
.args
.open
.flags
= ceph_flags_sys2wire(flags
| O_CREAT
);
11755 req
->head
.args
.open
.stripe_unit
= stripe_unit
;
11756 req
->head
.args
.open
.stripe_count
= stripe_count
;
11757 req
->head
.args
.open
.object_size
= object_size
;
11758 if (cct
->_conf
->client_debug_getattr_caps
)
11759 req
->head
.args
.open
.mask
= DEBUG_GETATTR_CAPS
;
11761 req
->head
.args
.open
.mask
= 0;
11762 req
->head
.args
.open
.pool
= pool_id
;
11763 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
11764 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
11767 bufferlist xattrs_bl
;
11768 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perms
);
11771 req
->head
.args
.open
.mode
= mode
;
11772 if (xattrs_bl
.length() > 0)
11773 req
->set_data(xattrs_bl
);
11776 res
= get_or_create(dir
, name
, &de
);
11779 req
->set_dentry(de
);
11781 res
= make_request(req
, perms
, inp
, created
);
11786 /* If the caller passed a value in fhp, do the open */
11788 (*inp
)->get_open_ref(cmode
);
11789 *fhp
= _create_fh(inp
->get(), flags
, cmode
, perms
);
11795 ldout(cct
, 8) << "create(" << path
<< ", 0" << oct
<< mode
<< dec
11796 << " layout " << stripe_unit
11797 << ' ' << stripe_count
11798 << ' ' << object_size
11799 <<") = " << res
<< dendl
;
11808 int Client::_mkdir(Inode
*dir
, const char *name
, mode_t mode
, const UserPerm
& perm
,
11811 ldout(cct
, 8) << "_mkdir(" << dir
->ino
<< " " << name
<< ", 0" << oct
11812 << mode
<< dec
<< ", uid " << perm
.uid()
11813 << ", gid " << perm
.gid() << ")" << dendl
;
11815 if (strlen(name
) > NAME_MAX
)
11816 return -ENAMETOOLONG
;
11818 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
11821 if (is_quota_files_exceeded(dir
, perm
)) {
11824 MetaRequest
*req
= new MetaRequest(dir
->snapid
== CEPH_SNAPDIR
?
11825 CEPH_MDS_OP_MKSNAP
: CEPH_MDS_OP_MKDIR
);
11828 dir
->make_nosnap_relative_path(path
);
11829 path
.push_dentry(name
);
11830 req
->set_filepath(path
);
11831 req
->set_inode(dir
);
11832 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
11833 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
11836 bufferlist xattrs_bl
;
11837 int res
= _posix_acl_create(dir
, &mode
, xattrs_bl
, perm
);
11840 req
->head
.args
.mkdir
.mode
= mode
;
11841 if (xattrs_bl
.length() > 0)
11842 req
->set_data(xattrs_bl
);
11845 res
= get_or_create(dir
, name
, &de
);
11848 req
->set_dentry(de
);
11850 ldout(cct
, 10) << "_mkdir: making request" << dendl
;
11851 res
= make_request(req
, perm
, inp
);
11852 ldout(cct
, 10) << "_mkdir result is " << res
<< dendl
;
11856 ldout(cct
, 8) << "_mkdir(" << path
<< ", 0" << oct
<< mode
<< dec
<< ") = " << res
<< dendl
;
11864 int Client::ll_mkdir(Inode
*parent
, const char *name
, mode_t mode
,
11865 struct stat
*attr
, Inode
**out
, const UserPerm
& perm
)
11867 Mutex::Locker
lock(client_lock
);
11872 vinodeno_t vparent
= _get_vino(parent
);
11874 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
<< dendl
;
11875 tout(cct
) << "ll_mkdir" << std::endl
;
11876 tout(cct
) << vparent
.ino
.val
<< std::endl
;
11877 tout(cct
) << name
<< std::endl
;
11878 tout(cct
) << mode
<< std::endl
;
11880 if (!cct
->_conf
->fuse_default_permissions
) {
11881 int r
= may_create(parent
, perm
);
11887 int r
= _mkdir(parent
, name
, mode
, perm
, &in
);
11889 fill_stat(in
, attr
);
11892 tout(cct
) << attr
->st_ino
<< std::endl
;
11893 ldout(cct
, 3) << "ll_mkdir " << vparent
<< " " << name
11894 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
11899 int Client::ll_mkdirx(Inode
*parent
, const char *name
, mode_t mode
, Inode
**out
,
11900 struct ceph_statx
*stx
, unsigned want
, unsigned flags
,
11901 const UserPerm
& perms
)
11903 Mutex::Locker
lock(client_lock
);
11908 vinodeno_t vparent
= _get_vino(parent
);
11910 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
<< dendl
;
11911 tout(cct
) << "ll_mkdirx" << std::endl
;
11912 tout(cct
) << vparent
.ino
.val
<< std::endl
;
11913 tout(cct
) << name
<< std::endl
;
11914 tout(cct
) << mode
<< std::endl
;
11916 if (!cct
->_conf
->fuse_default_permissions
) {
11917 int r
= may_create(parent
, perms
);
11923 int r
= _mkdir(parent
, name
, mode
, perms
, &in
);
11925 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
11931 tout(cct
) << stx
->stx_ino
<< std::endl
;
11932 ldout(cct
, 3) << "ll_mkdirx " << vparent
<< " " << name
11933 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
11938 int Client::_symlink(Inode
*dir
, const char *name
, const char *target
,
11939 const UserPerm
& perms
, InodeRef
*inp
)
11941 ldout(cct
, 8) << "_symlink(" << dir
->ino
<< " " << name
<< ", " << target
11942 << ", uid " << perms
.uid() << ", gid " << perms
.gid() << ")"
11945 if (strlen(name
) > NAME_MAX
)
11946 return -ENAMETOOLONG
;
11948 if (dir
->snapid
!= CEPH_NOSNAP
) {
11951 if (is_quota_files_exceeded(dir
, perms
)) {
11955 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_SYMLINK
);
11958 dir
->make_nosnap_relative_path(path
);
11959 path
.push_dentry(name
);
11960 req
->set_filepath(path
);
11961 req
->set_inode(dir
);
11962 req
->set_string2(target
);
11963 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
11964 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
11967 int res
= get_or_create(dir
, name
, &de
);
11970 req
->set_dentry(de
);
11972 res
= make_request(req
, perms
, inp
);
11975 ldout(cct
, 8) << "_symlink(\"" << path
<< "\", \"" << target
<< "\") = " <<
11984 int Client::ll_symlink(Inode
*parent
, const char *name
, const char *value
,
11985 struct stat
*attr
, Inode
**out
, const UserPerm
& perms
)
11987 Mutex::Locker
lock(client_lock
);
11992 vinodeno_t vparent
= _get_vino(parent
);
11994 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
<< " -> " << value
11996 tout(cct
) << "ll_symlink" << std::endl
;
11997 tout(cct
) << vparent
.ino
.val
<< std::endl
;
11998 tout(cct
) << name
<< std::endl
;
11999 tout(cct
) << value
<< std::endl
;
12001 if (!cct
->_conf
->fuse_default_permissions
) {
12002 int r
= may_create(parent
, perms
);
12008 int r
= _symlink(parent
, name
, value
, perms
, &in
);
12010 fill_stat(in
, attr
);
12013 tout(cct
) << attr
->st_ino
<< std::endl
;
12014 ldout(cct
, 3) << "ll_symlink " << vparent
<< " " << name
12015 << " = " << r
<< " (" << hex
<< attr
->st_ino
<< dec
<< ")" << dendl
;
12020 int Client::ll_symlinkx(Inode
*parent
, const char *name
, const char *value
,
12021 Inode
**out
, struct ceph_statx
*stx
, unsigned want
,
12022 unsigned flags
, const UserPerm
& perms
)
12024 Mutex::Locker
lock(client_lock
);
12029 vinodeno_t vparent
= _get_vino(parent
);
12031 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
<< " -> " << value
12033 tout(cct
) << "ll_symlinkx" << std::endl
;
12034 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12035 tout(cct
) << name
<< std::endl
;
12036 tout(cct
) << value
<< std::endl
;
12038 if (!cct
->_conf
->fuse_default_permissions
) {
12039 int r
= may_create(parent
, perms
);
12045 int r
= _symlink(parent
, name
, value
, perms
, &in
);
12047 fill_statx(in
, statx_to_mask(flags
, want
), stx
);
12050 tout(cct
) << stx
->stx_ino
<< std::endl
;
12051 ldout(cct
, 3) << "ll_symlinkx " << vparent
<< " " << name
12052 << " = " << r
<< " (" << hex
<< stx
->stx_ino
<< dec
<< ")" << dendl
;
12057 int Client::_unlink(Inode
*dir
, const char *name
, const UserPerm
& perm
)
12059 ldout(cct
, 8) << "_unlink(" << dir
->ino
<< " " << name
12060 << " uid " << perm
.uid() << " gid " << perm
.gid()
12063 if (dir
->snapid
!= CEPH_NOSNAP
) {
12067 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_UNLINK
);
12070 dir
->make_nosnap_relative_path(path
);
12071 path
.push_dentry(name
);
12072 req
->set_filepath(path
);
12078 int res
= get_or_create(dir
, name
, &de
);
12081 req
->set_dentry(de
);
12082 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12083 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12085 res
= _lookup(dir
, name
, 0, &otherin
, perm
);
12089 in
= otherin
.get();
12090 req
->set_other_inode(in
);
12091 in
->break_all_delegs();
12092 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
12094 req
->set_inode(dir
);
12096 res
= make_request(req
, perm
);
12099 ldout(cct
, 8) << "unlink(" << path
<< ") = " << res
<< dendl
;
12107 int Client::ll_unlink(Inode
*in
, const char *name
, const UserPerm
& perm
)
12109 Mutex::Locker
lock(client_lock
);
12114 vinodeno_t vino
= _get_vino(in
);
12116 ldout(cct
, 3) << "ll_unlink " << vino
<< " " << name
<< dendl
;
12117 tout(cct
) << "ll_unlink" << std::endl
;
12118 tout(cct
) << vino
.ino
.val
<< std::endl
;
12119 tout(cct
) << name
<< std::endl
;
12121 if (!cct
->_conf
->fuse_default_permissions
) {
12122 int r
= may_delete(in
, name
, perm
);
12126 return _unlink(in
, name
, perm
);
12129 int Client::_rmdir(Inode
*dir
, const char *name
, const UserPerm
& perms
)
12131 ldout(cct
, 8) << "_rmdir(" << dir
->ino
<< " " << name
<< " uid "
12132 << perms
.uid() << " gid " << perms
.gid() << ")" << dendl
;
12134 if (dir
->snapid
!= CEPH_NOSNAP
&& dir
->snapid
!= CEPH_SNAPDIR
) {
12138 int op
= dir
->snapid
== CEPH_SNAPDIR
? CEPH_MDS_OP_RMSNAP
: CEPH_MDS_OP_RMDIR
;
12139 MetaRequest
*req
= new MetaRequest(op
);
12141 dir
->make_nosnap_relative_path(path
);
12142 path
.push_dentry(name
);
12143 req
->set_filepath(path
);
12145 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12146 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12147 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
12152 int res
= get_or_create(dir
, name
, &de
);
12155 if (op
== CEPH_MDS_OP_RMDIR
)
12156 req
->set_dentry(de
);
12160 res
= _lookup(dir
, name
, 0, &in
, perms
);
12163 if (op
== CEPH_MDS_OP_RMDIR
) {
12164 req
->set_inode(dir
);
12165 req
->set_other_inode(in
.get());
12167 unlink(de
, true, true);
12169 req
->set_other_inode(in
.get());
12172 res
= make_request(req
, perms
);
12175 ldout(cct
, 8) << "rmdir(" << path
<< ") = " << res
<< dendl
;
12183 int Client::ll_rmdir(Inode
*in
, const char *name
, const UserPerm
& perms
)
12185 Mutex::Locker
lock(client_lock
);
12190 vinodeno_t vino
= _get_vino(in
);
12192 ldout(cct
, 3) << "ll_rmdir " << vino
<< " " << name
<< dendl
;
12193 tout(cct
) << "ll_rmdir" << std::endl
;
12194 tout(cct
) << vino
.ino
.val
<< std::endl
;
12195 tout(cct
) << name
<< std::endl
;
12197 if (!cct
->_conf
->fuse_default_permissions
) {
12198 int r
= may_delete(in
, name
, perms
);
12203 return _rmdir(in
, name
, perms
);
12206 int Client::_rename(Inode
*fromdir
, const char *fromname
, Inode
*todir
, const char *toname
, const UserPerm
& perm
)
12208 ldout(cct
, 8) << "_rename(" << fromdir
->ino
<< " " << fromname
<< " to "
12209 << todir
->ino
<< " " << toname
12210 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")"
12213 if (fromdir
->snapid
!= todir
->snapid
)
12216 int op
= CEPH_MDS_OP_RENAME
;
12217 if (fromdir
->snapid
!= CEPH_NOSNAP
) {
12218 if (fromdir
== todir
&& fromdir
->snapid
== CEPH_SNAPDIR
)
12219 op
= CEPH_MDS_OP_RENAMESNAP
;
12223 if (fromdir
!= todir
) {
12224 Inode
*fromdir_root
=
12225 fromdir
->quota
.is_enable() ? fromdir
: get_quota_root(fromdir
, perm
);
12226 Inode
*todir_root
=
12227 todir
->quota
.is_enable() ? todir
: get_quota_root(todir
, perm
);
12228 if (fromdir_root
!= todir_root
) {
12234 MetaRequest
*req
= new MetaRequest(op
);
12237 fromdir
->make_nosnap_relative_path(from
);
12238 from
.push_dentry(fromname
);
12240 todir
->make_nosnap_relative_path(to
);
12241 to
.push_dentry(toname
);
12242 req
->set_filepath(to
);
12243 req
->set_filepath2(from
);
12246 int res
= get_or_create(fromdir
, fromname
, &oldde
);
12250 res
= get_or_create(todir
, toname
, &de
);
12254 if (op
== CEPH_MDS_OP_RENAME
) {
12255 req
->set_old_dentry(oldde
);
12256 req
->old_dentry_drop
= CEPH_CAP_FILE_SHARED
;
12257 req
->old_dentry_unless
= CEPH_CAP_FILE_EXCL
;
12259 req
->set_dentry(de
);
12260 req
->dentry_drop
= CEPH_CAP_FILE_SHARED
;
12261 req
->dentry_unless
= CEPH_CAP_FILE_EXCL
;
12263 InodeRef oldin
, otherin
;
12264 res
= _lookup(fromdir
, fromname
, 0, &oldin
, perm
);
12268 Inode
*oldinode
= oldin
.get();
12269 oldinode
->break_all_delegs();
12270 req
->set_old_inode(oldinode
);
12271 req
->old_inode_drop
= CEPH_CAP_LINK_SHARED
;
12273 res
= _lookup(todir
, toname
, 0, &otherin
, perm
);
12277 Inode
*in
= otherin
.get();
12278 req
->set_other_inode(in
);
12279 in
->break_all_delegs();
12281 req
->other_inode_drop
= CEPH_CAP_LINK_SHARED
| CEPH_CAP_LINK_EXCL
;
12289 req
->set_inode(todir
);
12291 // renamesnap reply contains no tracedn, so we need to invalidate
12293 unlink(oldde
, true, true);
12294 unlink(de
, true, true);
12297 res
= make_request(req
, perm
, &target
);
12298 ldout(cct
, 10) << "rename result is " << res
<< dendl
;
12300 // renamed item from our cache
12303 ldout(cct
, 8) << "_rename(" << from
<< ", " << to
<< ") = " << res
<< dendl
;
12311 int Client::ll_rename(Inode
*parent
, const char *name
, Inode
*newparent
,
12312 const char *newname
, const UserPerm
& perm
)
12314 Mutex::Locker
lock(client_lock
);
12319 vinodeno_t vparent
= _get_vino(parent
);
12320 vinodeno_t vnewparent
= _get_vino(newparent
);
12322 ldout(cct
, 3) << "ll_rename " << vparent
<< " " << name
<< " to "
12323 << vnewparent
<< " " << newname
<< dendl
;
12324 tout(cct
) << "ll_rename" << std::endl
;
12325 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12326 tout(cct
) << name
<< std::endl
;
12327 tout(cct
) << vnewparent
.ino
.val
<< std::endl
;
12328 tout(cct
) << newname
<< std::endl
;
12330 if (!cct
->_conf
->fuse_default_permissions
) {
12331 int r
= may_delete(parent
, name
, perm
);
12334 r
= may_delete(newparent
, newname
, perm
);
12335 if (r
< 0 && r
!= -ENOENT
)
12339 return _rename(parent
, name
, newparent
, newname
, perm
);
12342 int Client::_link(Inode
*in
, Inode
*dir
, const char *newname
, const UserPerm
& perm
, InodeRef
*inp
)
12344 ldout(cct
, 8) << "_link(" << in
->ino
<< " to " << dir
->ino
<< " " << newname
12345 << " uid " << perm
.uid() << " gid " << perm
.gid() << ")" << dendl
;
12347 if (strlen(newname
) > NAME_MAX
)
12348 return -ENAMETOOLONG
;
12350 if (in
->snapid
!= CEPH_NOSNAP
|| dir
->snapid
!= CEPH_NOSNAP
) {
12353 if (is_quota_files_exceeded(dir
, perm
)) {
12357 in
->break_all_delegs();
12358 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LINK
);
12360 filepath
path(newname
, dir
->ino
);
12361 req
->set_filepath(path
);
12362 filepath
existing(in
->ino
);
12363 req
->set_filepath2(existing
);
12365 req
->set_inode(dir
);
12366 req
->inode_drop
= CEPH_CAP_FILE_SHARED
;
12367 req
->inode_unless
= CEPH_CAP_FILE_EXCL
;
12370 int res
= get_or_create(dir
, newname
, &de
);
12373 req
->set_dentry(de
);
12375 res
= make_request(req
, perm
, inp
);
12376 ldout(cct
, 10) << "link result is " << res
<< dendl
;
12379 ldout(cct
, 8) << "link(" << existing
<< ", " << path
<< ") = " << res
<< dendl
;
12387 int Client::ll_link(Inode
*in
, Inode
*newparent
, const char *newname
,
12388 const UserPerm
& perm
)
12390 Mutex::Locker
lock(client_lock
);
12395 vinodeno_t vino
= _get_vino(in
);
12396 vinodeno_t vnewparent
= _get_vino(newparent
);
12398 ldout(cct
, 3) << "ll_link " << vino
<< " to " << vnewparent
<< " " <<
12400 tout(cct
) << "ll_link" << std::endl
;
12401 tout(cct
) << vino
.ino
.val
<< std::endl
;
12402 tout(cct
) << vnewparent
<< std::endl
;
12403 tout(cct
) << newname
<< std::endl
;
12408 if (!cct
->_conf
->fuse_default_permissions
) {
12409 if (S_ISDIR(in
->mode
))
12412 r
= may_hardlink(in
, perm
);
12416 r
= may_create(newparent
, perm
);
12421 return _link(in
, newparent
, newname
, perm
, &target
);
12424 int Client::ll_num_osds(void)
12426 Mutex::Locker
lock(client_lock
);
12427 return objecter
->with_osdmap(std::mem_fn(&OSDMap::get_num_osds
));
12430 int Client::ll_osdaddr(int osd
, uint32_t *addr
)
12432 Mutex::Locker
lock(client_lock
);
12435 bool exists
= objecter
->with_osdmap([&](const OSDMap
& o
) {
12436 if (!o
.exists(osd
))
12438 g
= o
.get_addr(osd
);
12443 uint32_t nb_addr
= (g
.in4_addr()).sin_addr
.s_addr
;
12444 *addr
= ntohl(nb_addr
);
12448 uint32_t Client::ll_stripe_unit(Inode
*in
)
12450 Mutex::Locker
lock(client_lock
);
12451 return in
->layout
.stripe_unit
;
12454 uint64_t Client::ll_snap_seq(Inode
*in
)
12456 Mutex::Locker
lock(client_lock
);
12457 return in
->snaprealm
->seq
;
12460 int Client::ll_file_layout(Inode
*in
, file_layout_t
*layout
)
12462 Mutex::Locker
lock(client_lock
);
12463 *layout
= in
->layout
;
12467 int Client::ll_file_layout(Fh
*fh
, file_layout_t
*layout
)
12469 return ll_file_layout(fh
->inode
.get(), layout
);
12472 /* Currently we cannot take advantage of redundancy in reads, since we
12473 would have to go through all possible placement groups (a
12474 potentially quite large number determined by a hash), and use CRUSH
12475 to calculate the appropriate set of OSDs for each placement group,
12476 then index into that. An array with one entry per OSD is much more
12477 tractable and works for demonstration purposes. */
12479 int Client::ll_get_stripe_osd(Inode
*in
, uint64_t blockno
,
12480 file_layout_t
* layout
)
12482 Mutex::Locker
lock(client_lock
);
12484 inodeno_t ino
= in
->ino
;
12485 uint32_t object_size
= layout
->object_size
;
12486 uint32_t su
= layout
->stripe_unit
;
12487 uint32_t stripe_count
= layout
->stripe_count
;
12488 uint64_t stripes_per_object
= object_size
/ su
;
12490 uint64_t stripeno
= blockno
/ stripe_count
; // which horizontal stripe (Y)
12491 uint64_t stripepos
= blockno
% stripe_count
; // which object in the object set (X)
12492 uint64_t objectsetno
= stripeno
/ stripes_per_object
; // which object set
12493 uint64_t objectno
= objectsetno
* stripe_count
+ stripepos
; // object id
12495 object_t oid
= file_object_t(ino
, objectno
);
12496 return objecter
->with_osdmap([&](const OSDMap
& o
) {
12497 ceph_object_layout olayout
=
12498 o
.file_to_object_layout(oid
, *layout
);
12499 pg_t pg
= (pg_t
)olayout
.ol_pgid
;
12502 o
.pg_to_acting_osds(pg
, &osds
, &primary
);
12507 /* Return the offset of the block, internal to the object */
12509 uint64_t Client::ll_get_internal_offset(Inode
*in
, uint64_t blockno
)
12511 Mutex::Locker
lock(client_lock
);
12512 file_layout_t
*layout
=&(in
->layout
);
12513 uint32_t object_size
= layout
->object_size
;
12514 uint32_t su
= layout
->stripe_unit
;
12515 uint64_t stripes_per_object
= object_size
/ su
;
12517 return (blockno
% stripes_per_object
) * su
;
12520 int Client::ll_opendir(Inode
*in
, int flags
, dir_result_t
** dirpp
,
12521 const UserPerm
& perms
)
12523 Mutex::Locker
lock(client_lock
);
12528 vinodeno_t vino
= _get_vino(in
);
12530 ldout(cct
, 3) << "ll_opendir " << vino
<< dendl
;
12531 tout(cct
) << "ll_opendir" << std::endl
;
12532 tout(cct
) << vino
.ino
.val
<< std::endl
;
12534 if (!cct
->_conf
->fuse_default_permissions
) {
12535 int r
= may_open(in
, flags
, perms
);
12540 int r
= _opendir(in
, dirpp
, perms
);
12541 tout(cct
) << (unsigned long)*dirpp
<< std::endl
;
12543 ldout(cct
, 3) << "ll_opendir " << vino
<< " = " << r
<< " (" << *dirpp
<< ")"
12548 int Client::ll_releasedir(dir_result_t
*dirp
)
12550 Mutex::Locker
lock(client_lock
);
12551 ldout(cct
, 3) << "ll_releasedir " << dirp
<< dendl
;
12552 tout(cct
) << "ll_releasedir" << std::endl
;
12553 tout(cct
) << (unsigned long)dirp
<< std::endl
;
12562 int Client::ll_fsyncdir(dir_result_t
*dirp
)
12564 Mutex::Locker
lock(client_lock
);
12565 ldout(cct
, 3) << "ll_fsyncdir " << dirp
<< dendl
;
12566 tout(cct
) << "ll_fsyncdir" << std::endl
;
12567 tout(cct
) << (unsigned long)dirp
<< std::endl
;
12572 return _fsync(dirp
->inode
.get(), false);
12575 int Client::ll_open(Inode
*in
, int flags
, Fh
**fhp
, const UserPerm
& perms
)
12577 assert(!(flags
& O_CREAT
));
12579 Mutex::Locker
lock(client_lock
);
12584 vinodeno_t vino
= _get_vino(in
);
12586 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) << dendl
;
12587 tout(cct
) << "ll_open" << std::endl
;
12588 tout(cct
) << vino
.ino
.val
<< std::endl
;
12589 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
12592 if (!cct
->_conf
->fuse_default_permissions
) {
12593 r
= may_open(in
, flags
, perms
);
12598 r
= _open(in
, flags
, 0, fhp
/* may be NULL */, perms
);
12601 Fh
*fhptr
= fhp
? *fhp
: NULL
;
12603 ll_unclosed_fh_set
.insert(fhptr
);
12605 tout(cct
) << (unsigned long)fhptr
<< std::endl
;
12606 ldout(cct
, 3) << "ll_open " << vino
<< " " << ceph_flags_sys2wire(flags
) <<
12607 " = " << r
<< " (" << fhptr
<< ")" << dendl
;
12611 int Client::_ll_create(Inode
*parent
, const char *name
, mode_t mode
,
12612 int flags
, InodeRef
*in
, int caps
, Fh
**fhp
,
12613 const UserPerm
& perms
)
12617 vinodeno_t vparent
= _get_vino(parent
);
12619 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
12620 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << ", uid " << perms
.uid()
12621 << ", gid " << perms
.gid() << dendl
;
12622 tout(cct
) << "ll_create" << std::endl
;
12623 tout(cct
) << vparent
.ino
.val
<< std::endl
;
12624 tout(cct
) << name
<< std::endl
;
12625 tout(cct
) << mode
<< std::endl
;
12626 tout(cct
) << ceph_flags_sys2wire(flags
) << std::endl
;
12628 bool created
= false;
12629 int r
= _lookup(parent
, name
, caps
, in
, perms
);
12631 if (r
== 0 && (flags
& O_CREAT
) && (flags
& O_EXCL
))
12634 if (r
== -ENOENT
&& (flags
& O_CREAT
)) {
12635 if (!cct
->_conf
->fuse_default_permissions
) {
12636 r
= may_create(parent
, perms
);
12640 r
= _create(parent
, name
, flags
, mode
, in
, fhp
, 0, 0, 0, NULL
, &created
,
12651 ldout(cct
, 20) << "_ll_create created = " << created
<< dendl
;
12653 if (!cct
->_conf
->fuse_default_permissions
) {
12654 r
= may_open(in
->get(), flags
, perms
);
12657 int release_r
= _release_fh(*fhp
);
12658 assert(release_r
== 0); // during create, no async data ops should have happened
12663 if (*fhp
== NULL
) {
12664 r
= _open(in
->get(), flags
, mode
, fhp
, perms
);
12672 ll_unclosed_fh_set
.insert(*fhp
);
12677 Inode
*inode
= in
->get();
12678 if (use_faked_inos())
12679 ino
= inode
->faked_ino
;
12684 tout(cct
) << (unsigned long)*fhp
<< std::endl
;
12685 tout(cct
) << ino
<< std::endl
;
12686 ldout(cct
, 8) << "_ll_create " << vparent
<< " " << name
<< " 0" << oct
<<
12687 mode
<< dec
<< " " << ceph_flags_sys2wire(flags
) << " = " << r
<< " (" <<
12688 *fhp
<< " " << hex
<< ino
<< dec
<< ")" << dendl
;
12693 int Client::ll_create(Inode
*parent
, const char *name
, mode_t mode
,
12694 int flags
, struct stat
*attr
, Inode
**outp
, Fh
**fhp
,
12695 const UserPerm
& perms
)
12697 Mutex::Locker
lock(client_lock
);
12703 int r
= _ll_create(parent
, name
, mode
, flags
, &in
, CEPH_STAT_CAP_INODE_ALL
,
12708 // passing an Inode in outp requires an additional ref
12713 fill_stat(in
, attr
);
12721 int Client::ll_createx(Inode
*parent
, const char *name
, mode_t mode
,
12722 int oflags
, Inode
**outp
, Fh
**fhp
,
12723 struct ceph_statx
*stx
, unsigned want
, unsigned lflags
,
12724 const UserPerm
& perms
)
12726 unsigned caps
= statx_to_mask(lflags
, want
);
12727 Mutex::Locker
lock(client_lock
);
12733 int r
= _ll_create(parent
, name
, mode
, oflags
, &in
, caps
, fhp
, perms
);
12737 // passing an Inode in outp requires an additional ref
12742 fill_statx(in
, caps
, stx
);
12751 loff_t
Client::ll_lseek(Fh
*fh
, loff_t offset
, int whence
)
12753 Mutex::Locker
lock(client_lock
);
12754 tout(cct
) << "ll_lseek" << std::endl
;
12755 tout(cct
) << offset
<< std::endl
;
12756 tout(cct
) << whence
<< std::endl
;
12761 return _lseek(fh
, offset
, whence
);
12764 int Client::ll_read(Fh
*fh
, loff_t off
, loff_t len
, bufferlist
*bl
)
12766 Mutex::Locker
lock(client_lock
);
12767 ldout(cct
, 3) << "ll_read " << fh
<< " " << fh
->inode
->ino
<< " " << " " << off
<< "~" << len
<< dendl
;
12768 tout(cct
) << "ll_read" << std::endl
;
12769 tout(cct
) << (unsigned long)fh
<< std::endl
;
12770 tout(cct
) << off
<< std::endl
;
12771 tout(cct
) << len
<< std::endl
;
12776 return _read(fh
, off
, len
, bl
);
12779 int Client::ll_read_block(Inode
*in
, uint64_t blockid
,
12783 file_layout_t
* layout
)
12785 Mutex::Locker
lock(client_lock
);
12790 vinodeno_t vino
= _get_vino(in
);
12791 object_t oid
= file_object_t(vino
.ino
, blockid
);
12792 C_SaferCond onfinish
;
12795 objecter
->read(oid
,
12796 object_locator_t(layout
->pool_id
),
12801 CEPH_OSD_FLAG_READ
,
12804 client_lock
.Unlock();
12805 int r
= onfinish
.wait();
12806 client_lock
.Lock();
12809 bl
.copy(0, bl
.length(), buf
);
12816 /* It appears that the OSD doesn't return success unless the entire
12817 buffer was written, return the write length on success. */
12819 int Client::ll_write_block(Inode
*in
, uint64_t blockid
,
12820 char* buf
, uint64_t offset
,
12821 uint64_t length
, file_layout_t
* layout
,
12822 uint64_t snapseq
, uint32_t sync
)
12824 Mutex
flock("Client::ll_write_block flock");
12825 vinodeno_t vino
= ll_get_vino(in
);
12829 Context
*onsafe
= nullptr;
12834 if (true || sync
) {
12835 /* if write is stable, the epilogue is waiting on
12837 onsafe
= new C_SafeCond(&flock
, &cond
, &done
, &r
);
12840 /* if write is unstable, we just place a barrier for
12841 * future commits to wait on */
12842 /*onsafe = new C_Block_Sync(this, vino.ino,
12843 barrier_interval(offset, offset + length), &r);
12847 object_t oid
= file_object_t(vino
.ino
, blockid
);
12848 SnapContext fakesnap
;
12850 if (length
> 0) bp
= buffer::copy(buf
, length
);
12854 ldout(cct
, 1) << "ll_block_write for " << vino
.ino
<< "." << blockid
12857 fakesnap
.seq
= snapseq
;
12859 /* lock just in time */
12860 client_lock
.Lock();
12862 client_lock
.Unlock();
12867 objecter
->write(oid
,
12868 object_locator_t(layout
->pool_id
),
12873 ceph::real_clock::now(),
12877 client_lock
.Unlock();
12878 if (!done
/* also !sync */) {
12892 int Client::ll_commit_blocks(Inode
*in
,
12896 Mutex::Locker
lock(client_lock
);
12898 BarrierContext *bctx;
12899 vinodeno_t vino = _get_vino(in);
12900 uint64_t ino = vino.ino;
12902 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
12903 << offset << " to " << length << dendl;
12909 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
12910 if (p != barriers.end()) {
12911 barrier_interval civ(offset, offset + length);
12912 p->second->commit_barrier(civ);
12918 int Client::ll_write(Fh
*fh
, loff_t off
, loff_t len
, const char *data
)
12920 Mutex::Locker
lock(client_lock
);
12921 ldout(cct
, 3) << "ll_write " << fh
<< " " << fh
->inode
->ino
<< " " << off
<<
12922 "~" << len
<< dendl
;
12923 tout(cct
) << "ll_write" << std::endl
;
12924 tout(cct
) << (unsigned long)fh
<< std::endl
;
12925 tout(cct
) << off
<< std::endl
;
12926 tout(cct
) << len
<< std::endl
;
12931 int r
= _write(fh
, off
, len
, data
, NULL
, 0);
12932 ldout(cct
, 3) << "ll_write " << fh
<< " " << off
<< "~" << len
<< " = " << r
12937 int Client::ll_flush(Fh
*fh
)
12939 Mutex::Locker
lock(client_lock
);
12940 ldout(cct
, 3) << "ll_flush " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
12941 tout(cct
) << "ll_flush" << std::endl
;
12942 tout(cct
) << (unsigned long)fh
<< std::endl
;
12950 int Client::ll_fsync(Fh
*fh
, bool syncdataonly
)
12952 Mutex::Locker
lock(client_lock
);
12953 ldout(cct
, 3) << "ll_fsync " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
12954 tout(cct
) << "ll_fsync" << std::endl
;
12955 tout(cct
) << (unsigned long)fh
<< std::endl
;
12960 int r
= _fsync(fh
, syncdataonly
);
12962 // If we're returning an error, clear it from the FH
12963 fh
->take_async_err();
12968 int Client::ll_sync_inode(Inode
*in
, bool syncdataonly
)
12970 Mutex::Locker
lock(client_lock
);
12971 ldout(cct
, 3) << "ll_sync_inode " << *in
<< " " << dendl
;
12972 tout(cct
) << "ll_sync_inode" << std::endl
;
12973 tout(cct
) << (unsigned long)in
<< std::endl
;
12978 return _fsync(in
, syncdataonly
);
12981 #ifdef FALLOC_FL_PUNCH_HOLE
12983 int Client::_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
12985 if (offset
< 0 || length
<= 0)
12988 if (mode
& ~(FALLOC_FL_KEEP_SIZE
| FALLOC_FL_PUNCH_HOLE
))
12989 return -EOPNOTSUPP
;
12991 if ((mode
& FALLOC_FL_PUNCH_HOLE
) && !(mode
& FALLOC_FL_KEEP_SIZE
))
12992 return -EOPNOTSUPP
;
12994 Inode
*in
= fh
->inode
.get();
12996 if (objecter
->osdmap_pool_full(in
->layout
.pool_id
) &&
12997 !(mode
& FALLOC_FL_PUNCH_HOLE
)) {
13001 if (in
->snapid
!= CEPH_NOSNAP
)
13004 if ((fh
->mode
& CEPH_FILE_MODE_WR
) == 0)
13007 uint64_t size
= offset
+ length
;
13008 std::list
<InodeRef
> quota_roots
;
13009 if (!(mode
& (FALLOC_FL_PUNCH_HOLE
| FALLOC_FL_KEEP_SIZE
)) &&
13011 is_quota_bytes_exceeded(in
, size
- in
->size
, fh
->actor_perms
, "a_roots
)) {
13016 int r
= get_caps(in
, CEPH_CAP_FILE_WR
, CEPH_CAP_FILE_BUFFER
, &have
, -1);
13020 Mutex
uninline_flock("Client::_fallocate_uninline_data flock");
13021 Cond uninline_cond
;
13022 bool uninline_done
= false;
13023 int uninline_ret
= 0;
13024 Context
*onuninline
= NULL
;
13026 if (mode
& FALLOC_FL_PUNCH_HOLE
) {
13027 if (in
->inline_version
< CEPH_INLINE_NONE
&&
13028 (have
& CEPH_CAP_FILE_BUFFER
)) {
13030 int len
= in
->inline_data
.length();
13031 if (offset
< len
) {
13033 in
->inline_data
.copy(0, offset
, bl
);
13035 if (offset
+ size
> len
)
13036 size
= len
- offset
;
13038 bl
.append_zero(size
);
13039 if (offset
+ size
< len
)
13040 in
->inline_data
.copy(offset
+ size
, len
- offset
- size
, bl
);
13041 in
->inline_data
= bl
;
13042 in
->inline_version
++;
13044 in
->mtime
= ceph_clock_now();
13046 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13048 if (in
->inline_version
< CEPH_INLINE_NONE
) {
13049 onuninline
= new C_SafeCond(&uninline_flock
,
13053 uninline_data(in
, onuninline
);
13056 Mutex
flock("Client::_punch_hole flock");
13059 Context
*onfinish
= new C_SafeCond(&flock
, &cond
, &done
);
13061 unsafe_sync_write
++;
13062 get_cap_ref(in
, CEPH_CAP_FILE_BUFFER
);
13064 _invalidate_inode_cache(in
, offset
, length
);
13065 filer
->zero(in
->ino
, &in
->layout
,
13066 in
->snaprealm
->get_snap_context(),
13068 ceph::real_clock::now(),
13069 0, true, onfinish
);
13070 in
->mtime
= ceph_clock_now();
13072 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13074 client_lock
.Unlock();
13079 client_lock
.Lock();
13080 _sync_write_commit(in
);
13082 } else if (!(mode
& FALLOC_FL_KEEP_SIZE
)) {
13083 uint64_t size
= offset
+ length
;
13084 if (size
> in
->size
) {
13086 in
->mtime
= ceph_clock_now();
13088 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13090 if (is_quota_bytes_approaching(in
, quota_roots
)) {
13091 check_caps(in
, CHECK_CAPS_NODELAY
);
13092 } else if (is_max_size_approaching(in
)) {
13099 client_lock
.Unlock();
13100 uninline_flock
.Lock();
13101 while (!uninline_done
)
13102 uninline_cond
.Wait(uninline_flock
);
13103 uninline_flock
.Unlock();
13104 client_lock
.Lock();
13106 if (uninline_ret
>= 0 || uninline_ret
== -ECANCELED
) {
13107 in
->inline_data
.clear();
13108 in
->inline_version
= CEPH_INLINE_NONE
;
13109 in
->mark_caps_dirty(CEPH_CAP_FILE_WR
);
13115 put_cap_ref(in
, CEPH_CAP_FILE_WR
);
13120 int Client::_fallocate(Fh
*fh
, int mode
, int64_t offset
, int64_t length
)
13122 return -EOPNOTSUPP
;
13128 int Client::ll_fallocate(Fh
*fh
, int mode
, loff_t offset
, loff_t length
)
13130 Mutex::Locker
lock(client_lock
);
13131 ldout(cct
, 3) << "ll_fallocate " << fh
<< " " << fh
->inode
->ino
<< " " << dendl
;
13132 tout(cct
) << "ll_fallocate " << mode
<< " " << offset
<< " " << length
<< std::endl
;
13133 tout(cct
) << (unsigned long)fh
<< std::endl
;
13138 return _fallocate(fh
, mode
, offset
, length
);
13141 int Client::fallocate(int fd
, int mode
, loff_t offset
, loff_t length
)
13143 Mutex::Locker
lock(client_lock
);
13144 tout(cct
) << "fallocate " << " " << fd
<< mode
<< " " << offset
<< " " << length
<< std::endl
;
13149 Fh
*fh
= get_filehandle(fd
);
13152 #if defined(__linux__) && defined(O_PATH)
13153 if (fh
->flags
& O_PATH
)
13156 return _fallocate(fh
, mode
, offset
, length
);
13159 int Client::ll_release(Fh
*fh
)
13161 Mutex::Locker
lock(client_lock
);
13162 ldout(cct
, 3) << "ll_release (fh)" << fh
<< " " << fh
->inode
->ino
<< " " <<
13164 tout(cct
) << "ll_release (fh)" << std::endl
;
13165 tout(cct
) << (unsigned long)fh
<< std::endl
;
13170 if (ll_unclosed_fh_set
.count(fh
))
13171 ll_unclosed_fh_set
.erase(fh
);
13172 return _release_fh(fh
);
13175 int Client::ll_getlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
)
13177 Mutex::Locker
lock(client_lock
);
13179 ldout(cct
, 3) << "ll_getlk (fh)" << fh
<< " " << fh
->inode
->ino
<< dendl
;
13180 tout(cct
) << "ll_getk (fh)" << (unsigned long)fh
<< std::endl
;
13185 return _getlk(fh
, fl
, owner
);
13188 int Client::ll_setlk(Fh
*fh
, struct flock
*fl
, uint64_t owner
, int sleep
)
13190 Mutex::Locker
lock(client_lock
);
13192 ldout(cct
, 3) << "ll_setlk (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
13193 tout(cct
) << "ll_setk (fh)" << (unsigned long)fh
<< std::endl
;
13198 return _setlk(fh
, fl
, owner
, sleep
);
13201 int Client::ll_flock(Fh
*fh
, int cmd
, uint64_t owner
)
13203 Mutex::Locker
lock(client_lock
);
13205 ldout(cct
, 3) << "ll_flock (fh) " << fh
<< " " << fh
->inode
->ino
<< dendl
;
13206 tout(cct
) << "ll_flock (fh)" << (unsigned long)fh
<< std::endl
;
13211 return _flock(fh
, cmd
, owner
);
13214 int Client::set_deleg_timeout(uint32_t timeout
)
13216 Mutex::Locker
lock(client_lock
);
13219 * The whole point is to prevent blacklisting so we must time out the
13220 * delegation before the session autoclose timeout kicks in.
13222 if (timeout
>= mdsmap
->get_session_autoclose())
13225 deleg_timeout
= timeout
;
13229 int Client::ll_delegation(Fh
*fh
, unsigned cmd
, ceph_deleg_cb_t cb
, void *priv
)
13233 Mutex::Locker
lock(client_lock
);
13238 Inode
*inode
= fh
->inode
.get();
13241 case CEPH_DELEGATION_NONE
:
13242 inode
->unset_deleg(fh
);
13247 ret
= inode
->set_deleg(fh
, cmd
, cb
, priv
);
13248 } catch (std::bad_alloc
) {
13256 class C_Client_RequestInterrupt
: public Context
{
13261 C_Client_RequestInterrupt(Client
*c
, MetaRequest
*r
) : client(c
), req(r
) {
13264 void finish(int r
) override
{
13265 Mutex::Locker
l(client
->client_lock
);
13266 assert(req
->head
.op
== CEPH_MDS_OP_SETFILELOCK
);
13267 client
->_interrupt_filelock(req
);
13268 client
->put_request(req
);
13272 void Client::ll_interrupt(void *d
)
13274 MetaRequest
*req
= static_cast<MetaRequest
*>(d
);
13275 ldout(cct
, 3) << "ll_interrupt tid " << req
->get_tid() << dendl
;
13276 tout(cct
) << "ll_interrupt tid " << req
->get_tid() << std::endl
;
13277 interrupt_finisher
.queue(new C_Client_RequestInterrupt(this, req
));
13280 // =========================================
13283 // expose file layouts
13285 int Client::describe_layout(const char *relpath
, file_layout_t
*lp
,
13286 const UserPerm
& perms
)
13288 Mutex::Locker
lock(client_lock
);
13293 filepath
path(relpath
);
13295 int r
= path_walk(path
, &in
, perms
);
13301 ldout(cct
, 3) << "describe_layout(" << relpath
<< ") = 0" << dendl
;
13305 int Client::fdescribe_layout(int fd
, file_layout_t
*lp
)
13307 Mutex::Locker
lock(client_lock
);
13312 Fh
*f
= get_filehandle(fd
);
13315 Inode
*in
= f
->inode
.get();
13319 ldout(cct
, 3) << "fdescribe_layout(" << fd
<< ") = 0" << dendl
;
13323 int64_t Client::get_default_pool_id()
13325 Mutex::Locker
lock(client_lock
);
13330 /* first data pool is the default */
13331 return mdsmap
->get_first_data_pool();
13336 int64_t Client::get_pool_id(const char *pool_name
)
13338 Mutex::Locker
lock(client_lock
);
13343 return objecter
->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name
),
13347 string
Client::get_pool_name(int64_t pool
)
13349 Mutex::Locker
lock(client_lock
);
13354 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
13355 return o
.have_pg_pool(pool
) ? o
.get_pool_name(pool
) : string();
13359 int Client::get_pool_replication(int64_t pool
)
13361 Mutex::Locker
lock(client_lock
);
13366 return objecter
->with_osdmap([pool
](const OSDMap
& o
) {
13367 return o
.have_pg_pool(pool
) ? o
.get_pg_pool(pool
)->get_size() : -ENOENT
;
13371 int Client::get_file_extent_osds(int fd
, loff_t off
, loff_t
*len
, vector
<int>& osds
)
13373 Mutex::Locker
lock(client_lock
);
13378 Fh
*f
= get_filehandle(fd
);
13381 Inode
*in
= f
->inode
.get();
13383 vector
<ObjectExtent
> extents
;
13384 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, off
, 1, in
->truncate_size
, extents
);
13385 assert(extents
.size() == 1);
13387 objecter
->with_osdmap([&](const OSDMap
& o
) {
13388 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
13389 o
.pg_to_acting_osds(pg
, osds
);
13396 * Return the remainder of the extent (stripe unit)
13398 * If length = 1 is passed to Striper::file_to_extents we get a single
13399 * extent back, but its length is one so we still need to compute the length
13400 * to the end of the stripe unit.
13402 * If length = su then we may get 1 or 2 objects back in the extents vector
13403 * which would have to be examined. Even then, the offsets are local to the
13404 * object, so matching up to the file offset is extra work.
13406 * It seems simpler to stick with length = 1 and manually compute the
13410 uint64_t su
= in
->layout
.stripe_unit
;
13411 *len
= su
- (off
% su
);
13417 int Client::get_osd_crush_location(int id
, vector
<pair
<string
, string
> >& path
)
13419 Mutex::Locker
lock(client_lock
);
13426 return objecter
->with_osdmap([&](const OSDMap
& o
) {
13427 return o
.crush
->get_full_location_ordered(id
, path
);
13431 int Client::get_file_stripe_address(int fd
, loff_t offset
,
13432 vector
<entity_addr_t
>& address
)
13434 Mutex::Locker
lock(client_lock
);
13439 Fh
*f
= get_filehandle(fd
);
13442 Inode
*in
= f
->inode
.get();
13445 vector
<ObjectExtent
> extents
;
13446 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, 1,
13447 in
->truncate_size
, extents
);
13448 assert(extents
.size() == 1);
13450 // now we have the object and its 'layout'
13451 return objecter
->with_osdmap([&](const OSDMap
& o
) {
13452 pg_t pg
= o
.object_locator_to_pg(extents
[0].oid
, extents
[0].oloc
);
13454 o
.pg_to_acting_osds(pg
, osds
);
13457 for (unsigned i
= 0; i
< osds
.size(); i
++) {
13458 entity_addr_t addr
= o
.get_addr(osds
[i
]);
13459 address
.push_back(addr
);
13465 int Client::get_osd_addr(int osd
, entity_addr_t
& addr
)
13467 Mutex::Locker
lock(client_lock
);
13472 return objecter
->with_osdmap([&](const OSDMap
& o
) {
13473 if (!o
.exists(osd
))
13476 addr
= o
.get_addr(osd
);
13481 int Client::enumerate_layout(int fd
, vector
<ObjectExtent
>& result
,
13482 loff_t length
, loff_t offset
)
13484 Mutex::Locker
lock(client_lock
);
13489 Fh
*f
= get_filehandle(fd
);
13492 Inode
*in
= f
->inode
.get();
13494 // map to a list of extents
13495 Striper::file_to_extents(cct
, in
->ino
, &in
->layout
, offset
, length
, in
->truncate_size
, result
);
13497 ldout(cct
, 3) << "enumerate_layout(" << fd
<< ", " << length
<< ", " << offset
<< ") = 0" << dendl
;
13502 /* find an osd with the same ip. -ENXIO if none. */
13503 int Client::get_local_osd()
13505 Mutex::Locker
lock(client_lock
);
13510 objecter
->with_osdmap([this](const OSDMap
& o
) {
13511 if (o
.get_epoch() != local_osd_epoch
) {
13512 local_osd
= o
.find_osd_on_ip(messenger
->get_myaddr());
13513 local_osd_epoch
= o
.get_epoch();
13524 // ===============================
13526 void Client::ms_handle_connect(Connection
*con
)
13528 ldout(cct
, 10) << "ms_handle_connect on " << con
->get_peer_addr() << dendl
;
13531 bool Client::ms_handle_reset(Connection
*con
)
13533 ldout(cct
, 0) << "ms_handle_reset on " << con
->get_peer_addr() << dendl
;
13537 void Client::ms_handle_remote_reset(Connection
*con
)
13539 ldout(cct
, 0) << "ms_handle_remote_reset on " << con
->get_peer_addr() << dendl
;
13540 Mutex::Locker
l(client_lock
);
13541 switch (con
->get_peer_type()) {
13542 case CEPH_ENTITY_TYPE_MDS
:
13544 // kludge to figure out which mds this is; fixme with a Connection* state
13545 mds_rank_t mds
= MDS_RANK_NONE
;
13546 MetaSession
*s
= NULL
;
13547 for (map
<mds_rank_t
,MetaSession
*>::iterator p
= mds_sessions
.begin();
13548 p
!= mds_sessions
.end();
13550 if (mdsmap
->get_addr(p
->first
) == con
->get_peer_addr()) {
13556 assert (s
!= NULL
);
13557 switch (s
->state
) {
13558 case MetaSession::STATE_CLOSING
:
13559 ldout(cct
, 1) << "reset from mds we were closing; we'll call that closed" << dendl
;
13560 _closed_mds_session(s
);
13563 case MetaSession::STATE_OPENING
:
13565 ldout(cct
, 1) << "reset from mds we were opening; retrying" << dendl
;
13566 list
<Context
*> waiters
;
13567 waiters
.swap(s
->waiting_for_open
);
13568 _closed_mds_session(s
);
13569 MetaSession
*news
= _get_or_open_mds_session(mds
);
13570 news
->waiting_for_open
.swap(waiters
);
13574 case MetaSession::STATE_OPEN
:
13576 objecter
->maybe_request_map(); /* to check if we are blacklisted */
13577 const md_config_t
*conf
= cct
->_conf
;
13578 if (conf
->client_reconnect_stale
) {
13579 ldout(cct
, 1) << "reset from mds we were open; close mds session for reconnect" << dendl
;
13580 _closed_mds_session(s
);
13582 ldout(cct
, 1) << "reset from mds we were open; mark session as stale" << dendl
;
13583 s
->state
= MetaSession::STATE_STALE
;
13588 case MetaSession::STATE_NEW
:
13589 case MetaSession::STATE_CLOSED
:
13599 bool Client::ms_handle_refused(Connection
*con
)
13601 ldout(cct
, 1) << "ms_handle_refused on " << con
->get_peer_addr() << dendl
;
13605 bool Client::ms_get_authorizer(int dest_type
, AuthAuthorizer
**authorizer
, bool force_new
)
13607 if (dest_type
== CEPH_ENTITY_TYPE_MON
)
13609 *authorizer
= monclient
->build_authorizer(dest_type
);
13613 Inode
*Client::get_quota_root(Inode
*in
, const UserPerm
& perms
)
13616 utime_t now
= ceph_clock_now();
13619 if (cur
!= in
&& cur
->quota
.is_enable())
13622 Inode
*parent_in
= NULL
;
13623 if (!cur
->dn_set
.empty()) {
13624 for (auto p
= cur
->dn_set
.begin(); p
!= cur
->dn_set
.end(); ++p
) {
13626 if (dn
->lease_mds
>= 0 &&
13627 dn
->lease_ttl
> now
&&
13628 mds_sessions
.count(dn
->lease_mds
)) {
13629 parent_in
= dn
->dir
->parent_inode
;
13631 Inode
*diri
= dn
->dir
->parent_inode
;
13632 if (diri
->caps_issued_mask(CEPH_CAP_FILE_SHARED
) &&
13633 diri
->shared_gen
== dn
->cap_shared_gen
) {
13634 parent_in
= dn
->dir
->parent_inode
;
13640 } else if (root_parents
.count(cur
)) {
13641 parent_in
= root_parents
[cur
].get();
13649 if (cur
== root_ancestor
)
13653 if (cur
->nlink
== 0) {
13654 cur
= root_ancestor
;
13658 MetaRequest
*req
= new MetaRequest(CEPH_MDS_OP_LOOKUPNAME
);
13659 filepath
path(cur
->ino
);
13660 req
->set_filepath(path
);
13661 req
->set_inode(cur
);
13663 InodeRef parent_ref
;
13664 int ret
= make_request(req
, perms
, &parent_ref
);
13666 ldout(cct
, 1) << __func__
<< " " << in
->vino()
13667 << " failed to find parent of " << cur
->vino()
13668 << " err " << ret
<< dendl
;
13669 // FIXME: what to do?
13670 cur
= root_ancestor
;
13674 now
= ceph_clock_now();
13676 cur
= parent_ref
.get();
13678 cur
= in
; // start over
13681 ldout(cct
, 10) << __func__
<< " " << in
->vino() << " -> " << cur
->vino() << dendl
;
13686 * Traverse quota ancestors of the Inode, return true
13687 * if any of them passes the passed function
13689 bool Client::check_quota_condition(Inode
*in
, const UserPerm
& perms
,
13690 std::function
<bool (const Inode
&in
)> test
)
13693 assert(in
!= NULL
);
13698 if (in
== root_ancestor
) {
13699 // We're done traversing, drop out
13702 // Continue up the tree
13703 in
= get_quota_root(in
, perms
);
13710 bool Client::is_quota_files_exceeded(Inode
*in
, const UserPerm
& perms
)
13712 return check_quota_condition(in
, perms
,
13713 [](const Inode
&in
) {
13714 return in
.quota
.max_files
&& in
.rstat
.rsize() >= in
.quota
.max_files
;
13718 bool Client::is_quota_bytes_exceeded(Inode
*in
, int64_t new_bytes
,
13719 const UserPerm
& perms
,
13720 std::list
<InodeRef
>* quota_roots
)
13722 return check_quota_condition(in
, perms
,
13723 [&new_bytes
, quota_roots
](const Inode
&in
) {
13725 quota_roots
->emplace_back(const_cast<Inode
*>(&in
));
13726 return in
.quota
.max_bytes
&& (in
.rstat
.rbytes
+ new_bytes
)
13727 > in
.quota
.max_bytes
;
13731 bool Client::is_quota_bytes_approaching(Inode
*in
, std::list
<InodeRef
>& quota_roots
)
13733 assert(in
->size
>= in
->reported_size
);
13734 const uint64_t size
= in
->size
- in
->reported_size
;
13736 for (auto& diri
: quota_roots
) {
13737 if (diri
->quota
.max_bytes
) {
13738 if (diri
->rstat
.rbytes
>= diri
->quota
.max_bytes
)
13741 uint64_t space
= diri
->quota
.max_bytes
- diri
->rstat
.rbytes
;
13742 if ((space
>> 4) < size
)
13756 int Client::check_pool_perm(Inode
*in
, int need
)
13758 if (!cct
->_conf
->client_check_pool_perm
)
13761 int64_t pool_id
= in
->layout
.pool_id
;
13762 std::string pool_ns
= in
->layout
.pool_ns
;
13763 std::pair
<int64_t, std::string
> perm_key(pool_id
, pool_ns
);
13766 auto it
= pool_perms
.find(perm_key
);
13767 if (it
== pool_perms
.end())
13769 if (it
->second
== POOL_CHECKING
) {
13770 // avoid concurrent checkings
13771 wait_on_list(waiting_for_pool_perm
);
13774 assert(have
& POOL_CHECKED
);
13780 if (in
->snapid
!= CEPH_NOSNAP
) {
13781 // pool permission check needs to write to the first object. But for snapshot,
13782 // head of the first object may have alread been deleted. To avoid creating
13783 // orphan object, skip the check for now.
13787 pool_perms
[perm_key
] = POOL_CHECKING
;
13790 snprintf(oid_buf
, sizeof(oid_buf
), "%llx.00000000", (unsigned long long)in
->ino
);
13791 object_t oid
= oid_buf
;
13793 SnapContext nullsnapc
;
13795 C_SaferCond rd_cond
;
13796 ObjectOperation rd_op
;
13797 rd_op
.stat(NULL
, (ceph::real_time
*)nullptr, NULL
);
13799 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), rd_op
,
13800 nullsnapc
, ceph::real_clock::now(), 0, &rd_cond
);
13802 C_SaferCond wr_cond
;
13803 ObjectOperation wr_op
;
13804 wr_op
.create(true);
13806 objecter
->mutate(oid
, OSDMap::file_to_object_locator(in
->layout
), wr_op
,
13807 nullsnapc
, ceph::real_clock::now(), 0, &wr_cond
);
13809 client_lock
.Unlock();
13810 int rd_ret
= rd_cond
.wait();
13811 int wr_ret
= wr_cond
.wait();
13812 client_lock
.Lock();
13814 bool errored
= false;
13816 if (rd_ret
== 0 || rd_ret
== -ENOENT
)
13818 else if (rd_ret
!= -EPERM
) {
13819 ldout(cct
, 10) << "check_pool_perm on pool " << pool_id
<< " ns " << pool_ns
13820 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
13824 if (wr_ret
== 0 || wr_ret
== -EEXIST
)
13825 have
|= POOL_WRITE
;
13826 else if (wr_ret
!= -EPERM
) {
13827 ldout(cct
, 10) << "check_pool_perm on pool " << pool_id
<< " ns " << pool_ns
13828 << " rd_err = " << rd_ret
<< " wr_err = " << wr_ret
<< dendl
;
13833 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
13834 // Raise EIO because actual error code might be misleading for
13835 // userspace filesystem user.
13836 pool_perms
.erase(perm_key
);
13837 signal_cond_list(waiting_for_pool_perm
);
13841 pool_perms
[perm_key
] = have
| POOL_CHECKED
;
13842 signal_cond_list(waiting_for_pool_perm
);
13845 if ((need
& CEPH_CAP_FILE_RD
) && !(have
& POOL_READ
)) {
13846 ldout(cct
, 10) << "check_pool_perm on pool " << pool_id
<< " ns " << pool_ns
13847 << " need " << ccap_string(need
) << ", but no read perm" << dendl
;
13850 if ((need
& CEPH_CAP_FILE_WR
) && !(have
& POOL_WRITE
)) {
13851 ldout(cct
, 10) << "check_pool_perm on pool " << pool_id
<< " ns " << pool_ns
13852 << " need " << ccap_string(need
) << ", but no write perm" << dendl
;
13859 int Client::_posix_acl_permission(Inode
*in
, const UserPerm
& perms
, unsigned want
)
13861 if (acl_type
== POSIX_ACL
) {
13862 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
13863 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
13865 return posix_acl_permits(access_acl
, in
->uid
, in
->gid
, perms
, want
);
13871 int Client::_posix_acl_chmod(Inode
*in
, mode_t mode
, const UserPerm
& perms
)
13873 if (acl_type
== NO_ACL
)
13876 int r
= _getattr(in
, CEPH_STAT_CAP_XATTR
, perms
, in
->xattr_version
== 0);
13880 if (acl_type
== POSIX_ACL
) {
13881 if (in
->xattrs
.count(ACL_EA_ACCESS
)) {
13882 const bufferptr
& access_acl
= in
->xattrs
[ACL_EA_ACCESS
];
13883 bufferptr
acl(access_acl
.c_str(), access_acl
.length());
13884 r
= posix_acl_access_chmod(acl
, mode
);
13887 r
= _do_setxattr(in
, ACL_EA_ACCESS
, acl
.c_str(), acl
.length(), 0, perms
);
13893 ldout(cct
, 10) << __func__
<< " ino " << in
->ino
<< " result=" << r
<< dendl
;
13897 int Client::_posix_acl_create(Inode
*dir
, mode_t
*mode
, bufferlist
& xattrs_bl
,
13898 const UserPerm
& perms
)
13900 if (acl_type
== NO_ACL
)
13903 if (S_ISLNK(*mode
))
13906 int r
= _getattr(dir
, CEPH_STAT_CAP_XATTR
, perms
, dir
->xattr_version
== 0);
13910 if (acl_type
== POSIX_ACL
) {
13911 if (dir
->xattrs
.count(ACL_EA_DEFAULT
)) {
13912 map
<string
, bufferptr
> xattrs
;
13914 const bufferptr
& default_acl
= dir
->xattrs
[ACL_EA_DEFAULT
];
13915 bufferptr
acl(default_acl
.c_str(), default_acl
.length());
13916 r
= posix_acl_inherit_mode(acl
, mode
);
13921 r
= posix_acl_equiv_mode(acl
.c_str(), acl
.length(), mode
);
13925 xattrs
[ACL_EA_ACCESS
] = acl
;
13928 if (S_ISDIR(*mode
))
13929 xattrs
[ACL_EA_DEFAULT
] = dir
->xattrs
[ACL_EA_DEFAULT
];
13933 ::encode(xattrs
, xattrs_bl
);
13936 *mode
&= ~umask_cb(callback_handle
);
13941 ldout(cct
, 10) << __func__
<< " dir ino " << dir
->ino
<< " result=" << r
<< dendl
;
13945 void Client::set_filer_flags(int flags
)
13947 Mutex::Locker
l(client_lock
);
13948 assert(flags
== 0 ||
13949 flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
13950 objecter
->add_global_op_flags(flags
);
13953 void Client::clear_filer_flags(int flags
)
13955 Mutex::Locker
l(client_lock
);
13956 assert(flags
== CEPH_OSD_FLAG_LOCALIZE_READS
);
13957 objecter
->clear_global_op_flag(flags
);
13961 * This is included in cap release messages, to cause
13962 * the MDS to wait until this OSD map epoch. It is necessary
13963 * in corner cases where we cancel RADOS ops, so that
13964 * nobody else tries to do IO to the same objects in
13965 * the same epoch as the cancelled ops.
13967 void Client::set_cap_epoch_barrier(epoch_t e
)
13969 ldout(cct
, 5) << __func__
<< " epoch = " << e
<< dendl
;
13970 cap_epoch_barrier
= e
;
13973 const char** Client::get_tracked_conf_keys() const
13975 static const char* keys
[] = {
13976 "client_cache_size",
13977 "client_cache_mid",
13979 "client_deleg_timeout",
13980 "client_deleg_break_on_open",
13986 void Client::handle_conf_change(const struct md_config_t
*conf
,
13987 const std::set
<std::string
> &changed
)
13989 Mutex::Locker
lock(client_lock
);
13991 if (changed
.count("client_cache_mid")) {
13992 lru
.lru_set_midpoint(cct
->_conf
->client_cache_mid
);
13994 if (changed
.count("client_acl_type")) {
13996 if (cct
->_conf
->client_acl_type
== "posix_acl")
13997 acl_type
= POSIX_ACL
;
14001 void intrusive_ptr_add_ref(Inode
*in
)
14006 void intrusive_ptr_release(Inode
*in
)
14008 in
->client
->put_inode(in
);
14011 mds_rank_t
Client::_get_random_up_mds() const
14013 assert(client_lock
.is_locked_by_me());
14015 std::set
<mds_rank_t
> up
;
14016 mdsmap
->get_up_mds_set(up
);
14019 return MDS_RANK_NONE
;
14020 std::set
<mds_rank_t
>::const_iterator p
= up
.begin();
14021 for (int n
= rand() % up
.size(); n
; n
--)
14027 StandaloneClient::StandaloneClient(Messenger
*m
, MonClient
*mc
)
14028 : Client(m
, mc
, new Objecter(m
->cct
, m
, mc
, NULL
, 0, 0))
14030 monclient
->set_messenger(m
);
14031 objecter
->set_client_incarnation(0);
14034 StandaloneClient::~StandaloneClient()
14037 objecter
= nullptr;
14040 int StandaloneClient::init()
14043 objectcacher
->start();
14046 client_lock
.Lock();
14047 assert(!initialized
);
14049 messenger
->add_dispatcher_tail(objecter
);
14050 messenger
->add_dispatcher_tail(this);
14052 monclient
->set_want_keys(CEPH_ENTITY_TYPE_MDS
| CEPH_ENTITY_TYPE_OSD
);
14053 int r
= monclient
->init();
14055 // need to do cleanup because we're in an intermediate init state
14057 client_lock
.Unlock();
14058 objecter
->shutdown();
14059 objectcacher
->stop();
14060 monclient
->shutdown();
14065 client_lock
.Unlock();
14071 void StandaloneClient::shutdown()
14073 Client::shutdown();
14074 objecter
->shutdown();
14075 monclient
->shutdown();