1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #ifndef CEPH_MDS_SERVER_H
16 #define CEPH_MDS_SERVER_H
18 #include <string_view>
20 #include <common/DecayCounter.h>
22 #include "include/common_fwd.h"
24 #include "messages/MClientReconnect.h"
25 #include "messages/MClientReply.h"
26 #include "messages/MClientRequest.h"
27 #include "messages/MClientSession.h"
28 #include "messages/MClientSnap.h"
29 #include "messages/MClientReclaim.h"
30 #include "messages/MClientReclaimReply.h"
31 #include "messages/MLock.h"
36 #include "MDSContext.h"
48 l_mdss_dispatch_client_request
,
49 l_mdss_dispatch_peer_request
,
50 l_mdss_handle_client_request
,
51 l_mdss_handle_client_session
,
52 l_mdss_handle_peer_request
,
53 l_mdss_req_create_latency
,
54 l_mdss_req_getattr_latency
,
55 l_mdss_req_getfilelock_latency
,
56 l_mdss_req_link_latency
,
57 l_mdss_req_lookup_latency
,
58 l_mdss_req_lookuphash_latency
,
59 l_mdss_req_lookupino_latency
,
60 l_mdss_req_lookupname_latency
,
61 l_mdss_req_lookupparent_latency
,
62 l_mdss_req_lookupsnap_latency
,
63 l_mdss_req_lssnap_latency
,
64 l_mdss_req_mkdir_latency
,
65 l_mdss_req_mknod_latency
,
66 l_mdss_req_mksnap_latency
,
67 l_mdss_req_open_latency
,
68 l_mdss_req_readdir_latency
,
69 l_mdss_req_rename_latency
,
70 l_mdss_req_renamesnap_latency
,
71 l_mdss_req_rmdir_latency
,
72 l_mdss_req_rmsnap_latency
,
73 l_mdss_req_rmxattr_latency
,
74 l_mdss_req_setattr_latency
,
75 l_mdss_req_setdirlayout_latency
,
76 l_mdss_req_setfilelock_latency
,
77 l_mdss_req_setlayout_latency
,
78 l_mdss_req_setxattr_latency
,
79 l_mdss_req_symlink_latency
,
80 l_mdss_req_unlink_latency
,
81 l_mdss_cap_revoke_eviction
,
82 l_mdss_cap_acquisition_throttle
,
// Clock types for the timestamps kept by this class: `clock` is
// ceph::coarse_mono_clock and `time` is ceph::coarse_mono_time.
88 using clock
= ceph::coarse_mono_clock
;
89 using time
= ceph::coarse_mono_time
;
91 enum class RecallFlags
: uint64_t {
96 ENFORCE_LIVENESS
= (1<<3),
// Constructor; `explicit`, so it never takes part in implicit conversions.
// Both arguments are raw pointers and the signature does not express who
// owns them -- NOTE(review): confirm lifetime expectations in the definition.
98 explicit Server(MDSRank
*m
, MetricsHandler
*metrics_handler
);
100 g_ceph_context
->get_perfcounters_collection()->remove(logger
);
102 delete reconnect_done
;
// Declarations only; the definitions are not part of this header.
// NOTE(review): create_logger() presumably sets up the `logger`
// PerfCounters member -- confirm against the definition.
105 void create_logger();
// Takes the incoming message as a const reference to cref_t<Message>.
108 void dispatch(const cref_t
<Message
> &m
);
110 void handle_osd_map();
112 // -- sessions and recovery --
// Both reconnect queries below are const member functions.
113 bool waiting_for_reconnect(client_t c
) const;
114 void dump_reconnect_status(Formatter
*f
) const;
// Accessor: returns the last_recall_state member by value.
116 time
last_recalled() const {
117 return last_recall_state
;
// Session lifecycle entry points (declarations only).
// Handler for an incoming MClientSession message.
120 void handle_client_session(const cref_t
<MClientSession
> &m
);
// Takes the two inode-number interval sets (inos_to_free, inos_to_purge) by
// const reference. NOTE(review): the name suggests this runs after a session
// open/close event has been journaled -- confirm in the definition.
121 void _session_logged(Session
*session
, uint64_t state_seq
, bool open
, version_t pv
,
122 const interval_set
<inodeno_t
>& inos_to_free
, version_t piv
,
123 const interval_set
<inodeno_t
>& inos_to_purge
, LogSegment
*ls
);
// All three maps are taken by non-const reference, so the callee is allowed
// to modify them. Returns a version_t -- TODO confirm which version it is.
124 version_t
prepare_force_open_sessions(map
<client_t
,entity_inst_t
> &cm
,
125 map
<client_t
,client_metadata_t
>& cmm
,
126 map
<client_t
,pair
<Session
*,uint64_t> >& smap
);
// Takes the same smap type as prepare_force_open_sessions, here read-only.
// dec_import defaults to true.
127 void finish_force_open_sessions(const map
<client_t
,pair
<Session
*,uint64_t> >& smap
,
128 bool dec_import
=true);
129 void flush_client_sessions(set
<client_t
>& client_set
, MDSGatherBuilder
& gather
);
130 void finish_flush_session(Session
*session
, version_t seq
);
131 void terminate_sessions();
132 void find_idle_sessions();
// kill_session and journal_close_session both accept a Context *on_safe.
// NOTE(review): presumably completed once the close is durable; whether
// nullptr is accepted is not visible here -- confirm.
134 void kill_session(Session
*session
, Context
*on_safe
);
// Returns a size_t -- NOTE(review): presumably a count of affected sessions;
// verify against the definition.
135 size_t apply_blocklist(const std::set
<entity_addr_t
> &blocklist
);
136 void journal_close_session(Session
*session
, int state
, Context
*on_safe
);
// Session reclaim.
// Number of client ids currently held in client_reclaim_gather.
138 size_t get_num_pending_reclaim() const { return client_reclaim_gather
.size(); }
// Returns a raw Session pointer -- NOTE(review): presumably nullptr when no
// session has the given uuid; confirm in the definition.
139 Session
*find_session_by_uuid(std::string_view uuid
);
140 void reclaim_session(Session
*session
, const cref_t
<MClientReclaim
> &m
);
// `reply` defaults to nullptr, so callers may omit it.
141 void finish_reclaim_session(Session
*session
, const ref_t
<MClientReclaimReply
> &reply
=nullptr);
// Handler for an incoming MClientReclaim message.
142 void handle_client_reclaim(const cref_t
<MClientReclaim
> &m
);
// Reconnect phase and client-state recall (declarations only).
// NOTE(review): the argument name mirrors the reconnect_done member, so it is
// presumably stored there -- confirm in the definition.
144 void reconnect_clients(MDSContext
*reconnect_done_
);
// Handler for an incoming MClientReconnect message.
145 void handle_client_reconnect(const cref_t
<MClientReconnect
> &m
);
// client_metadata is a non-const reference, so the callee may modify it.
146 void infer_supported_features(Session
*session
, client_metadata_t
& client_metadata
);
147 void update_required_client_features();
// The next declaration is commented out in the original source.
149 //void process_reconnect_cap(CInode *in, int from, ceph_mds_cap_reconnect& capinfo);
150 void reconnect_gather_finish();
151 void reconnect_tick();
// `locks` is a bufferlist passed by value.
152 void recover_filelocks(CInode
*in
, bufferlist locks
, int64_t client
);
// Returns a (bool, uint64_t) pair. The second parameter is an unnamed
// RecallFlags that defaults to RecallFlags::NONE.
// NOTE(review): the meaning of the two pair members is not visible here.
154 std::pair
<bool, uint64_t> recall_client_state(MDSGatherBuilder
* gather
, RecallFlags
=RecallFlags::NONE
);
155 void force_clients_readonly();
// Client request pipeline (declarations only).
// Handler for an incoming MClientRequest message.
158 void handle_client_request(const cref_t
<MClientRequest
> &m
);
// journal_and_reply and submit_mdlog_entry both take a LogEvent* and an
// MDSLogContextBase*; note the parameter order differs between the two
// (mdr comes first in one and third in the other).
160 void journal_and_reply(MDRequestRef
& mdr
, CInode
*tracei
, CDentry
*tracedn
,
161 LogEvent
*le
, MDSLogContextBase
*fin
);
162 void submit_mdlog_entry(LogEvent
*le
, MDSLogContextBase
*fin
,
163 MDRequestRef
& mdr
, std::string_view event
);
164 void dispatch_client_request(MDRequestRef
& mdr
);
165 void perf_gather_op_latency(const cref_t
<MClientRequest
> &req
, utime_t lat
);
166 void early_reply(MDRequestRef
& mdr
, CInode
*tracei
, CDentry
*tracedn
);
// r defaults to 0.
167 void respond_to_request(MDRequestRef
& mdr
, int r
= 0);
168 void set_trace_dist(const ref_t
<MClientReply
> &reply
, CInode
*in
, CDentry
*dn
,
// Peer request handling (declarations only). The message-taking handlers
// all receive an MMDSPeerRequest through a const cref_t reference.
171 void handle_peer_request(const cref_t
<MMDSPeerRequest
> &m
);
172 void handle_peer_request_reply(const cref_t
<MMDSPeerRequest
> &m
);
173 void dispatch_peer_request(MDRequestRef
& mdr
);
174 void handle_peer_auth_pin(MDRequestRef
& mdr
);
175 void handle_peer_auth_pin_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
);
// Access checks, inode preparation and path-locking helpers (declarations only).
// Note: the CDir* parameter of check_fragment_space is named `in`, the same
// name the neighbouring declarations use for CInode* parameters.
178 bool check_fragment_space(MDRequestRef
& mdr
, CDir
*in
);
179 bool check_access(MDRequestRef
& mdr
, CInode
*in
, unsigned mask
);
// Variant that takes the session and the caller/setattr uid and gid values
// explicitly instead of an MDRequestRef.
180 bool _check_access(Session
*session
, CInode
*in
, unsigned mask
, int caller_uid
, int caller_gid
, int setattr_uid
, int setattr_gid
);
181 CDentry
*prepare_stray_dentry(MDRequestRef
& mdr
, CInode
*in
);
// layout defaults to nullptr.
182 CInode
* prepare_new_inode(MDRequestRef
& mdr
, CDir
*dir
, inodeno_t useino
, unsigned mode
,
183 const file_layout_t
*layout
=nullptr);
184 void journal_allocated_inos(MDRequestRef
& mdr
, EMetaBlob
*blob
);
185 void apply_allocated_inos(MDRequestRef
& mdr
, Session
*session
);
// no_want_auth defaults to false.
187 CInode
* rdlock_path_pin_ref(MDRequestRef
& mdr
, bool want_auth
,
188 bool no_want_auth
=false);
// okexist and want_layout both default to false.
189 CDentry
* rdlock_path_xlock_dentry(MDRequestRef
& mdr
, bool create
,
190 bool okexist
=false, bool want_layout
=false);
// Returns a pair of CDentry pointers -- NOTE(review): the order
// (presumably destination/source) is not visible here; confirm.
191 std::pair
<CDentry
*, CDentry
*>
192 rdlock_two_paths_xlock_destdn(MDRequestRef
& mdr
, bool xlock_srcdn
);
194 CDir
* try_open_auth_dirfrag(CInode
*diri
, frag_t fg
, MDRequestRef
& mdr
);
196 // requests on existing inodes.
// Each handler takes the request as a non-const MDRequestRef reference.
// is_lookup is a required flag (no default).
197 void handle_client_getattr(MDRequestRef
& mdr
, bool is_lookup
);
// want_parent and want_dentry are required flags (no defaults).
198 void handle_client_lookup_ino(MDRequestRef
& mdr
,
199 bool want_parent
, bool want_dentry
);
200 void _lookup_snap_ino(MDRequestRef
& mdr
);
// NOTE(review): the `int r` parameter suggests a continuation invoked with a
// result code -- confirm in the definition.
201 void _lookup_ino_2(MDRequestRef
& mdr
, int r
);
202 void handle_client_readdir(MDRequestRef
& mdr
);
203 void handle_client_file_setlock(MDRequestRef
& mdr
);
204 void handle_client_file_readlock(MDRequestRef
& mdr
);
// setattr / layout / virtual-xattr / xattr handling (declarations only).
// want_layout and xlock_snaplock both default to false.
206 bool xlock_policylock(MDRequestRef
& mdr
, CInode
*in
,
207 bool want_layout
=false, bool xlock_snaplock
=false);
208 CInode
* try_get_auth_inode(MDRequestRef
& mdr
, inodeno_t ino
);
209 void handle_client_setattr(MDRequestRef
& mdr
);
210 void handle_client_setlayout(MDRequestRef
& mdr
);
211 void handle_client_setdirlayout(MDRequestRef
& mdr
);
// parse_quota_vxattr and parse_layout_vxattr take `name` and `value` as
// `string` by value (copied per call) and return int.
// NOTE(review): presumably 0 on success / negative errno on failure, with
// *quota and *layout as output parameters -- confirm in the definitions.
213 int parse_quota_vxattr(string name
, string value
, quota_info_t
*quota
);
214 void create_quota_realm(CInode
*in
);
// validate defaults to true.
215 int parse_layout_vxattr(string name
, string value
, const OSDMap
& osdmap
,
216 file_layout_t
*layout
, bool validate
=true);
217 int check_layout_vxattr(MDRequestRef
& mdr
,
220 file_layout_t
*layout
);
221 void handle_set_vxattr(MDRequestRef
& mdr
, CInode
*cur
);
222 void handle_remove_vxattr(MDRequestRef
& mdr
, CInode
*cur
);
223 void handle_client_setxattr(MDRequestRef
& mdr
);
224 void handle_client_removexattr(MDRequestRef
& mdr
);
226 void handle_client_fsync(MDRequestRef
& mdr
);
// Open and create-type request handlers (declarations only).
229 void handle_client_open(MDRequestRef
& mdr
);
230 void handle_client_openc(MDRequestRef
& mdr
); // O_CREAT variant.
231 void do_open_truncate(MDRequestRef
& mdr
, int cmode
); // O_TRUNC variant.
234 void handle_client_mknod(MDRequestRef
& mdr
);
235 void handle_client_mkdir(MDRequestRef
& mdr
);
236 void handle_client_symlink(MDRequestRef
& mdr
);
// Hard-link request handling: local and remote variants, the peer-side
// steps, and rollback (declarations only).
239 void handle_client_link(MDRequestRef
& mdr
);
240 void _link_local(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
, SnapRealm
*target_realm
);
// The trailing version_t, version_t and bool parameters are unnamed in this
// declaration; their meaning has to be read from the definition.
241 void _link_local_finish(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
,
242 version_t
, version_t
, bool);
244 void _link_remote(MDRequestRef
& mdr
, bool inc
, CDentry
*dn
, CInode
*targeti
);
245 void _link_remote_finish(MDRequestRef
& mdr
, bool inc
, CDentry
*dn
, CInode
*targeti
,
248 void handle_peer_link_prep(MDRequestRef
& mdr
);
249 void _logged_peer_link(MDRequestRef
& mdr
, CInode
*targeti
, bool adjust_realm
);
250 void _commit_peer_link(MDRequestRef
& mdr
, int r
, CInode
*targeti
);
251 void _committed_peer(MDRequestRef
& mdr
); // use for rename, too
252 void handle_peer_link_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &m
);
// rbl is a non-const bufferlist reference.
253 void do_link_rollback(bufferlist
&rbl
, mds_rank_t leader
, MDRequestRef
& mdr
);
// `split` maps client_t to ref_t<MClientSnap> and is taken by non-const reference.
254 void _link_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
,
255 map
<client_t
,ref_t
<MClientSnap
>>& split
);
// Unlink / rmdir request handling, including the peer-side steps and
// rollback (declarations only).
258 void handle_client_unlink(MDRequestRef
& mdr
);
// Two emptiness checks with identical signatures.
// NOTE(review): the "_unlocked" suffix suggests different locking
// preconditions -- confirm in the definitions.
259 bool _dir_is_nonempty_unlocked(MDRequestRef
& mdr
, CInode
*rmdiri
);
260 bool _dir_is_nonempty(MDRequestRef
& mdr
, CInode
*rmdiri
);
261 void _unlink_local(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
);
262 void _unlink_local_finish(MDRequestRef
& mdr
,
263 CDentry
*dn
, CDentry
*straydn
,
// `trace` is a vector of CDentry pointers taken by non-const reference.
265 bool _rmdir_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, vector
<CDentry
*>& trace
, CDentry
*straydn
);
266 void handle_peer_rmdir_prep(MDRequestRef
& mdr
);
267 void _logged_peer_rmdir(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*straydn
);
268 void _commit_peer_rmdir(MDRequestRef
& mdr
, int r
, CDentry
*straydn
);
269 void handle_peer_rmdir_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &ack
);
// Same parameter list as do_link_rollback.
270 void do_rmdir_rollback(bufferlist
&rbl
, mds_rank_t leader
, MDRequestRef
& mdr
);
271 void _rmdir_rollback_finish(MDRequestRef
& mdr
, metareqid_t reqid
, CDentry
*dn
, CDentry
*straydn
);
// Rename entry point and snapshot request handlers (declarations only).
274 void handle_client_rename(MDRequestRef
& mdr
);
275 void _rename_finish(MDRequestRef
& mdr
,
276 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
);
// Snapshot operations: each handle_client_*snap that modifies state has a
// matching _*_finish declaration taking the directory inode `diri`.
278 void handle_client_lssnap(MDRequestRef
& mdr
);
279 void handle_client_mksnap(MDRequestRef
& mdr
);
// info is a non-const SnapInfo reference.
280 void _mksnap_finish(MDRequestRef
& mdr
, CInode
*diri
, SnapInfo
&info
);
281 void handle_client_rmsnap(MDRequestRef
& mdr
);
282 void _rmsnap_finish(MDRequestRef
& mdr
, CInode
*diri
, snapid_t snapid
);
283 void handle_client_renamesnap(MDRequestRef
& mdr
);
284 void _renamesnap_finish(MDRequestRef
& mdr
, CInode
*diri
, snapid_t snapid
);
// Rename helpers, peer-side rename steps and rollback (declarations only).
// The witness set and both dentry traces are non-const references.
// NOTE(review): `witnesse` may be a misspelling of `witnesses`; a parameter
// name in a declaration does not affect callers.
287 bool _rename_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, set
<mds_rank_t
> &witnesse
,
288 vector
<CDentry
*>& srctrace
, vector
<CDentry
*>& dsttrace
, CDentry
*straydn
);
289 version_t
_rename_prepare_import(MDRequestRef
& mdr
, CDentry
*srcdn
, bufferlist
*client_map_bl
);
290 bool _need_force_journal(CInode
*diri
, bool empty
);
291 void _rename_prepare(MDRequestRef
& mdr
,
292 EMetaBlob
*metablob
, bufferlist
*client_map_bl
,
293 CDentry
*srcdn
, CDentry
*destdn
, std::string_view alternate_name
,
295 /* set not_journaling=true if you're going to discard the results --
296 * this bypasses the asserts to make sure we're journaling the right
297 * things on the right nodes */
// NOTE(review): the comment above mentions a `not_journaling` flag, but the
// _rename_apply declaration below has no such parameter; the comment looks
// stale or belongs to another declaration -- verify.
298 void _rename_apply(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
);
301 void handle_peer_rename_prep(MDRequestRef
& mdr
);
302 void handle_peer_rename_prep_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &m
);
303 void handle_peer_rename_notify_ack(MDRequestRef
& mdr
, const cref_t
<MMDSPeerRequest
> &m
);
304 void _peer_rename_sessions_flushed(MDRequestRef
& mdr
);
305 void _logged_peer_rename(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
);
306 void _commit_peer_rename(MDRequestRef
& mdr
, int r
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
);
// finish_mdr defaults to false.
307 void do_rename_rollback(bufferlist
&rbl
, mds_rank_t leader
, MDRequestRef
& mdr
, bool finish_mdr
=false);
// `staydn` is spelled without the 'r', unlike `straydn` in the sibling
// declarations (harmless in a declaration). `splits` is written as a
// 2-element array of maps; as a function parameter it is adjusted to a
// pointer to the first map, so the size is not enforced by the compiler.
308 void _rename_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
, CDentry
*srcdn
, version_t srcdnpv
,
309 CDentry
*destdn
, CDentry
*staydn
, map
<client_t
,ref_t
<MClientSnap
>> splits
[2],
312 void evict_cap_revoke_non_responders();
// Receives the set of changed names as a const std::set<std::string> reference.
313 void handle_conf_change(const std::set
<std::string
>& changed
);
// Starts out false.
315 bool terminating_sessions
= false;
// Set of client ids; its size is what get_num_pending_reclaim() reports.
317 set
<client_t
> client_reclaim_gather
;
// These classes are granted access to this class's non-public members.
320 friend class MDSContinuation
;
321 friend class ServerContext
;
322 friend class ServerLogContext
;
323 friend class Batch_Getattr_Lookup
;
325 // placeholder for validation handler to store xattr specific
328 virtual ~XattrInfo() {
332 struct MirrorXattrInfo
: XattrInfo
{
333 std::string cluster_id
;
336 static const std::string MIRROR_INFO_REGEX
;
337 static const std::string CLUSTER_ID
;
338 static const std::string FS_ID
;
340 MirrorXattrInfo(std::string_view cluster_id
,
341 std::string_view fs_id
)
342 : cluster_id(cluster_id
),
349 std::string xattr_name
;
350 const bufferlist
&xattr_value
;
353 std::unique_ptr
<XattrInfo
> xinfo
;
355 XattrOp(int op
, std::string_view xattr_name
, const bufferlist
&xattr_value
, int flags
)
357 xattr_name(xattr_name
),
358 xattr_value(xattr_value
),
// Describes how one xattr is handled: its name, a description, and three
// pointers to Server member functions (validate, setxattr, removexattr).
363 struct XattrHandler
{
// Both strings are const members, so a handler's name and description
// cannot be changed after construction.
364 const std::string xattr_name
;
365 const std::string description
;
367 // basic checks are to be done in this handler. return -errno to
368 // reject xattr request (set or remove), zero to proceed. handlers
369 // may parse xattr value for verification if needed and have an
370 // option to store custom data in XattrOp::xinfo.
371 int (Server::*validate
)(CInode
*cur
, const InodeStoreBase::xattr_map_const_ptr xattrs
,
374 // set xattr for an inode in xattr_map
375 void (Server::*setxattr
)(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
376 const XattrOp
&xattr_op
);
378 // remove xattr for an inode from xattr_map
379 void (Server::*removexattr
)(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
380 const XattrOp
&xattr_op
);
// Handler name "<default>", defined inline in the header.
// NOTE(review): presumably the xattr_handlers entry with this name is the
// fallback returned by get_xattr_or_default_handler() -- confirm.
383 inline static const std::string DEFAULT_HANDLER
= "<default>";
// Handler table; declared with unknown bound here, so its contents and size
// come from the definition elsewhere.
384 static const XattrHandler xattr_handlers
[];
386 const XattrHandler
* get_xattr_or_default_handler(std::string_view xattr_name
);
388 // generic variant to set/remove xattr in/from xattr_map
389 int xattr_validate(CInode
*cur
, const InodeStoreBase::xattr_map_const_ptr xattrs
,
390 const std::string
&xattr_name
, int op
, int flags
);
391 void xattr_set(InodeStoreBase::xattr_map_ptr xattrs
, const std::string
&xattr_name
,
392 const bufferlist
&xattr_value
);
393 void xattr_rm(InodeStoreBase::xattr_map_ptr xattrs
, const std::string
&xattr_name
);
// The set/remove handlers below take (CInode*, xattr_map_ptr, const XattrOp&),
// the same parameter list as XattrHandler::setxattr / removexattr.
395 // default xattr handlers
396 int default_xattr_validate(CInode
*cur
, const InodeStoreBase::xattr_map_const_ptr xattrs
,
398 void default_setxattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
399 const XattrOp
&xattr_op
);
400 void default_removexattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
401 const XattrOp
&xattr_op
);
403 // mirror info xattr handler
// cluster_id and fs_id are non-const string references.
// NOTE(review): presumably outputs filled from `value`; returns int,
// presumably 0 / -errno -- confirm in the definition.
404 int parse_mirror_info_xattr(const std::string
&name
, const std::string
&value
,
405 std::string
&cluster_id
, std::string
&fs_id
);
406 int mirror_info_xattr_validate(CInode
*cur
, const InodeStoreBase::xattr_map_const_ptr xattrs
,
408 void mirror_info_setxattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
409 const XattrOp
&xattr_op
);
410 void mirror_info_removexattr_handler(CInode
*cur
, InodeStoreBase::xattr_map_ptr xattrs
,
411 const XattrOp
&xattr_op
);
// Returns true when xattr_name is one of the names this function treats as
// a ceph vxattr:
//  - any name that begins with "ceph.dir.layout", "ceph.file.layout" or
//    "ceph.quota", or
//  - exactly "ceph.dir.subvolume", "ceph.dir.pin", "ceph.dir.pin.random" or
//    "ceph.dir.pin.distributed".
// rfind(prefix, 0) == 0 is a starts-with test: a search that may only begin
// at index 0 can only match there. The prefix tests have no delimiter, so
// any longer name sharing the prefix also matches.
// The ""sv literals need the std string_view literal operator in scope.
413 static bool is_ceph_vxattr(std::string_view xattr_name
) {
414 return xattr_name
.rfind("ceph.dir.layout", 0) == 0 ||
415 xattr_name
.rfind("ceph.file.layout", 0) == 0 ||
416 xattr_name
.rfind("ceph.quota", 0) == 0 ||
417 xattr_name
== "ceph.dir.subvolume"sv
||
418 xattr_name
== "ceph.dir.pin"sv
||
419 xattr_name
== "ceph.dir.pin.random"sv
||
420 xattr_name
== "ceph.dir.pin.distributed"sv
;
// Decides whether an xattr name is acceptable with respect to the "ceph."
// prefix. Names that do not start with "ceph." take the first branch, which
// the comment below describes as "allow". For names that do start with
// "ceph.", the final return accepts only the exact name "ceph.mirror.info".
423 static bool is_allowed_ceph_xattr(std::string_view xattr_name
) {
424 // not a ceph xattr -- allow!
// rfind("ceph.", 0) != 0 means "does not start with ceph.".
425 if (xattr_name
.rfind("ceph.", 0) != 0) {
429 return xattr_name
== "ceph.mirror.info";
// Takes the reply as a const ref_t<MClientReply> reference.
432 void reply_client_request(MDRequestRef
& mdr
, const ref_t
<MClientReply
> &reply
);
// Single-session counterpart of flush_client_sessions(set<client_t>&,
// MDSGatherBuilder&); the gather builder is a non-const reference.
433 void flush_session(Session
*session
, MDSGatherBuilder
& gather
);
// Perf counters handle; null until assigned.
438 PerfCounters
*logger
= nullptr;
440 // OSDMap full status, used to generate CEPHFS_ENOSPC on some operations
441 bool is_full
= false;
443 // State for while in reconnect
444 MDSContext
*reconnect_done
= nullptr;
445 int failed_reconnects
= 0;
446 bool reconnect_evicting
= false; // true if I am waiting for evictions to complete
447 // before proceeding to reconnect_gather_finish
// Both reconnect timestamps start at clock::zero().
448 time reconnect_start
= clock::zero();
449 time reconnect_last_seen
= clock::zero();
450 set
<client_t
> client_reconnect_gather
; // clients I still need a reconnect msg from.
451 set
<client_t
> client_reconnect_denied
; // clients whose reconnect msgs have been denied.
453 feature_bitset_t supported_features
;
454 feature_bitset_t required_client_features
;
// The values below are in-class defaults.
// NOTE(review): presumably overwritten from configuration at startup or in
// handle_conf_change() -- confirm.
456 bool forward_all_requests_to_auth
= false;
457 bool replay_unsafe_with_closed_session
= false;
458 double cap_revoke_eviction_timeout
= 0;
459 uint64_t max_snaps_per_dir
= 100;
460 unsigned delegate_inos_pct
= 0;
462 DecayCounter recall_throttle
;
// Returned by last_recalled(); no in-class initializer.
463 time last_recall_state
;
// Raw pointer with no in-class initializer: indeterminate unless the
// constructor sets it. NOTE(review): verify it is assigned from the
// constructor's metrics_handler argument.
465 MetricsHandler
*metrics_handler
;
467 // Cache cap acquisition throttle configs
// None of these four scalars has an in-class initializer, so they hold
// indeterminate values until assigned. NOTE(review): verify the constructor
// initializes each of them before first use.
468 uint64_t max_caps_per_client
;
469 uint64_t cap_acquisition_throttle
;
470 double max_caps_throttle_ratio
;
471 double caps_throttle_retry_request_timeout
;
// Default member initializer: reads the "mds_alternate_name_max" option
// through g_conf() when the object is constructed.
473 size_t alternate_name_max
= g_conf().get_val
<Option::size_t>("mds_alternate_name_max");
// Bitwise OR of two Server::RecallFlags values. Both operands are converted
// to the enum's underlying integer type T, OR-ed, and the result is cast
// back to RecallFlags. constexpr, so it is usable in constant expressions.
476 static inline constexpr auto operator|(Server::RecallFlags a
, Server::RecallFlags b
) {
477 using T
= std::underlying_type
<Server::RecallFlags
>::type
;
478 return static_cast<Server::RecallFlags
>(static_cast<T
>(a
) | static_cast<T
>(b
));
// Bitwise AND of two Server::RecallFlags values, computed on the enum's
// underlying integer type T and cast back to RecallFlags. The result is a
// RecallFlags, not a bool; operator! is what tests it for "no bits set".
480 static inline constexpr auto operator&(Server::RecallFlags a
, Server::RecallFlags b
) {
481 using T
= std::underlying_type
<Server::RecallFlags
>::type
;
482 return static_cast<Server::RecallFlags
>(static_cast<T
>(a
) & static_cast<T
>(b
));
// Streams a RecallFlags value as "0x" followed by its underlying integer in
// hexadecimal, and returns the stream.
// Note: the stream is switched to std::dec afterwards unconditionally, so a
// caller that had set another base (e.g. std::hex) finds it reset to decimal.
484 static inline std::ostream
& operator<<(std::ostream
& os
, const Server::RecallFlags
& f
) {
485 using T
= std::underlying_type
<Server::RecallFlags
>::type
;
486 return os
<< "0x" << std::hex
<< static_cast<T
>(f
) << std::dec
;
// Logical NOT for RecallFlags: true exactly when the underlying integer
// value is 0, i.e. no flag bit is set.
488 static inline constexpr bool operator!(const Server::RecallFlags
& f
) {
489 using T
= std::underlying_type
<Server::RecallFlags
>::type
;
490 return static_cast<T
>(f
) == static_cast<T
>(0);