1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #ifndef CEPH_MDS_SERVER_H
16 #define CEPH_MDS_SERVER_H
18 #include <string_view>
20 #include <common/DecayCounter.h>
22 #include "messages/MClientReconnect.h"
23 #include "messages/MClientReply.h"
24 #include "messages/MClientRequest.h"
25 #include "messages/MClientSession.h"
26 #include "messages/MClientSnap.h"
27 #include "messages/MClientReclaim.h"
28 #include "messages/MClientReclaimReply.h"
29 #include "messages/MLock.h"
33 #include "MDSContext.h"
45 l_mdss_dispatch_client_request
,
46 l_mdss_dispatch_slave_request
,
47 l_mdss_handle_client_request
,
48 l_mdss_handle_client_session
,
49 l_mdss_handle_slave_request
,
50 l_mdss_req_create_latency
,
51 l_mdss_req_getattr_latency
,
52 l_mdss_req_getfilelock_latency
,
53 l_mdss_req_link_latency
,
54 l_mdss_req_lookup_latency
,
55 l_mdss_req_lookuphash_latency
,
56 l_mdss_req_lookupino_latency
,
57 l_mdss_req_lookupname_latency
,
58 l_mdss_req_lookupparent_latency
,
59 l_mdss_req_lookupsnap_latency
,
60 l_mdss_req_lssnap_latency
,
61 l_mdss_req_mkdir_latency
,
62 l_mdss_req_mknod_latency
,
63 l_mdss_req_mksnap_latency
,
64 l_mdss_req_open_latency
,
65 l_mdss_req_readdir_latency
,
66 l_mdss_req_rename_latency
,
67 l_mdss_req_renamesnap_latency
,
68 l_mdss_req_rmdir_latency
,
69 l_mdss_req_rmsnap_latency
,
70 l_mdss_req_rmxattr_latency
,
71 l_mdss_req_setattr_latency
,
72 l_mdss_req_setdirlayout_latency
,
73 l_mdss_req_setfilelock_latency
,
74 l_mdss_req_setlayout_latency
,
75 l_mdss_req_setxattr_latency
,
76 l_mdss_req_symlink_latency
,
77 l_mdss_req_unlink_latency
,
78 l_mdss_cap_revoke_eviction
,
84 using clock
= ceph::coarse_mono_clock
;
85 using time
= ceph::coarse_mono_time
;
93 // OSDMap full status, used to generate ENOSPC on some operations
96 // State used while in reconnect
97 MDSContext
*reconnect_done
;
98 int failed_reconnects
;
99 bool reconnect_evicting
; // true if I am waiting for evictions to complete
100 // before proceeding to reconnect_gather_finish
101 time reconnect_start
= clock::zero();
102 time reconnect_last_seen
= clock::zero();
103 set
<client_t
> client_reconnect_gather
; // clients i need a reconnect msg from.
105 feature_bitset_t supported_features
;
106 feature_bitset_t required_client_features
;
108 bool replay_unsafe_with_closed_session
= false;
109 double cap_revoke_eviction_timeout
= 0;
111 friend class MDSContinuation
;
112 friend class ServerContext
;
113 friend class ServerLogContext
;
116 bool terminating_sessions
;
118 explicit Server(MDSRank
*m
);
120 g_ceph_context
->get_perfcounters_collection()->remove(logger
);
122 delete reconnect_done
;
125 void create_logger();
128 void dispatch(const Message::const_ref
&m
);
130 void handle_osd_map();
132 // -- sessions and recovery --
133 bool waiting_for_reconnect(client_t c
) const;
134 void dump_reconnect_status(Formatter
*f
) const;
136 time
last_recalled() const {
137 return last_recall_state
;
140 void handle_client_session(const MClientSession::const_ref
&m
);
141 void _session_logged(Session
*session
, uint64_t state_seq
,
142 bool open
, version_t pv
, interval_set
<inodeno_t
>& inos
,version_t piv
);
143 version_t
prepare_force_open_sessions(map
<client_t
,entity_inst_t
> &cm
,
144 map
<client_t
,client_metadata_t
>& cmm
,
145 map
<client_t
,pair
<Session
*,uint64_t> >& smap
);
146 void finish_force_open_sessions(const map
<client_t
,pair
<Session
*,uint64_t> >& smap
,
147 bool dec_import
=true);
148 void flush_client_sessions(set
<client_t
>& client_set
, MDSGatherBuilder
& gather
);
149 void finish_flush_session(Session
*session
, version_t seq
);
150 void terminate_sessions();
151 void find_idle_sessions();
152 void kill_session(Session
*session
, Context
*on_safe
);
153 size_t apply_blacklist(const std::set
<entity_addr_t
> &blacklist
);
154 void journal_close_session(Session
*session
, int state
, Context
*on_safe
);
156 set
<client_t
> client_reclaim_gather
;
157 size_t get_num_pending_reclaim() const { return client_reclaim_gather
.size(); }
158 Session
*find_session_by_uuid(std::string_view uuid
);
159 void reclaim_session(Session
*session
, const MClientReclaim::const_ref
&m
);
160 void finish_reclaim_session(Session
*session
, const MClientReclaimReply::ref
&reply
=nullptr);
161 void handle_client_reclaim(const MClientReclaim::const_ref
&m
);
163 void reconnect_clients(MDSContext
*reconnect_done_
);
164 void handle_client_reconnect(const MClientReconnect::const_ref
&m
);
165 void infer_supported_features(Session
*session
, client_metadata_t
& client_metadata
);
166 void update_required_client_features();
168 //void process_reconnect_cap(CInode *in, int from, ceph_mds_cap_reconnect& capinfo);
169 void reconnect_gather_finish();
170 void reconnect_tick();
171 void recover_filelocks(CInode
*in
, bufferlist locks
, int64_t client
);
// Scoped enumeration of flag bits accepted by recall_client_state(); the
// underlying storage type is uint64_t. Values are combined and tested with
// the free operator| / operator& / operator! helpers defined for
// Server::RecallFlags.
// NOTE(review): only two enumerators are visible in this extract;
// RecallFlags::NONE is used as the default argument of recall_client_state()
// but its definition is not shown here -- confirm against the full header.
173 enum class RecallFlags
: uint64_t {
176 ENFORCE_MAX
= (1<<1), // bit 1
178 ENFORCE_LIVENESS
= (1<<3), // bit 3
180 std::pair
<bool, uint64_t> recall_client_state(MDSGatherBuilder
* gather
, RecallFlags
=RecallFlags::NONE
);
181 void force_clients_readonly();
184 void handle_client_request(const MClientRequest::const_ref
&m
);
186 void journal_and_reply(MDRequestRef
& mdr
, CInode
*tracei
, CDentry
*tracedn
,
187 LogEvent
*le
, MDSLogContextBase
*fin
);
188 void submit_mdlog_entry(LogEvent
*le
, MDSLogContextBase
*fin
,
189 MDRequestRef
& mdr
, std::string_view event
);
190 void dispatch_client_request(MDRequestRef
& mdr
);
191 void perf_gather_op_latency(const MClientRequest::const_ref
&req
, utime_t lat
);
192 void early_reply(MDRequestRef
& mdr
, CInode
*tracei
, CDentry
*tracedn
);
193 void respond_to_request(MDRequestRef
& mdr
, int r
= 0);
194 void set_trace_dist(Session
*session
, const MClientReply::ref
&reply
, CInode
*in
, CDentry
*dn
,
196 int num_dentries_wanted
,
200 void handle_slave_request(const MMDSSlaveRequest::const_ref
&m
);
201 void handle_slave_request_reply(const MMDSSlaveRequest::const_ref
&m
);
202 void dispatch_slave_request(MDRequestRef
& mdr
);
203 void handle_slave_auth_pin(MDRequestRef
& mdr
);
204 void handle_slave_auth_pin_ack(MDRequestRef
& mdr
, const MMDSSlaveRequest::const_ref
&ack
);
207 bool check_fragment_space(MDRequestRef
& mdr
, CDir
*in
);
208 bool check_access(MDRequestRef
& mdr
, CInode
*in
, unsigned mask
);
209 bool _check_access(Session
*session
, CInode
*in
, unsigned mask
, int caller_uid
, int caller_gid
, int setattr_uid
, int setattr_gid
);
210 CDir
*validate_dentry_dir(MDRequestRef
& mdr
, CInode
*diri
, std::string_view dname
);
211 CDir
*traverse_to_auth_dir(MDRequestRef
& mdr
, vector
<CDentry
*> &trace
, filepath refpath
);
212 CDentry
*prepare_null_dentry(MDRequestRef
& mdr
, CDir
*dir
, std::string_view dname
, bool okexist
=false);
213 CDentry
*prepare_stray_dentry(MDRequestRef
& mdr
, CInode
*in
);
214 CInode
* prepare_new_inode(MDRequestRef
& mdr
, CDir
*dir
, inodeno_t useino
, unsigned mode
,
215 file_layout_t
*layout
=NULL
);
216 void journal_allocated_inos(MDRequestRef
& mdr
, EMetaBlob
*blob
);
217 void apply_allocated_inos(MDRequestRef
& mdr
, Session
*session
);
219 CInode
* rdlock_path_pin_ref(MDRequestRef
& mdr
, int n
, MutationImpl::LockOpVec
& lov
,
220 bool want_auth
, bool no_want_auth
=false,
221 file_layout_t
**layout
=nullptr,
222 bool no_lookup
=false);
223 CDentry
* rdlock_path_xlock_dentry(MDRequestRef
& mdr
, int n
,
224 MutationImpl::LockOpVec
& lov
,
225 bool okexist
, bool mustexist
, bool alwaysxlock
,
226 file_layout_t
**layout
=nullptr);
228 CDir
* try_open_auth_dirfrag(CInode
*diri
, frag_t fg
, MDRequestRef
& mdr
);
231 // requests on existing inodes.
232 void handle_client_getattr(MDRequestRef
& mdr
, bool is_lookup
);
233 void handle_client_lookup_ino(MDRequestRef
& mdr
,
234 bool want_parent
, bool want_dentry
);
235 void _lookup_snap_ino(MDRequestRef
& mdr
);
236 void _lookup_ino_2(MDRequestRef
& mdr
, int r
);
237 void handle_client_readdir(MDRequestRef
& mdr
);
238 void handle_client_file_setlock(MDRequestRef
& mdr
);
239 void handle_client_file_readlock(MDRequestRef
& mdr
);
241 void handle_client_setattr(MDRequestRef
& mdr
);
242 void handle_client_setlayout(MDRequestRef
& mdr
);
243 void handle_client_setdirlayout(MDRequestRef
& mdr
);
245 int parse_quota_vxattr(string name
, string value
, quota_info_t
*quota
);
246 void create_quota_realm(CInode
*in
);
247 int parse_layout_vxattr(string name
, string value
, const OSDMap
& osdmap
,
248 file_layout_t
*layout
, bool validate
=true);
249 int check_layout_vxattr(MDRequestRef
& mdr
,
252 file_layout_t
*layout
);
253 void handle_set_vxattr(MDRequestRef
& mdr
, CInode
*cur
,
254 file_layout_t
*dir_layout
,
255 MutationImpl::LockOpVec
& lov
);
256 void handle_remove_vxattr(MDRequestRef
& mdr
, CInode
*cur
,
257 file_layout_t
*dir_layout
,
258 MutationImpl::LockOpVec
& lov
);
259 void handle_client_setxattr(MDRequestRef
& mdr
);
260 void handle_client_removexattr(MDRequestRef
& mdr
);
262 void handle_client_fsync(MDRequestRef
& mdr
);
265 void handle_client_open(MDRequestRef
& mdr
);
266 void handle_client_openc(MDRequestRef
& mdr
); // O_CREAT variant.
267 void do_open_truncate(MDRequestRef
& mdr
, int cmode
); // O_TRUNC variant.
270 void handle_client_mknod(MDRequestRef
& mdr
);
271 void handle_client_mkdir(MDRequestRef
& mdr
);
272 void handle_client_symlink(MDRequestRef
& mdr
);
275 void handle_client_link(MDRequestRef
& mdr
);
276 void _link_local(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
);
277 void _link_local_finish(MDRequestRef
& mdr
, CDentry
*dn
, CInode
*targeti
,
278 version_t
, version_t
, bool);
280 void _link_remote(MDRequestRef
& mdr
, bool inc
, CDentry
*dn
, CInode
*targeti
);
281 void _link_remote_finish(MDRequestRef
& mdr
, bool inc
, CDentry
*dn
, CInode
*targeti
,
284 void handle_slave_link_prep(MDRequestRef
& mdr
);
285 void _logged_slave_link(MDRequestRef
& mdr
, CInode
*targeti
, bool adjust_realm
);
286 void _commit_slave_link(MDRequestRef
& mdr
, int r
, CInode
*targeti
);
287 void _committed_slave(MDRequestRef
& mdr
); // use for rename, too
288 void handle_slave_link_prep_ack(MDRequestRef
& mdr
, const MMDSSlaveRequest::const_ref
&m
);
289 void do_link_rollback(bufferlist
&rbl
, mds_rank_t master
, MDRequestRef
& mdr
);
290 void _link_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
,
291 map
<client_t
,MClientSnap::ref
>& split
);
294 void handle_client_unlink(MDRequestRef
& mdr
);
295 bool _dir_is_nonempty_unlocked(MDRequestRef
& mdr
, CInode
*rmdiri
);
296 bool _dir_is_nonempty(MDRequestRef
& mdr
, CInode
*rmdiri
);
297 void _unlink_local(MDRequestRef
& mdr
, CDentry
*dn
, CDentry
*straydn
);
298 void _unlink_local_finish(MDRequestRef
& mdr
,
299 CDentry
*dn
, CDentry
*straydn
,
301 bool _rmdir_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, vector
<CDentry
*>& trace
, CDentry
*straydn
);
302 void handle_slave_rmdir_prep(MDRequestRef
& mdr
);
303 void _logged_slave_rmdir(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*straydn
);
304 void _commit_slave_rmdir(MDRequestRef
& mdr
, int r
, CDentry
*straydn
);
305 void handle_slave_rmdir_prep_ack(MDRequestRef
& mdr
, const MMDSSlaveRequest::const_ref
&ack
);
306 void do_rmdir_rollback(bufferlist
&rbl
, mds_rank_t master
, MDRequestRef
& mdr
);
307 void _rmdir_rollback_finish(MDRequestRef
& mdr
, metareqid_t reqid
, CDentry
*dn
, CDentry
*straydn
);
310 void handle_client_rename(MDRequestRef
& mdr
);
311 void _rename_finish(MDRequestRef
& mdr
,
312 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
);
314 void handle_client_lssnap(MDRequestRef
& mdr
);
315 void handle_client_mksnap(MDRequestRef
& mdr
);
316 void _mksnap_finish(MDRequestRef
& mdr
, CInode
*diri
, SnapInfo
&info
);
317 void handle_client_rmsnap(MDRequestRef
& mdr
);
318 void _rmsnap_finish(MDRequestRef
& mdr
, CInode
*diri
, snapid_t snapid
);
319 void handle_client_renamesnap(MDRequestRef
& mdr
);
320 void _renamesnap_finish(MDRequestRef
& mdr
, CInode
*diri
, snapid_t snapid
);
324 bool _rename_prepare_witness(MDRequestRef
& mdr
, mds_rank_t who
, set
<mds_rank_t
> &witnesse
,
325 vector
<CDentry
*>& srctrace
, vector
<CDentry
*>& dsttrace
, CDentry
*straydn
);
326 version_t
_rename_prepare_import(MDRequestRef
& mdr
, CDentry
*srcdn
, bufferlist
*client_map_bl
);
327 bool _need_force_journal(CInode
*diri
, bool empty
);
328 void _rename_prepare(MDRequestRef
& mdr
,
329 EMetaBlob
*metablob
, bufferlist
*client_map_bl
,
330 CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
);
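331 /* set not_journaling=true if you're going to discard the results --
332 * this bypasses the asserts to make sure we're journaling the right
333 * things on the right nodes.
 * NOTE(review): the _rename_apply declaration that follows takes no
 * not_journaling parameter, so this comment looks stale -- confirm. */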
334 void _rename_apply(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
);
337 void handle_slave_rename_prep(MDRequestRef
& mdr
);
338 void handle_slave_rename_prep_ack(MDRequestRef
& mdr
, const MMDSSlaveRequest::const_ref
&m
);
339 void handle_slave_rename_notify_ack(MDRequestRef
& mdr
, const MMDSSlaveRequest::const_ref
&m
);
340 void _slave_rename_sessions_flushed(MDRequestRef
& mdr
);
341 void _logged_slave_rename(MDRequestRef
& mdr
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
);
342 void _commit_slave_rename(MDRequestRef
& mdr
, int r
, CDentry
*srcdn
, CDentry
*destdn
, CDentry
*straydn
);
343 void do_rename_rollback(bufferlist
&rbl
, mds_rank_t master
, MDRequestRef
& mdr
, bool finish_mdr
=false);
344 void _rename_rollback_finish(MutationRef
& mut
, MDRequestRef
& mdr
, CDentry
*srcdn
, version_t srcdnpv
,
345 CDentry
*destdn
, CDentry
*staydn
, map
<client_t
,MClientSnap::ref
> splits
[2],
348 void evict_cap_revoke_non_responders();
349 void handle_conf_change(const std::set
<std::string
>& changed
);
352 void reply_client_request(MDRequestRef
& mdr
, const MClientReply::ref
&reply
);
353 void flush_session(Session
*session
, MDSGatherBuilder
*gather
);
355 DecayCounter recall_throttle
;
356 time last_recall_state
;
359 static inline constexpr auto operator|(Server::RecallFlags a
, Server::RecallFlags b
) {
360 using T
= std::underlying_type
<Server::RecallFlags
>::type
;
361 return static_cast<Server::RecallFlags
>(static_cast<T
>(a
) | static_cast<T
>(b
));
363 static inline constexpr auto operator&(Server::RecallFlags a
, Server::RecallFlags b
) {
364 using T
= std::underlying_type
<Server::RecallFlags
>::type
;
365 return static_cast<Server::RecallFlags
>(static_cast<T
>(a
) & static_cast<T
>(b
));
367 static inline std::ostream
& operator<<(std::ostream
& os
, const Server::RecallFlags
& f
) {
368 using T
= std::underlying_type
<Server::RecallFlags
>::type
;
369 return os
<< "0x" << std::hex
<< static_cast<T
>(f
) << std::dec
;
// Logical NOT for Server::RecallFlags: evaluates to true when the underlying
// integer value of f equals 0 (no bits set), false otherwise.
371 static inline constexpr bool operator!(const Server::RecallFlags
& f
) {
// T is the integer type that backs the enum.
372 using T
= std::underlying_type
<Server::RecallFlags
>::type
;
// Compare the raw value against zero of the same integer type.
373 return static_cast<T
>(f
) == static_cast<T
>(0);