// Source: git.proxmox.com Git - ceph.git/blob - ceph/src/mds/Server.h
// Commit: update source to Ceph Pacific 16.2.2
// Path:   [ceph.git] / ceph / src / mds / Server.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #ifndef CEPH_MDS_SERVER_H
16 #define CEPH_MDS_SERVER_H
17
18 #include <string_view>
19
20 #include <common/DecayCounter.h>
21
22 #include "include/common_fwd.h"
23
24 #include "messages/MClientReconnect.h"
25 #include "messages/MClientReply.h"
26 #include "messages/MClientRequest.h"
27 #include "messages/MClientSession.h"
28 #include "messages/MClientSnap.h"
29 #include "messages/MClientReclaim.h"
30 #include "messages/MClientReclaimReply.h"
31 #include "messages/MLock.h"
32
33 #include "CInode.h"
34 #include "MDSRank.h"
35 #include "Mutation.h"
36 #include "MDSContext.h"
37
// Forward declarations: these types appear below only as pointers or
// references, so their full definitions are not needed in this header.
class EMetaBlob;
class EUpdate;
class LogEvent;
class MDLog;
class MetricsHandler;
class OSDMap;
struct SnapInfo;
45
// Identifiers in the "l_mdss_*" family.  Values are consecutive, starting
// at l_mdss_first (1000) and ending at the l_mdss_last sentinel, so the
// order of the entries below is significant.
// NOTE(review): presumably these index the PerfCounters held in
// Server::logger -- confirm against Server::create_logger().
enum {
  l_mdss_first = 1000,

  // dispatch / handle entries
  l_mdss_dispatch_client_request,
  l_mdss_dispatch_peer_request,
  l_mdss_handle_client_request,
  l_mdss_handle_client_session,
  l_mdss_handle_peer_request,

  // one "*_latency" entry per request kind
  l_mdss_req_create_latency,
  l_mdss_req_getattr_latency,
  l_mdss_req_getfilelock_latency,
  l_mdss_req_link_latency,
  l_mdss_req_lookup_latency,
  l_mdss_req_lookuphash_latency,
  l_mdss_req_lookupino_latency,
  l_mdss_req_lookupname_latency,
  l_mdss_req_lookupparent_latency,
  l_mdss_req_lookupsnap_latency,
  l_mdss_req_lssnap_latency,
  l_mdss_req_mkdir_latency,
  l_mdss_req_mknod_latency,
  l_mdss_req_mksnap_latency,
  l_mdss_req_open_latency,
  l_mdss_req_readdir_latency,
  l_mdss_req_rename_latency,
  l_mdss_req_renamesnap_latency,
  l_mdss_req_rmdir_latency,
  l_mdss_req_rmsnap_latency,
  l_mdss_req_rmxattr_latency,
  l_mdss_req_setattr_latency,
  l_mdss_req_setdirlayout_latency,
  l_mdss_req_setfilelock_latency,
  l_mdss_req_setlayout_latency,
  l_mdss_req_setxattr_latency,
  l_mdss_req_symlink_latency,
  l_mdss_req_unlink_latency,

  // client capability entries
  l_mdss_cap_revoke_eviction,
  l_mdss_cap_acquisition_throttle,

  // sentinel: one past the final real entry
  l_mdss_last,
};
85
86 class Server {
87 public:
88 using clock = ceph::coarse_mono_clock;
89 using time = ceph::coarse_mono_time;
90
91 enum class RecallFlags : uint64_t {
92 NONE = 0,
93 STEADY = (1<<0),
94 ENFORCE_MAX = (1<<1),
95 TRIM = (1<<2),
96 ENFORCE_LIVENESS = (1<<3),
97 };
98 explicit Server(MDSRank *m, MetricsHandler *metrics_handler);
99 ~Server() {
100 g_ceph_context->get_perfcounters_collection()->remove(logger);
101 delete logger;
102 delete reconnect_done;
103 }
104
105 void create_logger();
106
107 // message handler
108 void dispatch(const cref_t<Message> &m);
109
110 void handle_osd_map();
111
112 // -- sessions and recovery --
113 bool waiting_for_reconnect(client_t c) const;
114 void dump_reconnect_status(Formatter *f) const;
115
116 time last_recalled() const {
117 return last_recall_state;
118 }
119
120 void handle_client_session(const cref_t<MClientSession> &m);
121 void _session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
122 const interval_set<inodeno_t>& inos_to_free, version_t piv,
123 const interval_set<inodeno_t>& inos_to_purge, LogSegment *ls);
124 version_t prepare_force_open_sessions(map<client_t,entity_inst_t> &cm,
125 map<client_t,client_metadata_t>& cmm,
126 map<client_t,pair<Session*,uint64_t> >& smap);
127 void finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
128 bool dec_import=true);
129 void flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather);
130 void finish_flush_session(Session *session, version_t seq);
131 void terminate_sessions();
132 void find_idle_sessions();
133
134 void kill_session(Session *session, Context *on_safe);
135 size_t apply_blocklist(const std::set<entity_addr_t> &blocklist);
136 void journal_close_session(Session *session, int state, Context *on_safe);
137
138 size_t get_num_pending_reclaim() const { return client_reclaim_gather.size(); }
139 Session *find_session_by_uuid(std::string_view uuid);
140 void reclaim_session(Session *session, const cref_t<MClientReclaim> &m);
141 void finish_reclaim_session(Session *session, const ref_t<MClientReclaimReply> &reply=nullptr);
142 void handle_client_reclaim(const cref_t<MClientReclaim> &m);
143
144 void reconnect_clients(MDSContext *reconnect_done_);
145 void handle_client_reconnect(const cref_t<MClientReconnect> &m);
146 void infer_supported_features(Session *session, client_metadata_t& client_metadata);
147 void update_required_client_features();
148
149 //void process_reconnect_cap(CInode *in, int from, ceph_mds_cap_reconnect& capinfo);
150 void reconnect_gather_finish();
151 void reconnect_tick();
152 void recover_filelocks(CInode *in, bufferlist locks, int64_t client);
153
154 std::pair<bool, uint64_t> recall_client_state(MDSGatherBuilder* gather, RecallFlags=RecallFlags::NONE);
155 void force_clients_readonly();
156
157 // -- requests --
158 void handle_client_request(const cref_t<MClientRequest> &m);
159
160 void journal_and_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn,
161 LogEvent *le, MDSLogContextBase *fin);
162 void submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin,
163 MDRequestRef& mdr, std::string_view event);
164 void dispatch_client_request(MDRequestRef& mdr);
165 void perf_gather_op_latency(const cref_t<MClientRequest> &req, utime_t lat);
166 void early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn);
167 void respond_to_request(MDRequestRef& mdr, int r = 0);
168 void set_trace_dist(const ref_t<MClientReply> &reply, CInode *in, CDentry *dn,
169 MDRequestRef& mdr);
170
171 void handle_peer_request(const cref_t<MMDSPeerRequest> &m);
172 void handle_peer_request_reply(const cref_t<MMDSPeerRequest> &m);
173 void dispatch_peer_request(MDRequestRef& mdr);
174 void handle_peer_auth_pin(MDRequestRef& mdr);
175 void handle_peer_auth_pin_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack);
176
177 // some helpers
178 bool check_fragment_space(MDRequestRef& mdr, CDir *in);
179 bool check_access(MDRequestRef& mdr, CInode *in, unsigned mask);
180 bool _check_access(Session *session, CInode *in, unsigned mask, int caller_uid, int caller_gid, int setattr_uid, int setattr_gid);
181 CDentry *prepare_stray_dentry(MDRequestRef& mdr, CInode *in);
182 CInode* prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
183 const file_layout_t *layout=nullptr);
184 void journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob);
185 void apply_allocated_inos(MDRequestRef& mdr, Session *session);
186
187 CInode* rdlock_path_pin_ref(MDRequestRef& mdr, bool want_auth,
188 bool no_want_auth=false);
189 CDentry* rdlock_path_xlock_dentry(MDRequestRef& mdr, bool create,
190 bool okexist=false, bool want_layout=false);
191 std::pair<CDentry*, CDentry*>
192 rdlock_two_paths_xlock_destdn(MDRequestRef& mdr, bool xlock_srcdn);
193
194 CDir* try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr);
195
196 // requests on existing inodes.
197 void handle_client_getattr(MDRequestRef& mdr, bool is_lookup);
198 void handle_client_lookup_ino(MDRequestRef& mdr,
199 bool want_parent, bool want_dentry);
200 void _lookup_snap_ino(MDRequestRef& mdr);
201 void _lookup_ino_2(MDRequestRef& mdr, int r);
202 void handle_client_readdir(MDRequestRef& mdr);
203 void handle_client_file_setlock(MDRequestRef& mdr);
204 void handle_client_file_readlock(MDRequestRef& mdr);
205
206 bool xlock_policylock(MDRequestRef& mdr, CInode *in,
207 bool want_layout=false, bool xlock_snaplock=false);
208 CInode* try_get_auth_inode(MDRequestRef& mdr, inodeno_t ino);
209 void handle_client_setattr(MDRequestRef& mdr);
210 void handle_client_setlayout(MDRequestRef& mdr);
211 void handle_client_setdirlayout(MDRequestRef& mdr);
212
213 int parse_quota_vxattr(string name, string value, quota_info_t *quota);
214 void create_quota_realm(CInode *in);
215 int parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
216 file_layout_t *layout, bool validate=true);
217 int check_layout_vxattr(MDRequestRef& mdr,
218 string name,
219 string value,
220 file_layout_t *layout);
221 void handle_set_vxattr(MDRequestRef& mdr, CInode *cur);
222 void handle_remove_vxattr(MDRequestRef& mdr, CInode *cur);
223 void handle_client_setxattr(MDRequestRef& mdr);
224 void handle_client_removexattr(MDRequestRef& mdr);
225
226 void handle_client_fsync(MDRequestRef& mdr);
227
228 // open
229 void handle_client_open(MDRequestRef& mdr);
230 void handle_client_openc(MDRequestRef& mdr); // O_CREAT variant.
231 void do_open_truncate(MDRequestRef& mdr, int cmode); // O_TRUNC variant.
232
233 // namespace changes
234 void handle_client_mknod(MDRequestRef& mdr);
235 void handle_client_mkdir(MDRequestRef& mdr);
236 void handle_client_symlink(MDRequestRef& mdr);
237
238 // link
239 void handle_client_link(MDRequestRef& mdr);
240 void _link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti, SnapRealm *target_realm);
241 void _link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
242 version_t, version_t, bool);
243
244 void _link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti);
245 void _link_remote_finish(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti,
246 version_t);
247
248 void handle_peer_link_prep(MDRequestRef& mdr);
249 void _logged_peer_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm);
250 void _commit_peer_link(MDRequestRef& mdr, int r, CInode *targeti);
251 void _committed_peer(MDRequestRef& mdr); // use for rename, too
252 void handle_peer_link_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &m);
253 void do_link_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr);
254 void _link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
255 map<client_t,ref_t<MClientSnap>>& split);
256
257 // unlink
258 void handle_client_unlink(MDRequestRef& mdr);
259 bool _dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *rmdiri);
260 bool _dir_is_nonempty(MDRequestRef& mdr, CInode *rmdiri);
261 void _unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn);
262 void _unlink_local_finish(MDRequestRef& mdr,
263 CDentry *dn, CDentry *straydn,
264 version_t);
265 bool _rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn);
266 void handle_peer_rmdir_prep(MDRequestRef& mdr);
267 void _logged_peer_rmdir(MDRequestRef& mdr, CDentry *srcdn, CDentry *straydn);
268 void _commit_peer_rmdir(MDRequestRef& mdr, int r, CDentry *straydn);
269 void handle_peer_rmdir_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack);
270 void do_rmdir_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr);
271 void _rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn);
272
273 // rename
274 void handle_client_rename(MDRequestRef& mdr);
275 void _rename_finish(MDRequestRef& mdr,
276 CDentry *srcdn, CDentry *destdn, CDentry *straydn);
277
278 void handle_client_lssnap(MDRequestRef& mdr);
279 void handle_client_mksnap(MDRequestRef& mdr);
280 void _mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info);
281 void handle_client_rmsnap(MDRequestRef& mdr);
282 void _rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid);
283 void handle_client_renamesnap(MDRequestRef& mdr);
284 void _renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid);
285
286 // helpers
287 bool _rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesse,
288 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn);
289 version_t _rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl);
290 bool _need_force_journal(CInode *diri, bool empty);
291 void _rename_prepare(MDRequestRef& mdr,
292 EMetaBlob *metablob, bufferlist *client_map_bl,
293 CDentry *srcdn, CDentry *destdn, std::string_view alternate_name,
294 CDentry *straydn);
295 /* set not_journaling=true if you're going to discard the results --
296 * this bypasses the asserts to make sure we're journaling the right
297 * things on the right nodes */
298 void _rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
299
300 // slaving
301 void handle_peer_rename_prep(MDRequestRef& mdr);
302 void handle_peer_rename_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &m);
303 void handle_peer_rename_notify_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &m);
304 void _peer_rename_sessions_flushed(MDRequestRef& mdr);
305 void _logged_peer_rename(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
306 void _commit_peer_rename(MDRequestRef& mdr, int r, CDentry *srcdn, CDentry *destdn, CDentry *straydn);
307 void do_rename_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr, bool finish_mdr=false);
308 void _rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn, version_t srcdnpv,
309 CDentry *destdn, CDentry *staydn, map<client_t,ref_t<MClientSnap>> splits[2],
310 bool finish_mdr);
311
312 void evict_cap_revoke_non_responders();
313 void handle_conf_change(const std::set<std::string>& changed);
314
315 bool terminating_sessions = false;
316
317 set<client_t> client_reclaim_gather;
318
319 private:
320 friend class MDSContinuation;
321 friend class ServerContext;
322 friend class ServerLogContext;
323 friend class Batch_Getattr_Lookup;
324
325 // placeholder for validation handler to store xattr specific
326 // data
327 struct XattrInfo {
328 virtual ~XattrInfo() {
329 }
330 };
331
332 struct MirrorXattrInfo : XattrInfo {
333 std::string cluster_id;
334 std::string fs_id;
335
336 static const std::string MIRROR_INFO_REGEX;
337 static const std::string CLUSTER_ID;
338 static const std::string FS_ID;
339
340 MirrorXattrInfo(std::string_view cluster_id,
341 std::string_view fs_id)
342 : cluster_id(cluster_id),
343 fs_id(fs_id) {
344 }
345 };
346
347 struct XattrOp {
348 int op;
349 std::string xattr_name;
350 const bufferlist &xattr_value;
351 int flags = 0;
352
353 std::unique_ptr<XattrInfo> xinfo;
354
355 XattrOp(int op, std::string_view xattr_name, const bufferlist &xattr_value, int flags)
356 : op(op),
357 xattr_name(xattr_name),
358 xattr_value(xattr_value),
359 flags (flags) {
360 }
361 };
362
363 struct XattrHandler {
364 const std::string xattr_name;
365 const std::string description;
366
367 // basic checks are to be done in this handler. return -errno to
368 // reject xattr request (set or remove), zero to proceed. handlers
369 // may parse xattr value for verification if needed and have an
370 // option to store custom data in XattrOp::xinfo.
371 int (Server::*validate)(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
372 XattrOp *xattr_op);
373
374 // set xattr for an inode in xattr_map
375 void (Server::*setxattr)(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
376 const XattrOp &xattr_op);
377
378 // remove xattr for an inode from xattr_map
379 void (Server::*removexattr)(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
380 const XattrOp &xattr_op);
381 };
382
383 inline static const std::string DEFAULT_HANDLER = "<default>";
384 static const XattrHandler xattr_handlers[];
385
386 const XattrHandler* get_xattr_or_default_handler(std::string_view xattr_name);
387
388 // generic variant to set/remove xattr in/from xattr_map
389 int xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
390 const std::string &xattr_name, int op, int flags);
391 void xattr_set(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name,
392 const bufferlist &xattr_value);
393 void xattr_rm(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name);
394
395 // default xattr handlers
396 int default_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
397 XattrOp *xattr_op);
398 void default_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
399 const XattrOp &xattr_op);
400 void default_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
401 const XattrOp &xattr_op);
402
403 // mirror info xattr handler
404 int parse_mirror_info_xattr(const std::string &name, const std::string &value,
405 std::string &cluster_id, std::string &fs_id);
406 int mirror_info_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
407 XattrOp *xattr_op);
408 void mirror_info_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
409 const XattrOp &xattr_op);
410 void mirror_info_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
411 const XattrOp &xattr_op);
412
413 static bool is_ceph_vxattr(std::string_view xattr_name) {
414 return xattr_name.rfind("ceph.dir.layout", 0) == 0 ||
415 xattr_name.rfind("ceph.file.layout", 0) == 0 ||
416 xattr_name.rfind("ceph.quota", 0) == 0 ||
417 xattr_name == "ceph.dir.subvolume"sv ||
418 xattr_name == "ceph.dir.pin"sv ||
419 xattr_name == "ceph.dir.pin.random"sv ||
420 xattr_name == "ceph.dir.pin.distributed"sv;
421 }
422
423 static bool is_allowed_ceph_xattr(std::string_view xattr_name) {
424 // not a ceph xattr -- allow!
425 if (xattr_name.rfind("ceph.", 0) != 0) {
426 return true;
427 }
428
429 return xattr_name == "ceph.mirror.info";
430 }
431
432 void reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply);
433 void flush_session(Session *session, MDSGatherBuilder& gather);
434
435 MDSRank *mds;
436 MDCache *mdcache;
437 MDLog *mdlog;
438 PerfCounters *logger = nullptr;
439
440 // OSDMap full status, used to generate CEPHFS_ENOSPC on some operations
441 bool is_full = false;
442
443 // State for while in reconnect
444 MDSContext *reconnect_done = nullptr;
445 int failed_reconnects = 0;
446 bool reconnect_evicting = false; // true if I am waiting for evictions to complete
447 // before proceeding to reconnect_gather_finish
448 time reconnect_start = clock::zero();
449 time reconnect_last_seen = clock::zero();
450 set<client_t> client_reconnect_gather; // clients i need a reconnect msg from.
451 set<client_t> client_reconnect_denied; // clients whose reconnect msg have been denied .
452
453 feature_bitset_t supported_features;
454 feature_bitset_t required_client_features;
455
456 bool forward_all_requests_to_auth = false;
457 bool replay_unsafe_with_closed_session = false;
458 double cap_revoke_eviction_timeout = 0;
459 uint64_t max_snaps_per_dir = 100;
460 unsigned delegate_inos_pct = 0;
461
462 DecayCounter recall_throttle;
463 time last_recall_state;
464
465 MetricsHandler *metrics_handler;
466
467 // Cache cap acquisition throttle configs
468 uint64_t max_caps_per_client;
469 uint64_t cap_acquisition_throttle;
470 double max_caps_throttle_ratio;
471 double caps_throttle_retry_request_timeout;
472
473 size_t alternate_name_max = g_conf().get_val<Option::size_t>("mds_alternate_name_max");
474 };
475
476 static inline constexpr auto operator|(Server::RecallFlags a, Server::RecallFlags b) {
477 using T = std::underlying_type<Server::RecallFlags>::type;
478 return static_cast<Server::RecallFlags>(static_cast<T>(a) | static_cast<T>(b));
479 }
480 static inline constexpr auto operator&(Server::RecallFlags a, Server::RecallFlags b) {
481 using T = std::underlying_type<Server::RecallFlags>::type;
482 return static_cast<Server::RecallFlags>(static_cast<T>(a) & static_cast<T>(b));
483 }
484 static inline std::ostream& operator<<(std::ostream& os, const Server::RecallFlags& f) {
485 using T = std::underlying_type<Server::RecallFlags>::type;
486 return os << "0x" << std::hex << static_cast<T>(f) << std::dec;
487 }
488 static inline constexpr bool operator!(const Server::RecallFlags& f) {
489 using T = std::underlying_type<Server::RecallFlags>::type;
490 return static_cast<T>(f) == static_cast<T>(0);
491 }
492 #endif