]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/Mutation.h
import 15.2.4
[ceph.git] / ceph / src / mds / Mutation.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #ifndef CEPH_MDS_MUTATION_H
16 #define CEPH_MDS_MUTATION_H
17
18 #include "include/interval_set.h"
19 #include "include/elist.h"
20 #include "include/filepath.h"
21
22 #include "MDSCacheObject.h"
23 #include "MDSContext.h"
24
25 #include "SimpleLock.h"
26 #include "Capability.h"
27
28 #include "common/TrackedOp.h"
29 #include "messages/MClientRequest.h"
30 #include "messages/MMDSSlaveRequest.h"
31 #include "messages/MClientReply.h"
32
// Forward declarations: these types are only used through pointers or
// references in this header.
class CDentry;
class CDir;
class CInode;
class LogSegment;
class ScatterLock;
class Session;
struct MDLockCache;
struct sr_t;
41
42 struct MutationImpl : public TrackedOp {
43 public:
44 // -- my pins and auth_pins --
45 struct ObjectState {
46 bool pinned = false;
47 bool auth_pinned = false;
48 mds_rank_t remote_auth_pinned = MDS_RANK_NONE;
49 };
50
51 // held locks
52 struct LockOp {
53 enum {
54 RDLOCK = 1,
55 WRLOCK = 2,
56 XLOCK = 4,
57 REMOTE_WRLOCK = 8,
58 STATE_PIN = 16, // no RW after locked, just pin lock state
59 };
60
61 LockOp(SimpleLock *l, unsigned f=0, mds_rank_t t=MDS_RANK_NONE) :
62 lock(l), flags(f), wrlock_target(t) {}
63
64 bool is_rdlock() const { return !!(flags & RDLOCK); }
65 bool is_xlock() const { return !!(flags & XLOCK); }
66 bool is_wrlock() const { return !!(flags & WRLOCK); }
67 void clear_wrlock() const { flags &= ~WRLOCK; }
68 bool is_remote_wrlock() const { return !!(flags & REMOTE_WRLOCK); }
69 void clear_remote_wrlock() const {
70 flags &= ~REMOTE_WRLOCK;
71 wrlock_target = MDS_RANK_NONE;
72 }
73 bool is_state_pin() const { return !!(flags & STATE_PIN); }
74 bool operator<(const LockOp& r) const {
75 return lock < r.lock;
76 }
77
78 SimpleLock* lock;
79 mutable unsigned flags;
80 mutable mds_rank_t wrlock_target;
81 };
82
83 struct LockOpVec : public vector<LockOp> {
84 LockOpVec() {
85 reserve(32);
86 }
87
88 void add_rdlock(SimpleLock *lock) {
89 emplace_back(lock, LockOp::RDLOCK);
90 }
91 void erase_rdlock(SimpleLock *lock);
92 void add_xlock(SimpleLock *lock, int idx=-1) {
93 if (idx >= 0)
94 emplace(cbegin() + idx, lock, LockOp::XLOCK);
95 else
96 emplace_back(lock, LockOp::XLOCK);
97 }
98 void add_wrlock(SimpleLock *lock, int idx=-1) {
99 if (idx >= 0)
100 emplace(cbegin() + idx, lock, LockOp::WRLOCK);
101 else
102 emplace_back(lock, LockOp::WRLOCK);
103 }
104 void add_remote_wrlock(SimpleLock *lock, mds_rank_t rank) {
105 ceph_assert(rank != MDS_RANK_NONE);
106 emplace_back(lock, LockOp::REMOTE_WRLOCK, rank);
107 }
108 void lock_scatter_gather(SimpleLock *lock) {
109 emplace_back(lock, LockOp::WRLOCK | LockOp::STATE_PIN);
110 }
111 void sort_and_merge();
112 };
113
114 using lock_set = set<LockOp>;
115 using lock_iterator = lock_set::iterator;
116
117 // keep our default values synced with MDRequestParam's
118 MutationImpl() : TrackedOp(nullptr, utime_t()) {}
119 MutationImpl(OpTracker *tracker, utime_t initiated,
120 const metareqid_t &ri, __u32 att=0, mds_rank_t slave_to=MDS_RANK_NONE)
121 : TrackedOp(tracker, initiated),
122 reqid(ri), attempt(att),
123 slave_to_mds(slave_to) {}
124 ~MutationImpl() override {
125 ceph_assert(!locking);
126 ceph_assert(!lock_cache);
127 ceph_assert(num_pins == 0);
128 ceph_assert(num_auth_pins == 0);
129 }
130
131 const ObjectState* find_object_state(MDSCacheObject *obj) const {
132 auto it = object_states.find(obj);
133 return it != object_states.end() ? &it->second : nullptr;
134 }
135
136 bool is_any_remote_auth_pin() const { return num_remote_auth_pins > 0; }
137
138 void disable_lock_cache() {
139 lock_cache_disabled = true;
140 }
141
142 lock_iterator emplace_lock(SimpleLock *l, unsigned f=0, mds_rank_t t=MDS_RANK_NONE) {
143 last_locked = l;
144 return locks.emplace(l, f, t).first;
145 }
146
147 bool is_rdlocked(SimpleLock *lock) const;
148 bool is_wrlocked(SimpleLock *lock) const;
149 bool is_xlocked(SimpleLock *lock) const {
150 auto it = locks.find(lock);
151 return it != locks.end() && it->is_xlock();
152 }
153 bool is_remote_wrlocked(SimpleLock *lock) const {
154 auto it = locks.find(lock);
155 return it != locks.end() && it->is_remote_wrlock();
156 }
157 bool is_last_locked(SimpleLock *lock) const {
158 return lock == last_locked;
159 }
160
161 bool is_master() const { return slave_to_mds == MDS_RANK_NONE; }
162 bool is_slave() const { return slave_to_mds != MDS_RANK_NONE; }
163
164 client_t get_client() const {
165 if (reqid.name.is_client())
166 return client_t(reqid.name.num());
167 return -1;
168 }
169
170 void set_mds_stamp(utime_t t) {
171 mds_stamp = t;
172 }
173 utime_t get_mds_stamp() const {
174 return mds_stamp;
175 }
176 void set_op_stamp(utime_t t) {
177 op_stamp = t;
178 }
179 utime_t get_op_stamp() const {
180 if (op_stamp != utime_t())
181 return op_stamp;
182 return get_mds_stamp();
183 }
184
185 // pin items in cache
186 void pin(MDSCacheObject *object);
187 void unpin(MDSCacheObject *object);
188 void set_stickydirs(CInode *in);
189 void put_stickydirs();
190 void drop_pins();
191
192 void start_locking(SimpleLock *lock, int target=-1);
193 void finish_locking(SimpleLock *lock);
194
195 // auth pins
196 bool is_auth_pinned(MDSCacheObject *object) const;
197 void auth_pin(MDSCacheObject *object);
198 void auth_unpin(MDSCacheObject *object);
199 void drop_local_auth_pins();
200 void set_remote_auth_pinned(MDSCacheObject* object, mds_rank_t from);
201 void _clear_remote_auth_pinned(ObjectState& stat);
202
203 void add_projected_inode(CInode *in);
204 void pop_and_dirty_projected_inodes();
205 void add_projected_fnode(CDir *dir);
206 void pop_and_dirty_projected_fnodes();
207 void add_updated_lock(ScatterLock *lock);
208 void add_cow_inode(CInode *in);
209 void add_cow_dentry(CDentry *dn);
210 void apply();
211 void cleanup();
212
213 virtual void print(ostream &out) const {
214 out << "mutation(" << this << ")";
215 }
216
217 virtual void dump(Formatter *f) const {}
218 void _dump_op_descriptor_unlocked(ostream& stream) const override;
219
220 metareqid_t reqid;
221 __u32 attempt = 0; // which attempt for this request
222 LogSegment *ls = nullptr; // the log segment i'm committing to
223
224 // flag mutation as slave
225 mds_rank_t slave_to_mds = MDS_RANK_NONE; // this is a slave request if >= 0.
226
227 ceph::unordered_map<MDSCacheObject*, ObjectState> object_states;
228 int num_pins = 0;
229 int num_auth_pins = 0;
230 int num_remote_auth_pins = 0;
231 // cache pins (so things don't expire)
232 CInode* stickydiri = nullptr;
233
234 lock_set locks; // full ordering
235 MDLockCache* lock_cache = nullptr;
236 bool lock_cache_disabled = false;
237 SimpleLock *last_locked = nullptr;
238 // Lock we are currently trying to acquire. If we give up for some reason,
239 // be sure to eval() this.
240 SimpleLock *locking = nullptr;
241 mds_rank_t locking_target_mds = -1;
242
243 // if this flag is set, do not attempt to acquire further locks.
244 // (useful for wrlock, which may be a moving auth target)
245 enum {
246 SNAP_LOCKED = 1,
247 SNAP2_LOCKED = 2,
248 PATH_LOCKED = 4,
249 ALL_LOCKED = 8,
250 };
251 int locking_state = 0;
252
253 bool committing = false;
254 bool aborted = false;
255 bool killed = false;
256
257 // for applying projected inode changes
258 list<CInode*> projected_inodes;
259 std::vector<CDir*> projected_fnodes;
260 list<ScatterLock*> updated_locks;
261
262 list<CInode*> dirty_cow_inodes;
263 list<pair<CDentry*,version_t> > dirty_cow_dentries;
264
265 private:
266 utime_t mds_stamp; ///< mds-local timestamp (real time)
267 utime_t op_stamp; ///< op timestamp (client provided)
268 };
269
270 /**
271 * MDRequestImpl: state we track for requests we are currently processing.
272 * mostly information about locks held, so that we can drop them all
273 * the request is finished or forwarded. see request_*().
274 */
275 struct MDRequestImpl : public MutationImpl {
276 // TrackedOp stuff
277 typedef boost::intrusive_ptr<MDRequestImpl> Ref;
278
279 // break rarely-used fields into a separately allocated structure
280 // to save memory for most ops
281 struct More {
282 More() {}
283
284 int slave_error = 0;
285 set<mds_rank_t> slaves; // mds nodes that have slave requests to me (implies client_request)
286 set<mds_rank_t> waiting_on_slave; // peers i'm waiting for slavereq replies from.
287
288 // for rename/link/unlink
289 set<mds_rank_t> witnessed; // nodes who have journaled a RenamePrepare
290 map<MDSCacheObject*,version_t> pvmap;
291
292 bool has_journaled_slaves = false;
293 bool slave_update_journaled = false;
294 bool slave_rolling_back = false;
295
296 // for rename
297 set<mds_rank_t> extra_witnesses; // replica list from srcdn auth (rename)
298 mds_rank_t srcdn_auth_mds = MDS_RANK_NONE;
299 bufferlist inode_import;
300 version_t inode_import_v = 0;
301 CInode* rename_inode = nullptr;
302 bool is_freeze_authpin = false;
303 bool is_ambiguous_auth = false;
304 bool is_remote_frozen_authpin = false;
305 bool is_inode_exporter = false;
306
307 map<client_t, pair<Session*, uint64_t> > imported_session_map;
308 map<CInode*, map<client_t,Capability::Export> > cap_imports;
309
310 // for lock/flock
311 bool flock_was_waiting = false;
312
313 // for snaps
314 version_t stid = 0;
315 bufferlist snapidbl;
316
317 sr_t *srci_srnode = nullptr;
318 sr_t *desti_srnode = nullptr;
319
320 // called when slave commits or aborts
321 Context *slave_commit = nullptr;
322 bufferlist rollback_bl;
323
324 MDSContext::vec waiting_for_finish;
325
326 // export & fragment
327 CDir* export_dir = nullptr;
328 dirfrag_t fragment_base;
329
330 // for internal ops doing lookup
331 filepath filepath1;
332 filepath filepath2;
333 } *_more = nullptr;
334
335 // ---------------------------------------------------
336 struct Params {
337 // keep these default values synced to MutationImpl's
338 Params() {}
339 const utime_t& get_recv_stamp() const {
340 return initiated;
341 }
342 const utime_t& get_throttle_stamp() const {
343 return throttled;
344 }
345 const utime_t& get_recv_complete_stamp() const {
346 return all_read;
347 }
348 const utime_t& get_dispatch_stamp() const {
349 return dispatched;
350 }
351 metareqid_t reqid;
352 __u32 attempt = 0;
353 cref_t<MClientRequest> client_req;
354 cref_t<Message> triggering_slave_req;
355 mds_rank_t slave_to = MDS_RANK_NONE;
356 utime_t initiated;
357 utime_t throttled, all_read, dispatched;
358 int internal_op = -1;
359 };
360 MDRequestImpl(const Params* params, OpTracker *tracker) :
361 MutationImpl(tracker, params->initiated,
362 params->reqid, params->attempt, params->slave_to),
363 item_session_request(this), client_request(params->client_req),
364 internal_op(params->internal_op) {}
365 ~MDRequestImpl() override;
366
367 More* more();
368 bool has_more() const;
369 bool has_witnesses();
370 bool slave_did_prepare();
371 bool slave_rolling_back();
372 bool freeze_auth_pin(CInode *inode);
373 void unfreeze_auth_pin(bool clear_inode=false);
374 void set_remote_frozen_auth_pin(CInode *inode);
375 bool can_auth_pin(MDSCacheObject *object);
376 void drop_local_auth_pins();
377 void set_ambiguous_auth(CInode *inode);
378 void clear_ambiguous_auth();
379 const filepath& get_filepath();
380 const filepath& get_filepath2();
381 void set_filepath(const filepath& fp);
382 void set_filepath2(const filepath& fp);
383 bool is_queued_for_replay() const;
384 bool is_batch_op();
385 int compare_paths();
386
387 void print(ostream &out) const override;
388 void dump(Formatter *f) const override;
389
390 cref_t<MClientRequest> release_client_request();
391 void reset_slave_request(const cref_t<MMDSSlaveRequest>& req=nullptr);
392
393 Session *session = nullptr;
394 elist<MDRequestImpl*>::item item_session_request; // if not on list, op is aborted.
395
396 // -- i am a client (master) request
397 cref_t<MClientRequest> client_request; // client request (if any)
398
399 // tree and depth info of path1 and path2
400 inodeno_t dir_root[2] = {0, 0};
401 int dir_depth[2] = {-1, -1};
402 file_layout_t dir_layout;
403 // store up to two sets of dn vectors, inode pointers, for request path1 and path2.
404 vector<CDentry*> dn[2];
405 CInode *in[2] = {};
406 CDentry *straydn = nullptr;
407 snapid_t snapid = CEPH_NOSNAP;
408
409 CInode *tracei = nullptr;
410 CDentry *tracedn = nullptr;
411
412 inodeno_t alloc_ino = 0, used_prealloc_ino = 0;
413 interval_set<inodeno_t> prealloc_inos;
414
415 int snap_caps = 0;
416 int getattr_caps = 0; ///< caps requested by getattr
417 bool no_early_reply = false;
418 bool did_early_reply = false;
419 bool o_trunc = false; ///< request is an O_TRUNC mutation
420 bool has_completed = false; ///< request has already completed
421
422 bufferlist reply_extra_bl;
423
424 // inos we did a embedded cap release on, and may need to eval if we haven't since reissued
425 map<vinodeno_t, ceph_seq_t> cap_releases;
426
427 // -- i am a slave request
428 cref_t<MMDSSlaveRequest> slave_request; // slave request (if one is pending; implies slave == true)
429
430 // -- i am an internal op
431 int internal_op;
432 Context *internal_op_finish = nullptr;
433 void *internal_op_private = nullptr;
434
435 // indicates how may retries of request have been made
436 int retry = 0;
437
438 bool is_batch_head = false;
439
440 // indicator for vxattr osdmap update
441 bool waited_for_osdmap = false;
442
443 std::vector<Ref> batch_reqs;
444 protected:
445 void _dump(Formatter *f) const override;
446 void _dump_op_descriptor_unlocked(ostream& stream) const override;
447 private:
448 mutable ceph::spinlock msg_lock;
449 };
450
451 struct MDSlaveUpdate {
452 MDSlaveUpdate(int oo, bufferlist &rbl) :
453 origop(oo) {
454 rollback.claim(rbl);
455 }
456 ~MDSlaveUpdate() {
457 if (waiter)
458 waiter->complete(0);
459 }
460 int origop;
461 bufferlist rollback;
462 Context *waiter = nullptr;
463 set<CInode*> olddirs;
464 set<CInode*> unlinked;
465 };
466
467 struct MDLockCacheItem {
468 MDLockCache *parent = nullptr;
469 elist<MDLockCacheItem*>::item item_lock;
470 };
471
472 struct MDLockCache : public MutationImpl {
473 using LockItem = MDLockCacheItem;
474
475 struct DirItem {
476 MDLockCache *parent = nullptr;
477 elist<DirItem*>::item item_dir;
478 };
479
480 MDLockCache(Capability *cap, int op) :
481 MutationImpl(), diri(cap->get_inode()), client_cap(cap), opcode(op) {
482 client_cap->lock_caches.push_back(&item_cap_lock_cache);
483 }
484
485 CInode *get_dir_inode() { return diri; }
486 void set_dir_layout(file_layout_t& layout) {
487 dir_layout = layout;
488 }
489 const file_layout_t& get_dir_layout() const {
490 return dir_layout;
491 }
492
493 void attach_locks();
494 void attach_dirfrags(std::vector<CDir*>&& dfv);
495 void detach_locks();
496 void detach_dirfrags();
497
498 CInode *diri;
499 Capability *client_cap;
500 int opcode;
501 file_layout_t dir_layout;
502
503 elist<MDLockCache*>::item item_cap_lock_cache;
504
505 // link myself to locked locks
506 std::unique_ptr<LockItem[]> items_lock;
507
508 // link myself to auth-pinned dirfrags
509 std::unique_ptr<DirItem[]> items_dir;
510 std::vector<CDir*> auth_pinned_dirfrags;
511
512 int ref = 1;
513 bool invalidating = false;
514 };
515
516 typedef boost::intrusive_ptr<MutationImpl> MutationRef;
517 typedef boost::intrusive_ptr<MDRequestImpl> MDRequestRef;
518
519 inline ostream& operator<<(ostream &out, const MutationImpl &mut)
520 {
521 mut.print(out);
522 return out;
523 }
524 #endif