1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #ifndef CEPH_MDS_MUTATION_H
16 #define CEPH_MDS_MUTATION_H
18 #include "include/interval_set.h"
19 #include "include/elist.h"
20 #include "include/filepath.h"
22 #include "MDSCacheObject.h"
23 #include "MDSContext.h"
25 #include "SimpleLock.h"
26 #include "Capability.h"
28 #include "common/TrackedOp.h"
29 #include "messages/MClientRequest.h"
30 #include "messages/MMDSSlaveRequest.h"
31 #include "messages/MClientReply.h"
42 struct MutationImpl
: public TrackedOp
{
44 // -- my pins and auth_pins --
47 bool auth_pinned
= false;
48 mds_rank_t remote_auth_pinned
= MDS_RANK_NONE
;
58 STATE_PIN
= 16, // no RW after locked, just pin lock state
61 LockOp(SimpleLock
*l
, unsigned f
=0, mds_rank_t t
=MDS_RANK_NONE
) :
62 lock(l
), flags(f
), wrlock_target(t
) {}
64 bool is_rdlock() const { return !!(flags
& RDLOCK
); }
65 bool is_xlock() const { return !!(flags
& XLOCK
); }
66 bool is_wrlock() const { return !!(flags
& WRLOCK
); }
67 void clear_wrlock() const { flags
&= ~WRLOCK
; }
68 bool is_remote_wrlock() const { return !!(flags
& REMOTE_WRLOCK
); }
69 void clear_remote_wrlock() const {
70 flags
&= ~REMOTE_WRLOCK
;
71 wrlock_target
= MDS_RANK_NONE
;
73 bool is_state_pin() const { return !!(flags
& STATE_PIN
); }
74 bool operator<(const LockOp
& r
) const {
79 mutable unsigned flags
;
80 mutable mds_rank_t wrlock_target
;
83 struct LockOpVec
: public vector
<LockOp
> {
88 void add_rdlock(SimpleLock
*lock
) {
89 emplace_back(lock
, LockOp::RDLOCK
);
91 void erase_rdlock(SimpleLock
*lock
);
92 void add_xlock(SimpleLock
*lock
, int idx
=-1) {
94 emplace(cbegin() + idx
, lock
, LockOp::XLOCK
);
96 emplace_back(lock
, LockOp::XLOCK
);
98 void add_wrlock(SimpleLock
*lock
, int idx
=-1) {
100 emplace(cbegin() + idx
, lock
, LockOp::WRLOCK
);
102 emplace_back(lock
, LockOp::WRLOCK
);
104 void add_remote_wrlock(SimpleLock
*lock
, mds_rank_t rank
) {
105 ceph_assert(rank
!= MDS_RANK_NONE
);
106 emplace_back(lock
, LockOp::REMOTE_WRLOCK
, rank
);
108 void lock_scatter_gather(SimpleLock
*lock
) {
109 emplace_back(lock
, LockOp::WRLOCK
| LockOp::STATE_PIN
);
111 void sort_and_merge();
114 using lock_set
= set
<LockOp
>;
115 using lock_iterator
= lock_set::iterator
;
117 // keep our default values synced with MDRequestParam's
118 MutationImpl() : TrackedOp(nullptr, utime_t()) {}
119 MutationImpl(OpTracker
*tracker
, utime_t initiated
,
120 const metareqid_t
&ri
, __u32 att
=0, mds_rank_t slave_to
=MDS_RANK_NONE
)
121 : TrackedOp(tracker
, initiated
),
122 reqid(ri
), attempt(att
),
123 slave_to_mds(slave_to
) {}
124 ~MutationImpl() override
{
125 ceph_assert(!locking
);
126 ceph_assert(!lock_cache
);
127 ceph_assert(num_pins
== 0);
128 ceph_assert(num_auth_pins
== 0);
131 const ObjectState
* find_object_state(MDSCacheObject
*obj
) const {
132 auto it
= object_states
.find(obj
);
133 return it
!= object_states
.end() ? &it
->second
: nullptr;
136 bool is_any_remote_auth_pin() const { return num_remote_auth_pins
> 0; }
138 void disable_lock_cache() {
139 lock_cache_disabled
= true;
142 lock_iterator
emplace_lock(SimpleLock
*l
, unsigned f
=0, mds_rank_t t
=MDS_RANK_NONE
) {
144 return locks
.emplace(l
, f
, t
).first
;
147 bool is_rdlocked(SimpleLock
*lock
) const;
148 bool is_wrlocked(SimpleLock
*lock
) const;
149 bool is_xlocked(SimpleLock
*lock
) const {
150 auto it
= locks
.find(lock
);
151 return it
!= locks
.end() && it
->is_xlock();
153 bool is_remote_wrlocked(SimpleLock
*lock
) const {
154 auto it
= locks
.find(lock
);
155 return it
!= locks
.end() && it
->is_remote_wrlock();
157 bool is_last_locked(SimpleLock
*lock
) const {
158 return lock
== last_locked
;
161 bool is_master() const { return slave_to_mds
== MDS_RANK_NONE
; }
162 bool is_slave() const { return slave_to_mds
!= MDS_RANK_NONE
; }
164 client_t
get_client() const {
165 if (reqid
.name
.is_client())
166 return client_t(reqid
.name
.num());
170 void set_mds_stamp(utime_t t
) {
173 utime_t
get_mds_stamp() const {
176 void set_op_stamp(utime_t t
) {
179 utime_t
get_op_stamp() const {
180 if (op_stamp
!= utime_t())
182 return get_mds_stamp();
185 // pin items in cache
186 void pin(MDSCacheObject
*object
);
187 void unpin(MDSCacheObject
*object
);
188 void set_stickydirs(CInode
*in
);
189 void put_stickydirs();
192 void start_locking(SimpleLock
*lock
, int target
=-1);
193 void finish_locking(SimpleLock
*lock
);
196 bool is_auth_pinned(MDSCacheObject
*object
) const;
197 void auth_pin(MDSCacheObject
*object
);
198 void auth_unpin(MDSCacheObject
*object
);
199 void drop_local_auth_pins();
200 void set_remote_auth_pinned(MDSCacheObject
* object
, mds_rank_t from
);
201 void _clear_remote_auth_pinned(ObjectState
& stat
);
203 void add_projected_inode(CInode
*in
);
204 void pop_and_dirty_projected_inodes();
205 void add_projected_fnode(CDir
*dir
);
206 void pop_and_dirty_projected_fnodes();
207 void add_updated_lock(ScatterLock
*lock
);
208 void add_cow_inode(CInode
*in
);
209 void add_cow_dentry(CDentry
*dn
);
213 virtual void print(ostream
&out
) const {
214 out
<< "mutation(" << this << ")";
217 virtual void dump(Formatter
*f
) const {}
218 void _dump_op_descriptor_unlocked(ostream
& stream
) const override
;
221 __u32 attempt
= 0; // which attempt for this request
222 LogSegment
*ls
= nullptr; // the log segment i'm committing to
224 // flag mutation as slave
225 mds_rank_t slave_to_mds
= MDS_RANK_NONE
; // this is a slave request if >= 0.
227 ceph::unordered_map
<MDSCacheObject
*, ObjectState
> object_states
;
229 int num_auth_pins
= 0;
230 int num_remote_auth_pins
= 0;
231 // cache pins (so things don't expire)
232 CInode
* stickydiri
= nullptr;
234 lock_set locks
; // full ordering
235 MDLockCache
* lock_cache
= nullptr;
236 bool lock_cache_disabled
= false;
237 SimpleLock
*last_locked
= nullptr;
238 // Lock we are currently trying to acquire. If we give up for some reason,
239 // be sure to eval() this.
240 SimpleLock
*locking
= nullptr;
241 mds_rank_t locking_target_mds
= -1;
243 // if this flag is set, do not attempt to acquire further locks.
244 // (useful for wrlock, which may be a moving auth target)
251 int locking_state
= 0;
253 bool committing
= false;
254 bool aborted
= false;
257 // for applying projected inode changes
258 list
<CInode
*> projected_inodes
;
259 std::vector
<CDir
*> projected_fnodes
;
260 list
<ScatterLock
*> updated_locks
;
262 list
<CInode
*> dirty_cow_inodes
;
263 list
<pair
<CDentry
*,version_t
> > dirty_cow_dentries
;
266 utime_t mds_stamp
; ///< mds-local timestamp (real time)
267 utime_t op_stamp
; ///< op timestamp (client provided)
271 * MDRequestImpl: state we track for requests we are currently processing.
272 * mostly information about locks held, so that we can drop them all
273 * when the request is finished or forwarded. see request_*().
275 struct MDRequestImpl
: public MutationImpl
{
277 typedef boost::intrusive_ptr
<MDRequestImpl
> Ref
;
279 // break rarely-used fields into a separately allocated structure
280 // to save memory for most ops
285 set
<mds_rank_t
> slaves
; // mds nodes that have slave requests to me (implies client_request)
286 set
<mds_rank_t
> waiting_on_slave
; // peers i'm waiting for slavereq replies from.
288 // for rename/link/unlink
289 set
<mds_rank_t
> witnessed
; // nodes who have journaled a RenamePrepare
290 map
<MDSCacheObject
*,version_t
> pvmap
;
292 bool has_journaled_slaves
= false;
293 bool slave_update_journaled
= false;
294 bool slave_rolling_back
= false;
297 set
<mds_rank_t
> extra_witnesses
; // replica list from srcdn auth (rename)
298 mds_rank_t srcdn_auth_mds
= MDS_RANK_NONE
;
299 bufferlist inode_import
;
300 version_t inode_import_v
= 0;
301 CInode
* rename_inode
= nullptr;
302 bool is_freeze_authpin
= false;
303 bool is_ambiguous_auth
= false;
304 bool is_remote_frozen_authpin
= false;
305 bool is_inode_exporter
= false;
307 map
<client_t
, pair
<Session
*, uint64_t> > imported_session_map
;
308 map
<CInode
*, map
<client_t
,Capability::Export
> > cap_imports
;
311 bool flock_was_waiting
= false;
317 sr_t
*srci_srnode
= nullptr;
318 sr_t
*desti_srnode
= nullptr;
320 // called when slave commits or aborts
321 Context
*slave_commit
= nullptr;
322 bufferlist rollback_bl
;
324 MDSContext::vec waiting_for_finish
;
327 CDir
* export_dir
= nullptr;
328 dirfrag_t fragment_base
;
330 // for internal ops doing lookup
335 // ---------------------------------------------------
337 // keep these default values synced to MutationImpl's
339 const utime_t
& get_recv_stamp() const {
342 const utime_t
& get_throttle_stamp() const {
345 const utime_t
& get_recv_complete_stamp() const {
348 const utime_t
& get_dispatch_stamp() const {
353 cref_t
<MClientRequest
> client_req
;
354 cref_t
<Message
> triggering_slave_req
;
355 mds_rank_t slave_to
= MDS_RANK_NONE
;
357 utime_t throttled
, all_read
, dispatched
;
358 int internal_op
= -1;
360 MDRequestImpl(const Params
* params
, OpTracker
*tracker
) :
361 MutationImpl(tracker
, params
->initiated
,
362 params
->reqid
, params
->attempt
, params
->slave_to
),
363 item_session_request(this), client_request(params
->client_req
),
364 internal_op(params
->internal_op
) {}
365 ~MDRequestImpl() override
;
368 bool has_more() const;
369 bool has_witnesses();
370 bool slave_did_prepare();
371 bool slave_rolling_back();
372 bool freeze_auth_pin(CInode
*inode
);
373 void unfreeze_auth_pin(bool clear_inode
=false);
374 void set_remote_frozen_auth_pin(CInode
*inode
);
375 bool can_auth_pin(MDSCacheObject
*object
);
376 void drop_local_auth_pins();
377 void set_ambiguous_auth(CInode
*inode
);
378 void clear_ambiguous_auth();
379 const filepath
& get_filepath();
380 const filepath
& get_filepath2();
381 void set_filepath(const filepath
& fp
);
382 void set_filepath2(const filepath
& fp
);
383 bool is_queued_for_replay() const;
387 void print(ostream
&out
) const override
;
388 void dump(Formatter
*f
) const override
;
390 cref_t
<MClientRequest
> release_client_request();
391 void reset_slave_request(const cref_t
<MMDSSlaveRequest
>& req
=nullptr);
393 Session
*session
= nullptr;
394 elist
<MDRequestImpl
*>::item item_session_request
; // if not on list, op is aborted.
396 // -- i am a client (master) request
397 cref_t
<MClientRequest
> client_request
; // client request (if any)
399 // tree and depth info of path1 and path2
400 inodeno_t dir_root
[2] = {0, 0};
401 int dir_depth
[2] = {-1, -1};
402 file_layout_t dir_layout
;
403 // store up to two sets of dn vectors, inode pointers, for request path1 and path2.
404 vector
<CDentry
*> dn
[2];
406 CDentry
*straydn
= nullptr;
407 snapid_t snapid
= CEPH_NOSNAP
;
409 CInode
*tracei
= nullptr;
410 CDentry
*tracedn
= nullptr;
412 inodeno_t alloc_ino
= 0, used_prealloc_ino
= 0;
413 interval_set
<inodeno_t
> prealloc_inos
;
416 int getattr_caps
= 0; ///< caps requested by getattr
417 bool no_early_reply
= false;
418 bool did_early_reply
= false;
419 bool o_trunc
= false; ///< request is an O_TRUNC mutation
420 bool has_completed
= false; ///< request has already completed
422 bufferlist reply_extra_bl
;
424 // inos we did an embedded cap release on, and may need to eval if we haven't since reissued
425 map
<vinodeno_t
, ceph_seq_t
> cap_releases
;
427 // -- i am a slave request
428 cref_t
<MMDSSlaveRequest
> slave_request
; // slave request (if one is pending; implies slave == true)
430 // -- i am an internal op
432 Context
*internal_op_finish
= nullptr;
433 void *internal_op_private
= nullptr;
435 // indicates how many retries of request have been made
438 bool is_batch_head
= false;
440 // indicator for vxattr osdmap update
441 bool waited_for_osdmap
= false;
443 std::vector
<Ref
> batch_reqs
;
445 void _dump(Formatter
*f
) const override
;
446 void _dump_op_descriptor_unlocked(ostream
& stream
) const override
;
448 mutable ceph::spinlock msg_lock
;
451 struct MDSlaveUpdate
{
452 MDSlaveUpdate(int oo
, bufferlist
&rbl
, elist
<MDSlaveUpdate
*> &list
) :
456 list
.push_back(&item
);
459 item
.remove_myself();
465 elist
<MDSlaveUpdate
*>::item item
;
466 Context
*waiter
= nullptr;
467 set
<CInode
*> olddirs
;
468 set
<CInode
*> unlinked
;
471 struct MDLockCacheItem
{
472 MDLockCache
*parent
= nullptr;
473 elist
<MDLockCacheItem
*>::item item_lock
;
476 struct MDLockCache
: public MutationImpl
{
477 using LockItem
= MDLockCacheItem
;
480 MDLockCache
*parent
= nullptr;
481 elist
<DirItem
*>::item item_dir
;
484 MDLockCache(Capability
*cap
, int op
) :
485 MutationImpl(), diri(cap
->get_inode()), client_cap(cap
), opcode(op
) {
486 client_cap
->lock_caches
.push_back(&item_cap_lock_cache
);
489 CInode
*get_dir_inode() { return diri
; }
490 void set_dir_layout(file_layout_t
& layout
) {
493 const file_layout_t
& get_dir_layout() const {
498 void attach_dirfrags(std::vector
<CDir
*>&& dfv
);
500 void detach_dirfrags();
503 Capability
*client_cap
;
505 file_layout_t dir_layout
;
507 elist
<MDLockCache
*>::item item_cap_lock_cache
;
509 // link myself to locked locks
510 std::unique_ptr
<LockItem
[]> items_lock
;
512 // link myself to auth-pinned dirfrags
513 std::unique_ptr
<DirItem
[]> items_dir
;
514 std::vector
<CDir
*> auth_pinned_dirfrags
;
517 bool invalidating
= false;
520 typedef boost::intrusive_ptr
<MutationImpl
> MutationRef
;
521 typedef boost::intrusive_ptr
<MDRequestImpl
> MDRequestRef
;
523 inline ostream
& operator<<(ostream
&out
, const MutationImpl
&mut
)