// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */


#include "CDir.h"
#include "CDentry.h"
#include "CInode.h"
#include "common/config.h"
#include "events/EOpen.h"
#include "events/EUpdate.h"
#include "Locker.h"
#include "MDBalancer.h"
#include "MDCache.h"
#include "MDLog.h"
#include "MDSRank.h"
#include "MDSMap.h"
#include "messages/MInodeFileCaps.h"
#include "messages/MMDSPeerRequest.h"
#include "Migrator.h"
#include "msg/Messenger.h"
#include "osdc/Objecter.h"

#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_context g_ceph_context
#define dout_prefix _prefix(_dout, mds)

using namespace std;

static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
  return *_dout << "mds." << mds->get_nodeid() << ".locker ";
}


class LockerContext : public MDSContext {
protected:
  Locker *locker;
  MDSRank *get_mds() override
  {
    return locker->mds;
  }

public:
  explicit LockerContext(Locker *locker_) : locker(locker_) {
    ceph_assert(locker != NULL);
  }
};

class LockerLogContext : public MDSLogContextBase {
protected:
  Locker *locker;
  MDSRank *get_mds() override
  {
    return locker->mds;
  }

public:
  explicit LockerLogContext(Locker *locker_) : locker(locker_) {
    ceph_assert(locker != NULL);
  }
};

Locker::Locker(MDSRank *m, MDCache *c) :
  need_snapflush_inodes(member_offset(CInode, item_caps)), mds(m), mdcache(c) {}


void Locker::dispatch(const cref_t<Message> &m)
{

  switch (m->get_type()) {
    // inter-mds locking
  case MSG_MDS_LOCK:
    handle_lock(ref_cast<MLock>(m));
    break;
    // inter-mds caps
  case MSG_MDS_INODEFILECAPS:
    handle_inode_file_caps(ref_cast<MInodeFileCaps>(m));
    break;
    // client sync
  case CEPH_MSG_CLIENT_CAPS:
    handle_client_caps(ref_cast<MClientCaps>(m));
    break;
  case CEPH_MSG_CLIENT_CAPRELEASE:
    handle_client_cap_release(ref_cast<MClientCapRelease>(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_client_lease(ref_cast<MClientLease>(m));
    break;
  default:
    derr << "locker unknown message " << m->get_type() << dendl;
    ceph_abort_msg("locker unknown message");
  }
}

void Locker::tick()
{
  scatter_tick();
  caps_tick();
}

/*
 * locks vs rejoin
 *
 *
 *
 */

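// Broadcast an MLock message for `lock` to every replica of its parent
// object, skipping replicas that have not reached rejoin when the cluster
// is degraded.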
void Locker::send_lock_message(SimpleLock *lock, int msg)
{
  for (const auto &it : lock->get_parent()->get_replicas()) {
    if (mds->is_cluster_degraded() &&
        mds->mdsmap->get_state(it.first) < MDSMap::STATE_REJOIN)
      continue;
    auto m = make_message<MLock>(lock, msg, mds->get_nodeid());
    mds->send_message_mds(m, it.first);
  }
}

void Locker::send_lock_message(SimpleLock *lock, int msg, const bufferlist &data)
{
  for (const auto &it : lock->get_parent()->get_replicas()) {
    if (mds->is_cluster_degraded() &&
        mds->mdsmap->get_state(it.first) < MDSMap::STATE_REJOIN)
      continue;
    auto m = make_message<MLock>(lock, msg, mds->get_nodeid());
    m->set_data(data);
    mds->send_message_mds(m, it.first);
  }
}

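// Walk from `in` up to its projected root, rdlocking each ancestor's
// snaplock (and, when want_layout is set, the policylock until a layout is
// found). On success the root ino and depth are recorded in the mdr; on
// failure all locks and local auth pins are dropped and the request waits.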
bool Locker::try_rdlock_snap_layout(CInode *in, MDRequestRef& mdr,
                                    int n, bool want_layout)
{
  dout(10) << __func__ << " " << *mdr << " " << *in << dendl;
  // rdlock ancestor snaps
  inodeno_t root;
  int depth = -1;
  bool found_locked = false;
  bool found_layout = false;

  if (want_layout)
    ceph_assert(n == 0);

  client_t client = mdr->get_client();

  CInode *t = in;
  while (true) {
    ++depth;
    if (!found_locked && mdr->is_rdlocked(&t->snaplock))
      found_locked = true;

    if (!found_locked) {
      if (!t->snaplock.can_rdlock(client)) {
        t->snaplock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
        goto failed;
      }
      t->snaplock.get_rdlock();
      mdr->locks.emplace(&t->snaplock, MutationImpl::LockOp::RDLOCK);
      dout(20) << " got rdlock on " << t->snaplock << " " << *t << dendl;
    }
    if (want_layout && !found_layout) {
      if (!mdr->is_rdlocked(&t->policylock)) {
        if (!t->policylock.can_rdlock(client)) {
          t->policylock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryRequest(mdcache, mdr));
          goto failed;
        }
        t->policylock.get_rdlock();
        mdr->locks.emplace(&t->policylock, MutationImpl::LockOp::RDLOCK);
        dout(20) << " got rdlock on " << t->policylock << " " << *t << dendl;
      }
      if (t->get_projected_inode()->has_layout()) {
        mdr->dir_layout = t->get_projected_inode()->layout;
        found_layout = true;
      }
    }
    CDentry* pdn = t->get_projected_parent_dn();
    if (!pdn) {
      root = t->ino();
      break;
    }
    t = pdn->get_dir()->get_inode();
  }

  mdr->dir_root[n] = root;
  mdr->dir_depth[n] = depth;
  return true;

failed:
  dout(10) << __func__ << " failed" << dendl;

  drop_locks(mdr.get(), nullptr);
  mdr->drop_local_auth_pins();
  return false;
}

struct MarkEventOnDestruct {
  MDRequestRef& mdr;
  std::string_view message;
  bool mark_event;
  MarkEventOnDestruct(MDRequestRef& _mdr, std::string_view _message) :
      mdr(_mdr),
      message(_message),
      mark_event(true) {}
  ~MarkEventOnDestruct() {
    if (mark_event)
      mdr->mark_event(message);
  }
};

/* If this function returns false, the mdr has been placed
 * on the appropriate wait list */
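/* Gather every xlock/wrlock/rdlock listed in `lov` for this request:
 * auth-pin all objects that need it (locally, or via
 * MMDSPeerRequest::OP_AUTHPIN for remotely-authoritative objects) before
 * taking any lock, add version locks alongside dentry/inode xlocks, and
 * re-issue any caps that became issuable along the way. */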
bool Locker::acquire_locks(MDRequestRef& mdr,
                           MutationImpl::LockOpVec& lov,
                           CInode *auth_pin_freeze,
                           bool auth_pin_nonblocking)
{
  dout(10) << "acquire_locks " << *mdr << dendl;

  MarkEventOnDestruct marker(mdr, "failed to acquire_locks");

  client_t client = mdr->get_client();

  set<MDSCacheObject*> mustpin;  // items to authpin
  if (auth_pin_freeze)
    mustpin.insert(auth_pin_freeze);

  // xlocks
  for (size_t i = 0; i < lov.size(); ++i) {
    auto& p = lov[i];
    SimpleLock *lock = p.lock;
    MDSCacheObject *object = lock->get_parent();

    if (p.is_xlock()) {
      if ((lock->get_type() == CEPH_LOCK_ISNAP ||
           lock->get_type() == CEPH_LOCK_IPOLICY) &&
          mds->is_cluster_degraded() &&
          mdr->is_leader() &&
          !mdr->is_queued_for_replay()) {
        // waiting for recovering mds, to guarantee replayed requests and mksnap/setlayout
        // get processed in proper order.
        bool wait = false;
        if (object->is_auth()) {
          if (!mdr->is_xlocked(lock)) {
            set<mds_rank_t> ls;
            object->list_replicas(ls);
            for (auto m : ls) {
              if (mds->mdsmap->get_state(m) < MDSMap::STATE_ACTIVE) {
                wait = true;
                break;
              }
            }
          }
        } else {
          // if the lock is the latest locked one, it's possible that peer mds got the lock
          // while there are recovering mds.
          if (!mdr->is_xlocked(lock) || mdr->is_last_locked(lock))
            wait = true;
        }
        if (wait) {
          dout(10) << " must xlock " << *lock << " " << *object
                   << ", waiting for cluster recovered" << dendl;
          mds->locker->drop_locks(mdr.get(), NULL);
          mdr->drop_local_auth_pins();
          mds->wait_for_cluster_recovered(new C_MDS_RetryRequest(mdcache, mdr));
          return false;
        }
      }

      dout(20) << " must xlock " << *lock << " " << *object << dendl;

      mustpin.insert(object);

      // augment xlock with a versionlock?
      if (lock->get_type() == CEPH_LOCK_DN) {
        CDentry *dn = static_cast<CDentry*>(object);
        if (!dn->is_auth())
          continue;
        if (mdr->is_leader()) {
          // leader. wrlock versionlock so we can pipeline dentry updates to journal.
          lov.add_wrlock(&dn->versionlock, i + 1);
        } else {
          // peer. exclusively lock the dentry version (i.e. block other journal updates).
          // this makes rollback safe.
          lov.add_xlock(&dn->versionlock, i + 1);
        }
      }
      if (lock->get_type() >= CEPH_LOCK_IFIRST && lock->get_type() != CEPH_LOCK_IVERSION) {
        // inode version lock?
        CInode *in = static_cast<CInode*>(object);
        if (!in->is_auth())
          continue;
        if (mdr->is_leader()) {
          // leader. wrlock versionlock so we can pipeline inode updates to journal.
          lov.add_wrlock(&in->versionlock, i + 1);
        } else {
          // peer. exclusively lock the inode version (i.e. block other journal updates).
          // this makes rollback safe.
          lov.add_xlock(&in->versionlock, i + 1);
        }
      }
    } else if (p.is_wrlock()) {
      dout(20) << " must wrlock " << *lock << " " << *object << dendl;
      client_t _client = p.is_state_pin() ? lock->get_excl_client() : client;
      if (object->is_auth()) {
        mustpin.insert(object);
      } else if (!object->is_auth() &&
                 !lock->can_wrlock(_client) &&  // we might have to request a scatter
                 !mdr->is_peer()) {             // if we are peer (remote_wrlock), the leader already authpinned
        dout(15) << " will also auth_pin " << *object
                 << " in case we need to request a scatter" << dendl;
        mustpin.insert(object);
      }
    } else if (p.is_remote_wrlock()) {
      dout(20) << " must remote_wrlock on mds." << p.wrlock_target << " "
               << *lock << " " << *object << dendl;
      mustpin.insert(object);
    } else if (p.is_rdlock()) {

      dout(20) << " must rdlock " << *lock << " " << *object << dendl;
      if (object->is_auth()) {
        mustpin.insert(object);
      } else if (!object->is_auth() &&
                 !lock->can_rdlock(client)) {  // we might have to request an rdlock
        dout(15) << " will also auth_pin " << *object
                 << " in case we need to request a rdlock" << dendl;
        mustpin.insert(object);
      }
    } else {
      ceph_assert(0 == "locker unknown lock operation");
    }
  }

  lov.sort_and_merge();

  // AUTH PINS
  map<mds_rank_t, set<MDSCacheObject*> > mustpin_remote;  // mds -> (object set)

  // can i auth pin them all now?
  marker.message = "failed to authpin local pins";
  for (const auto &p : mustpin) {
    MDSCacheObject *object = p;

    dout(10) << " must authpin " << *object << dendl;

    if (mdr->is_auth_pinned(object)) {
      if (object != (MDSCacheObject*)auth_pin_freeze)
        continue;
      if (mdr->more()->is_remote_frozen_authpin) {
        if (mdr->more()->rename_inode == auth_pin_freeze)
          continue;
        // unfreeze auth pin for the wrong inode
        mustpin_remote[mdr->more()->rename_inode->authority().first].size();
      }
    }

    if (!object->is_auth()) {
      if (mdr->lock_cache) { // debug
        ceph_assert(mdr->lock_cache->opcode == CEPH_MDS_OP_UNLINK);
        CDentry *dn = mdr->dn[0].back();
        ceph_assert(dn->get_projected_linkage()->is_remote());
      }

      if (object->is_ambiguous_auth()) {
        // wait
        dout(10) << " ambiguous auth, waiting to authpin " << *object << dendl;
        mdr->disable_lock_cache();
        drop_locks(mdr.get());
        mdr->drop_local_auth_pins();
        marker.message = "waiting for single auth, object is being migrated";
        object->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_MDS_RetryRequest(mdcache, mdr));
        return false;
      }
      mustpin_remote[object->authority().first].insert(object);
      continue;
    }
    int err = 0;
    if (!object->can_auth_pin(&err)) {
      if (mdr->lock_cache) {
        CDir *dir;
        if (CInode *in = dynamic_cast<CInode*>(object)) {
          ceph_assert(!in->is_frozen_inode() && !in->is_frozen_auth_pin());
          dir = in->get_projected_parent_dir();
        } else if (CDentry *dn = dynamic_cast<CDentry*>(object)) {
          dir = dn->get_dir();
        } else {
          ceph_assert(0 == "unknown type of lock parent");
        }
        if (dir->get_inode() == mdr->lock_cache->get_dir_inode()) {
          // forcibly auth pin if there is lock cache on parent dir
          continue;
        }

        { // debug
          ceph_assert(mdr->lock_cache->opcode == CEPH_MDS_OP_UNLINK);
          CDentry *dn = mdr->dn[0].back();
          ceph_assert(dn->get_projected_linkage()->is_remote());
        }
      }

      // wait
      mdr->disable_lock_cache();
      drop_locks(mdr.get());
      mdr->drop_local_auth_pins();
      if (auth_pin_nonblocking) {
        dout(10) << " can't auth_pin (freezing?) " << *object << ", nonblocking" << dendl;
        mdr->aborted = true;
        return false;
      }
      if (err == MDSCacheObject::ERR_EXPORTING_TREE) {
        marker.message = "failed to authpin, subtree is being exported";
      } else if (err == MDSCacheObject::ERR_FRAGMENTING_DIR) {
        marker.message = "failed to authpin, dir is being fragmented";
      } else if (err == MDSCacheObject::ERR_EXPORTING_INODE) {
        marker.message = "failed to authpin, inode is being exported";
      }
      dout(10) << " can't auth_pin (freezing?), waiting to authpin " << *object << dendl;
      object->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));

      if (mdr->is_any_remote_auth_pin())
        notify_freeze_waiter(object);

      return false;
    }
  }

  // ok, grab local auth pins
  for (const auto& p : mustpin) {
    MDSCacheObject *object = p;
    if (mdr->is_auth_pinned(object)) {
      dout(10) << " already auth_pinned " << *object << dendl;
    } else if (object->is_auth()) {
      dout(10) << " auth_pinning " << *object << dendl;
      mdr->auth_pin(object);
    }
  }

  // request remote auth_pins
  if (!mustpin_remote.empty()) {
    marker.message = "requesting remote authpins";
    for (const auto& p : mdr->object_states) {
      if (p.second.remote_auth_pinned == MDS_RANK_NONE)
        continue;
      ceph_assert(p.second.remote_auth_pinned == p.first->authority().first);
      auto q = mustpin_remote.find(p.second.remote_auth_pinned);
      if (q != mustpin_remote.end())
        q->second.insert(p.first);
    }

    for (auto& p : mustpin_remote) {
      dout(10) << "requesting remote auth_pins from mds." << p.first << dendl;

      // wait for active auth
      if (mds->is_cluster_degraded() &&
          !mds->mdsmap->is_clientreplay_or_active_or_stopping(p.first)) {
        dout(10) << " mds." << p.first << " is not active" << dendl;
        if (mdr->more()->waiting_on_peer.empty())
          mds->wait_for_active_peer(p.first, new C_MDS_RetryRequest(mdcache, mdr));
        return false;
      }

      auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt,
                                               MMDSPeerRequest::OP_AUTHPIN);
      for (auto& o : p.second) {
        dout(10) << " req remote auth_pin of " << *o << dendl;
        MDSCacheObjectInfo info;
        o->set_object_info(info);
        req->get_authpins().push_back(info);
        if (o == auth_pin_freeze)
          o->set_object_info(req->get_authpin_freeze());
        mdr->pin(o);
      }
      if (auth_pin_nonblocking)
        req->mark_nonblocking();
      else if (!mdr->locks.empty())
        req->mark_notify_blocking();

      mds->send_message_mds(req, p.first);

      // put in waiting list
      auto ret = mdr->more()->waiting_on_peer.insert(p.first);
      ceph_assert(ret.second);
    }
    return false;
  }

  // caps i'll need to issue
  set<CInode*> issue_set;
  bool result = false;

  // acquire locks.
  // make sure they match currently acquired locks.
  for (const auto& p : lov) {
    auto lock = p.lock;
    if (p.is_xlock()) {
      if (mdr->is_xlocked(lock)) {
        dout(10) << " already xlocked " << *lock << " " << *lock->get_parent() << dendl;
        continue;
      }
      if (mdr->locking && lock != mdr->locking)
        cancel_locking(mdr.get(), &issue_set);
      if (!xlock_start(lock, mdr)) {
        marker.message = "failed to xlock, waiting";
        goto out;
      }
      dout(10) << " got xlock on " << *lock << " " << *lock->get_parent() << dendl;
    } else if (p.is_wrlock() || p.is_remote_wrlock()) {
      auto it = mdr->locks.find(lock);
      if (p.is_remote_wrlock()) {
        if (it != mdr->locks.end() && it->is_remote_wrlock()) {
          dout(10) << " already remote_wrlocked " << *lock << " " << *lock->get_parent() << dendl;
        } else {
          if (mdr->locking && lock != mdr->locking)
            cancel_locking(mdr.get(), &issue_set);
          marker.message = "waiting for remote wrlocks";
          remote_wrlock_start(lock, p.wrlock_target, mdr);
          goto out;
        }
      }
      if (p.is_wrlock()) {
        if (it != mdr->locks.end() && it->is_wrlock()) {
          dout(10) << " already wrlocked " << *lock << " " << *lock->get_parent() << dendl;
          continue;
        }
        client_t _client = p.is_state_pin() ? lock->get_excl_client() : client;
        if (p.is_remote_wrlock()) {
          // nowait if we have already gotten remote wrlock
          if (!wrlock_try(lock, mdr, _client)) {
            marker.message = "failed to wrlock, dropping remote wrlock and waiting";
            // can't take the wrlock because the scatter lock is gathering. need to
            // release the remote wrlock, so that the gathering process can finish.
            ceph_assert(it != mdr->locks.end());
            remote_wrlock_finish(it, mdr.get());
            remote_wrlock_start(lock, p.wrlock_target, mdr);
            goto out;
          }
        } else {
          if (!wrlock_start(p, mdr)) {
            ceph_assert(!p.is_remote_wrlock());
            marker.message = "failed to wrlock, waiting";
            goto out;
          }
        }
        dout(10) << " got wrlock on " << *lock << " " << *lock->get_parent() << dendl;
      }
    } else {
      if (mdr->is_rdlocked(lock)) {
        dout(10) << " already rdlocked " << *lock << " " << *lock->get_parent() << dendl;
        continue;
      }

      ceph_assert(mdr->is_leader());
      if (lock->needs_recover()) {
        if (mds->is_cluster_degraded()) {
          if (!mdr->is_queued_for_replay()) {
            // see comments in SimpleLock::set_state_rejoin() and
            // ScatterLock::encode_state_for_rejoin()
            drop_locks(mdr.get());
            mds->wait_for_cluster_recovered(new C_MDS_RetryRequest(mdcache, mdr));
            dout(10) << " rejoin recovering " << *lock << " " << *lock->get_parent()
                     << ", waiting for cluster recovered" << dendl;
            marker.message = "rejoin recovering lock, waiting for cluster recovered";
            return false;
          }
        } else {
          lock->clear_need_recover();
        }
      }

      if (!rdlock_start(lock, mdr)) {
        marker.message = "failed to rdlock, waiting";
        goto out;
      }
      dout(10) << " got rdlock on " << *lock << " " << *lock->get_parent() << dendl;
    }
  }

  mdr->set_mds_stamp(ceph_clock_now());
  result = true;
  marker.message = "acquired locks";

 out:
  issue_caps_set(issue_set);
  return result;
}

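// An object could not be auth-pinned because a containing dir or subtree is
// freezing; bump the relevant freeze-waiter counter so fragmentation or
// export can account for the blocked remote request.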
void Locker::notify_freeze_waiter(MDSCacheObject *o)
{
  CDir *dir = NULL;
  if (CInode *in = dynamic_cast<CInode*>(o)) {
    if (!in->is_root())
      dir = in->get_parent_dir();
  } else if (CDentry *dn = dynamic_cast<CDentry*>(o)) {
    dir = dn->get_dir();
  } else {
    dir = dynamic_cast<CDir*>(o);
    ceph_assert(dir);
  }
  if (dir) {
    if (dir->is_freezing_dir())
      mdcache->fragment_freeze_inc_num_waiters(dir);
    if (dir->is_freezing_tree()) {
      while (!dir->is_freezing_tree_root())
        dir = dir->get_parent_dir();
      mdcache->migrator->export_freeze_inc_num_waiters(dir);
    }
  }
}

void Locker::set_xlocks_done(MutationImpl *mut, bool skip_dentry)
{
  for (const auto &p : mut->locks) {
    if (!p.is_xlock())
      continue;
    MDSCacheObject *obj = p.lock->get_parent();
    ceph_assert(obj->is_auth());
    if (skip_dentry &&
        (p.lock->get_type() == CEPH_LOCK_DN || p.lock->get_type() == CEPH_LOCK_DVERSION))
      continue;
    dout(10) << "set_xlocks_done on " << *p.lock << " " << *obj << dendl;
    p.lock->set_xlock_done();
  }
}

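// Release every lock held by `mut` (rdlocks only when drop_rdlocks is set),
// collect inodes whose caps may now be issuable in *pneed_issue, and tell
// peer MDSs holding remote locks for us to drop them via OP_DROPLOCKS.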
void Locker::_drop_locks(MutationImpl *mut, set<CInode*> *pneed_issue,
                         bool drop_rdlocks)
{
  set<mds_rank_t> peers;

  for (auto it = mut->locks.begin(); it != mut->locks.end(); ) {
    SimpleLock *lock = it->lock;
    MDSCacheObject *obj = lock->get_parent();

    if (it->is_xlock()) {
      if (obj->is_auth()) {
        bool ni = false;
        xlock_finish(it++, mut, &ni);
        if (ni)
          pneed_issue->insert(static_cast<CInode*>(obj));
      } else {
        ceph_assert(lock->get_sm()->can_remote_xlock);
        peers.insert(obj->authority().first);
        lock->put_xlock();
        mut->locks.erase(it++);
      }
    } else if (it->is_wrlock() || it->is_remote_wrlock()) {
      if (it->is_remote_wrlock()) {
        peers.insert(it->wrlock_target);
        it->clear_remote_wrlock();
      }
      if (it->is_wrlock()) {
        bool ni = false;
        wrlock_finish(it++, mut, &ni);
        if (ni)
          pneed_issue->insert(static_cast<CInode*>(obj));
      } else {
        mut->locks.erase(it++);
      }
    } else if (drop_rdlocks && it->is_rdlock()) {
      bool ni = false;
      rdlock_finish(it++, mut, &ni);
      if (ni)
        pneed_issue->insert(static_cast<CInode*>(obj));
    } else {
      ++it;
    }
  }

  if (drop_rdlocks) {
    if (mut->lock_cache) {
      put_lock_cache(mut->lock_cache);
      mut->lock_cache = nullptr;
    }
  }

  for (set<mds_rank_t>::iterator p = peers.begin(); p != peers.end(); ++p) {
    if (!mds->is_cluster_degraded() ||
        mds->mdsmap->get_state(*p) >= MDSMap::STATE_REJOIN) {
      dout(10) << "_drop_non_rdlocks dropping remote locks on mds." << *p << dendl;
      auto peerreq = make_message<MMDSPeerRequest>(mut->reqid, mut->attempt,
                                                   MMDSPeerRequest::OP_DROPLOCKS);
      mds->send_message_mds(peerreq, *p);
    }
  }
}

void Locker::cancel_locking(MutationImpl *mut, set<CInode*> *pneed_issue)
{
  SimpleLock *lock = mut->locking;
  ceph_assert(lock);
  dout(10) << "cancel_locking " << *lock << " on " << *mut << dendl;

  if (lock->get_parent()->is_auth()) {
    bool need_issue = false;
    if (lock->get_state() == LOCK_PREXLOCK) {
      _finish_xlock(lock, -1, &need_issue);
    } else if (lock->get_state() == LOCK_LOCK_XLOCK) {
      lock->set_state(LOCK_XLOCKDONE);
      eval_gather(lock, true, &need_issue);
    }
    if (need_issue)
      pneed_issue->insert(static_cast<CInode *>(lock->get_parent()));
  }
  mut->finish_locking(lock);
}

void Locker::drop_locks(MutationImpl *mut, set<CInode*> *pneed_issue)
{
  // leftover locks
  set<CInode*> my_need_issue;
  if (!pneed_issue)
    pneed_issue = &my_need_issue;

  if (mut->locking)
    cancel_locking(mut, pneed_issue);
  _drop_locks(mut, pneed_issue, true);

  if (pneed_issue == &my_need_issue)
    issue_caps_set(*pneed_issue);
  mut->locking_state = 0;
}

void Locker::drop_non_rdlocks(MutationImpl *mut, set<CInode*> *pneed_issue)
{
  set<CInode*> my_need_issue;
  if (!pneed_issue)
    pneed_issue = &my_need_issue;

  _drop_locks(mut, pneed_issue, false);

  if (pneed_issue == &my_need_issue)
    issue_caps_set(*pneed_issue);
}

void Locker::drop_rdlocks_for_early_reply(MutationImpl *mut)
{
  set<CInode*> need_issue;

  for (auto it = mut->locks.begin(); it != mut->locks.end(); ) {
    if (!it->is_rdlock()) {
      ++it;
      continue;
    }
    SimpleLock *lock = it->lock;
    // make later mksnap/setlayout (at other mds) wait for this unsafe request
    if (lock->get_type() == CEPH_LOCK_ISNAP ||
        lock->get_type() == CEPH_LOCK_IPOLICY) {
      ++it;
      continue;
    }
    bool ni = false;
    rdlock_finish(it++, mut, &ni);
    if (ni)
      need_issue.insert(static_cast<CInode*>(lock->get_parent()));
  }

  issue_caps_set(need_issue);
}

void Locker::drop_locks_for_fragment_unfreeze(MutationImpl *mut)
{
  set<CInode*> need_issue;

  for (auto it = mut->locks.begin(); it != mut->locks.end(); ) {
    SimpleLock *lock = it->lock;
    if (lock->get_type() == CEPH_LOCK_IDFT) {
      ++it;
      continue;
    }
    bool ni = false;
    wrlock_finish(it++, mut, &ni);
    if (ni)
      need_issue.insert(static_cast<CInode*>(lock->get_parent()));
  }
  issue_caps_set(need_issue);
}

class C_MDL_DropCache : public LockerContext {
  MDLockCache *lock_cache;
public:
  C_MDL_DropCache(Locker *l, MDLockCache *lc) :
    LockerContext(l), lock_cache(lc) { }
  void finish(int r) override {
    locker->drop_locks(lock_cache);
    lock_cache->cleanup();
    delete lock_cache;
  }
};

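// Drop a reference on a lock cache. When the last ref goes, the cache must
// already be invalidating: detach its locks, re-enable frozen-inode
// accounting on the dir inode's own dirfrags, and queue the final teardown
// as a C_MDL_DropCache waiter.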
void Locker::put_lock_cache(MDLockCache* lock_cache)
{
  ceph_assert(lock_cache->ref > 0);
  if (--lock_cache->ref > 0)
    return;

  ceph_assert(lock_cache->invalidating);

  lock_cache->detach_locks();

  CInode *diri = lock_cache->get_dir_inode();
  for (auto dir : lock_cache->auth_pinned_dirfrags) {
    if (dir->get_inode() != diri)
      continue;
    dir->enable_frozen_inode();
  }

  mds->queue_waiter(new C_MDL_DropCache(this, lock_cache));
}

int Locker::get_cap_bit_for_lock_cache(int op)
{
  switch(op) {
  case CEPH_MDS_OP_CREATE:
    return CEPH_CAP_DIR_CREATE;
  case CEPH_MDS_OP_UNLINK:
    return CEPH_CAP_DIR_UNLINK;
  default:
    ceph_assert(0 == "unsupported operation");
    return 0;
  }
}

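// Begin tearing down a lock cache: mark it invalidating, detach its
// dirfrags, and revoke the corresponding CEPH_CAP_DIR_* bit from the client
// cap; the final put happens once that cap bit is no longer issued.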
void Locker::invalidate_lock_cache(MDLockCache *lock_cache)
{
  ceph_assert(lock_cache->item_cap_lock_cache.is_on_list());
  if (lock_cache->invalidating) {
    ceph_assert(!lock_cache->client_cap);
  } else {
    lock_cache->invalidating = true;
    lock_cache->detach_dirfrags();
  }

  Capability *cap = lock_cache->client_cap;
  if (cap) {
    int cap_bit = get_cap_bit_for_lock_cache(lock_cache->opcode);
    cap->clear_lock_cache_allowed(cap_bit);
    if (cap->issued() & cap_bit)
      issue_caps(lock_cache->get_dir_inode(), cap);
    else
      cap = nullptr;
  }

  if (!cap) {
    lock_cache->item_cap_lock_cache.remove_myself();
    put_lock_cache(lock_cache);
  }
}

void Locker::eval_lock_caches(Capability *cap)
{
  for (auto p = cap->lock_caches.begin(); !p.end(); ) {
    MDLockCache *lock_cache = *p;
    ++p;
    if (!lock_cache->invalidating)
      continue;
    int cap_bit = get_cap_bit_for_lock_cache(lock_cache->opcode);
    if (!(cap->issued() & cap_bit)) {
      lock_cache->item_cap_lock_cache.remove_myself();
      put_lock_cache(lock_cache);
    }
  }
}

// ask lock caches to release auth pins
void Locker::invalidate_lock_caches(CDir *dir)
{
  dout(10) << "invalidate_lock_caches on " << *dir << dendl;
  auto &lock_caches = dir->lock_caches_with_auth_pins;
  while (!lock_caches.empty()) {
    invalidate_lock_cache(lock_caches.front()->parent);
  }
}

// ask lock caches to release locks
void Locker::invalidate_lock_caches(SimpleLock *lock)
{
  dout(10) << "invalidate_lock_caches " << *lock << " on " << *lock->get_parent() << dendl;
  if (lock->is_cached()) {
    auto&& lock_caches = lock->get_active_caches();
    for (auto& lc : lock_caches)
      invalidate_lock_cache(lc);
  }
}

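// Try to build a lock cache for this request's opcode (create/unlink, per
// get_cap_bit_for_lock_cache) on directory inode `diri`. This only happens
// when we are auth, there are no peer requests, the client holds a cap on
// diri, and every relevant ancestor lock, auth pin and dirfrag is stable
// and pinnable; the cache then takes over the request's matching auth pins
// and rd/wrlocks so later requests of the same opcode can reuse them.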
void Locker::create_lock_cache(MDRequestRef& mdr, CInode *diri, file_layout_t *dir_layout)
{
  if (mdr->lock_cache)
    return;

  client_t client = mdr->get_client();
  int opcode = mdr->client_request->get_op();
  dout(10) << "create_lock_cache for client." << client << "/" << ceph_mds_op_name(opcode) << " on " << *diri << dendl;

  if (!diri->is_auth()) {
    dout(10) << " dir inode is not auth, noop" << dendl;
    return;
  }

  if (mdr->has_more() && !mdr->more()->peers.empty()) {
    dout(10) << " there are peers requests for " << *mdr << ", noop" << dendl;
    return;
  }

  Capability *cap = diri->get_client_cap(client);
  if (!cap) {
    dout(10) << " there is no cap for client." << client << ", noop" << dendl;
    return;
  }

  for (auto p = cap->lock_caches.begin(); !p.end(); ++p) {
    if ((*p)->opcode == opcode) {
      dout(10) << " lock cache already exists for " << ceph_mds_op_name(opcode) << ", noop" << dendl;
      return;
    }
  }

  set<MDSCacheObject*> ancestors;
  for (CInode *in = diri; ; ) {
    CDentry *pdn = in->get_projected_parent_dn();
    if (!pdn)
      break;
    // ancestors.insert(pdn);
    in = pdn->get_dir()->get_inode();
    ancestors.insert(in);
  }

  for (auto& p : mdr->object_states) {
    if (p.first != diri && !ancestors.count(p.first))
      continue;
    auto& stat = p.second;
    if (stat.auth_pinned) {
      if (!p.first->can_auth_pin()) {
        dout(10) << " can't auth_pin(freezing?) lock parent " << *p.first << ", noop" << dendl;
        return;
      }
      if (CInode *in = dynamic_cast<CInode*>(p.first); in->is_parent_projected()) {
        CDir *dir = in->get_projected_parent_dir();
        if (!dir->can_auth_pin()) {
          dout(10) << " can't auth_pin(!auth|freezing?) dirfrag " << *dir << ", noop" << dendl;
          return;
        }
      }
    }
  }

  std::vector<CDir*> dfv;
  dfv.reserve(diri->get_num_dirfrags());

  diri->get_dirfrags(dfv);
  for (auto dir : dfv) {
    if (!dir->is_auth() || !dir->can_auth_pin()) {
      dout(10) << " can't auth_pin(!auth|freezing?) dirfrag " << *dir << ", noop" << dendl;
      return;
    }
    if (dir->is_any_freezing_or_frozen_inode()) {
      dout(10) << " there is freezing/frozen inode in " << *dir << ", noop" << dendl;
      return;
    }
  }

  for (auto& p : mdr->locks) {
    MDSCacheObject *obj = p.lock->get_parent();
    if (obj != diri && !ancestors.count(obj))
      continue;
    if (!p.lock->is_stable()) {
      dout(10) << " unstable " << *p.lock << " on " << *obj << ", noop" << dendl;
      return;
    }
  }

  auto lock_cache = new MDLockCache(cap, opcode);
  if (dir_layout)
    lock_cache->set_dir_layout(*dir_layout);
  cap->set_lock_cache_allowed(get_cap_bit_for_lock_cache(opcode));

  for (auto dir : dfv) {
    // prevent subtree migration
    lock_cache->auth_pin(dir);
    // prevent frozen inode
    dir->disable_frozen_inode();
  }

  for (auto& p : mdr->object_states) {
    if (p.first != diri && !ancestors.count(p.first))
      continue;
    auto& stat = p.second;
    if (stat.auth_pinned)
      lock_cache->auth_pin(p.first);
    else
      lock_cache->pin(p.first);

    if (CInode *in = dynamic_cast<CInode*>(p.first)) {
      CDentry *pdn = in->get_projected_parent_dn();
      if (pdn)
        dfv.push_back(pdn->get_dir());
    } else if (CDentry *dn = dynamic_cast<CDentry*>(p.first)) {
      dfv.push_back(dn->get_dir());
    } else {
      ceph_assert(0 == "unknown type of lock parent");
    }
  }
  lock_cache->attach_dirfrags(std::move(dfv));

  for (auto it = mdr->locks.begin(); it != mdr->locks.end(); ) {
    MDSCacheObject *obj = it->lock->get_parent();
    if (obj != diri && !ancestors.count(obj)) {
      ++it;
      continue;
    }
    unsigned lock_flag = 0;
    if (it->is_wrlock()) {
      // skip wrlocks that were added by MDCache::predirty_journal_parent()
      if (obj == diri)
        lock_flag = MutationImpl::LockOp::WRLOCK;
    } else {
      ceph_assert(it->is_rdlock());
      lock_flag = MutationImpl::LockOp::RDLOCK;
    }
    if (lock_flag) {
      lock_cache->emplace_lock(it->lock, lock_flag);
      mdr->locks.erase(it++);
    } else {
      ++it;
    }
  }
  lock_cache->attach_locks();

  lock_cache->ref++;
  mdr->lock_cache = lock_cache;
}

bool Locker::find_and_attach_lock_cache(MDRequestRef& mdr, CInode *diri)
{
  if (mdr->lock_cache)
    return true;

  Capability *cap = diri->get_client_cap(mdr->get_client());
  if (!cap)
    return false;

  int opcode = mdr->client_request->get_op();
  for (auto p = cap->lock_caches.begin(); !p.end(); ++p) {
    MDLockCache *lock_cache = *p;
    if (lock_cache->opcode == opcode) {
      dout(10) << "found lock cache for " << ceph_mds_op_name(opcode) << " on " << *diri << dendl;
      mdr->lock_cache = lock_cache;
      mdr->lock_cache->ref++;
      return true;
    }
  }
  return false;
}

// generics

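// Re-evaluate an unstable lock after a gather step (caps revoked, rd/wr/
// xlocks released, replica acks received). If the gather has finished,
// move the lock to its next state, send the acks or state updates the
// transition requires, wake waiters, and re-issue caps where needed.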
void Locker::eval_gather(SimpleLock *lock, bool first, bool *pneed_issue, MDSContext::vec *pfinishers)
{
  dout(10) << "eval_gather " << *lock << " on " << *lock->get_parent() << dendl;
  ceph_assert(!lock->is_stable());

  int next = lock->get_next_state();

  CInode *in = 0;
  bool caps = lock->get_cap_shift();
  if (lock->get_type() != CEPH_LOCK_DN)
    in = static_cast<CInode *>(lock->get_parent());

  bool need_issue = false;

  int loner_issued = 0, other_issued = 0, xlocker_issued = 0;
  ceph_assert(!caps || in != NULL);
  if (caps && in->is_head()) {
    in->get_caps_issued(&loner_issued, &other_issued, &xlocker_issued,
                        lock->get_cap_shift(), lock->get_cap_mask());
    dout(10) << " next state is " << lock->get_state_name(next)
             << " issued/allows loner " << gcap_string(loner_issued)
             << "/" << gcap_string(lock->gcaps_allowed(CAP_LONER, next))
             << " xlocker " << gcap_string(xlocker_issued)
             << "/" << gcap_string(lock->gcaps_allowed(CAP_XLOCKER, next))
             << " other " << gcap_string(other_issued)
             << "/" << gcap_string(lock->gcaps_allowed(CAP_ANY, next))
             << dendl;

    if (first && ((~lock->gcaps_allowed(CAP_ANY, next) & other_issued) ||
                  (~lock->gcaps_allowed(CAP_LONER, next) & loner_issued) ||
                  (~lock->gcaps_allowed(CAP_XLOCKER, next) & xlocker_issued)))
      need_issue = true;
  }

#define IS_TRUE_AND_LT_AUTH(x, auth) (x && ((auth && x <= AUTH) || (!auth && x < AUTH)))
  bool auth = lock->get_parent()->is_auth();
  if (!lock->is_gathering() &&
      (IS_TRUE_AND_LT_AUTH(lock->get_sm()->states[next].can_rdlock, auth) || !lock->is_rdlocked()) &&
      (IS_TRUE_AND_LT_AUTH(lock->get_sm()->states[next].can_wrlock, auth) || !lock->is_wrlocked()) &&
      (IS_TRUE_AND_LT_AUTH(lock->get_sm()->states[next].can_xlock, auth) || !lock->is_xlocked()) &&
      (IS_TRUE_AND_LT_AUTH(lock->get_sm()->states[next].can_lease, auth) || !lock->is_leased()) &&
      !(lock->get_parent()->is_auth() && lock->is_flushing()) &&  // i.e. wait for scatter_writebehind!
      (!caps || ((~lock->gcaps_allowed(CAP_ANY, next) & other_issued) == 0 &&
                 (~lock->gcaps_allowed(CAP_LONER, next) & loner_issued) == 0 &&
                 (~lock->gcaps_allowed(CAP_XLOCKER, next) & xlocker_issued) == 0)) &&
      lock->get_state() != LOCK_SYNC_MIX2 &&  // these states need an explicit trigger from the auth mds
      lock->get_state() != LOCK_MIX_SYNC2
      ) {
    dout(7) << "eval_gather finished gather on " << *lock
            << " on " << *lock->get_parent() << dendl;

    if (lock->get_sm() == &sm_filelock) {
      ceph_assert(in);
      if (in->state_test(CInode::STATE_RECOVERING)) {
        dout(7) << "eval_gather finished gather, but still recovering" << dendl;
        return;
      } else if (in->state_test(CInode::STATE_NEEDSRECOVER)) {
        dout(7) << "eval_gather finished gather, but need to recover" << dendl;
        mds->mdcache->queue_file_recover(in);
        mds->mdcache->do_file_recover();
        return;
      }
    }

    if (!lock->get_parent()->is_auth()) {
      // replica: tell auth
      mds_rank_t auth = lock->get_parent()->authority().first;

      if (lock->get_parent()->is_rejoining() &&
          mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) {
        dout(7) << "eval_gather finished gather, but still rejoining "
                << *lock->get_parent() << dendl;
        return;
      }

      if (!mds->is_cluster_degraded() ||
          mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) {
        switch (lock->get_state()) {
        case LOCK_SYNC_LOCK:
          mds->send_message_mds(make_message<MLock>(lock, LOCK_AC_LOCKACK, mds->get_nodeid()), auth);
          break;

        case LOCK_MIX_SYNC:
          {
            auto reply = make_message<MLock>(lock, LOCK_AC_SYNCACK, mds->get_nodeid());
            lock->encode_locked_state(reply->get_data());
            mds->send_message_mds(reply, auth);
            next = LOCK_MIX_SYNC2;
            (static_cast<ScatterLock *>(lock))->start_flush();
          }
          break;

        case LOCK_MIX_SYNC2:
          (static_cast<ScatterLock *>(lock))->finish_flush();
          (static_cast<ScatterLock *>(lock))->clear_flushed();

        case LOCK_SYNC_MIX2:
          // do nothing, we already acked
          break;

        case LOCK_SYNC_MIX:
          {
            auto reply = make_message<MLock>(lock, LOCK_AC_MIXACK, mds->get_nodeid());
            mds->send_message_mds(reply, auth);
            next = LOCK_SYNC_MIX2;
          }
          break;

        case LOCK_MIX_LOCK:
          {
            bufferlist data;
            lock->encode_locked_state(data);
            mds->send_message_mds(make_message<MLock>(lock, LOCK_AC_LOCKACK, mds->get_nodeid(), data), auth);
            (static_cast<ScatterLock *>(lock))->start_flush();
            // we'll get an AC_LOCKFLUSHED to complete
          }
          break;

        default:
          ceph_abort();
        }
      }
    } else {
      // auth

      // once the first (local) stage of mix->lock gather complete we can
      // gather from replicas
      if (lock->get_state() == LOCK_MIX_LOCK &&
          lock->get_parent()->is_replicated()) {
        dout(10) << " finished (local) gather for mix->lock, now gathering from replicas" << dendl;
        send_lock_message(lock, LOCK_AC_LOCK);
        lock->init_gather();
        lock->set_state(LOCK_MIX_LOCK2);
        return;
      }

      if (lock->is_dirty() && !lock->is_flushed()) {
        scatter_writebehind(static_cast<ScatterLock *>(lock));
        return;
      }
      lock->clear_flushed();

      switch (lock->get_state()) {
        // to mixed
      case LOCK_TSYN_MIX:
      case LOCK_SYNC_MIX:
      case LOCK_EXCL_MIX:
      case LOCK_XSYN_MIX:
        in->start_scatter(static_cast<ScatterLock *>(lock));
        if (lock->get_parent()->is_replicated()) {
          bufferlist softdata;
          lock->encode_locked_state(softdata);
          send_lock_message(lock, LOCK_AC_MIX, softdata);
        }
        (static_cast<ScatterLock *>(lock))->clear_scatter_wanted();
        break;

      case LOCK_XLOCK:
      case LOCK_XLOCKDONE:
        if (next != LOCK_SYNC)
          break;
        // fall-thru

        // to sync
      case LOCK_EXCL_SYNC:
      case LOCK_LOCK_SYNC:
      case LOCK_MIX_SYNC:
      case LOCK_XSYN_SYNC:
        if (lock->get_parent()->is_replicated()) {
          bufferlist softdata;
          lock->encode_locked_state(softdata);
          send_lock_message(lock, LOCK_AC_SYNC, softdata);
        }
        break;
      case LOCK_XLOCKSNAP:
        if (lock->get_sm() == &sm_filelock) {
          int pending = lock->gcaps_allowed(CAP_ANY) ||
                        lock->gcaps_allowed(CAP_LONER) ||
                        lock->gcaps_allowed(CAP_XLOCKER);
          int revoke = ~pending & (loner_issued | other_issued | xlocker_issued);

          // wait for 'Fb' to be revoked
          if (revoke & CEPH_CAP_GBUFFER) {
            return;
          }
        }
        break;
      }

    }

    lock->set_state(next);

    if (lock->get_parent()->is_auth() &&
        lock->is_stable())
      lock->get_parent()->auth_unpin(lock);

    // drop loner before doing waiters
    if (caps &&
        in->is_head() &&
        in->is_auth() &&
        in->get_wanted_loner() != in->get_loner()) {
      dout(10) << " trying to drop loner" << dendl;
      if (in->try_drop_loner()) {
        dout(10) << " dropped loner" << dendl;
        need_issue = true;
      }
    }

    if (pfinishers)
      lock->take_waiting(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR|SimpleLock::WAIT_RD|SimpleLock::WAIT_XLOCK,
                         *pfinishers);
    else
      lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR|SimpleLock::WAIT_RD|SimpleLock::WAIT_XLOCK);

    if (caps && in->is_head())
      need_issue = true;

    if (lock->get_parent()->is_auth() &&
        lock->is_stable())
      try_eval(lock, &need_issue);
  }

  if (need_issue) {
    if (pneed_issue)
      *pneed_issue = true;
    else if (in->is_head())
      issue_caps(in);
  }

}

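// Evaluate the inode locks selected by `mask`: choose or drop the loner
// client as wanted, run eval_any() on each selected lock, and issue caps at
// the end if anything became issuable. Returns the final need_issue value.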
bool Locker::eval(CInode *in, int mask, bool caps_imported)
{
  bool need_issue = caps_imported;
  MDSContext::vec finishers;

  dout(10) << "eval " << mask << " " << *in << dendl;

  // choose loner?
  if (in->is_auth() && in->is_head()) {
    client_t orig_loner = in->get_loner();
    if (in->choose_ideal_loner()) {
      dout(10) << "eval set loner: client." << orig_loner << " -> client." << in->get_loner() << dendl;
      need_issue = true;
      mask = -1;
    } else if (in->get_wanted_loner() != in->get_loner()) {
      dout(10) << "eval want loner: client." << in->get_wanted_loner() << " but failed to set it" << dendl;
      mask = -1;
    }
  }

 retry:
  if (mask & CEPH_LOCK_IFILE)
    eval_any(&in->filelock, &need_issue, &finishers, caps_imported);
  if (mask & CEPH_LOCK_IAUTH)
    eval_any(&in->authlock, &need_issue, &finishers, caps_imported);
  if (mask & CEPH_LOCK_ILINK)
    eval_any(&in->linklock, &need_issue, &finishers, caps_imported);
  if (mask & CEPH_LOCK_IXATTR)
    eval_any(&in->xattrlock, &need_issue, &finishers, caps_imported);
  if (mask & CEPH_LOCK_INEST)
    eval_any(&in->nestlock, &need_issue, &finishers, caps_imported);
  if (mask & CEPH_LOCK_IFLOCK)
    eval_any(&in->flocklock, &need_issue, &finishers, caps_imported);
  if (mask & CEPH_LOCK_IPOLICY)
    eval_any(&in->policylock, &need_issue, &finishers, caps_imported);

  // drop loner?
  if (in->is_auth() && in->is_head() && in->get_wanted_loner() != in->get_loner()) {
    if (in->try_drop_loner()) {
      need_issue = true;
      if (in->get_wanted_loner() >= 0) {
        dout(10) << "eval end set loner to client." << in->get_wanted_loner() << dendl;
        bool ok = in->try_set_loner();
        ceph_assert(ok);
        mask = -1;
        goto retry;
      }
    }
  }

  finish_contexts(g_ceph_context, finishers);

  if (need_issue && in->is_head())
    issue_caps(in);

  dout(10) << "eval done" << dendl;
  return need_issue;
}

class C_Locker_Eval : public LockerContext {
  MDSCacheObject *p;
  int mask;
public:
  C_Locker_Eval(Locker *l, MDSCacheObject *pp, int m) : LockerContext(l), p(pp), mask(m) {
    // We are used as an MDSCacheObject waiter, so should
    // only be invoked by someone already holding the big lock.
    ceph_assert(ceph_mutex_is_locked_by_me(locker->mds->mds_lock));
    p->get(MDSCacheObject::PIN_PTRWAITER);
  }
  void finish(int r) override {
    locker->try_eval(p, mask);
    p->put(MDSCacheObject::PIN_PTRWAITER);
  }
};

void Locker::try_eval(MDSCacheObject *p, int mask)
{
  // unstable and ambiguous auth?
  if (p->is_ambiguous_auth()) {
    dout(7) << "try_eval ambiguous auth, waiting on " << *p << dendl;
    p->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_Eval(this, p, mask));
    return;
  }

  if (p->is_auth() && p->is_frozen()) {
    dout(7) << "try_eval frozen, waiting on " << *p << dendl;
    p->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_Eval(this, p, mask));
    return;
  }

  if (mask & CEPH_LOCK_DN) {
    ceph_assert(mask == CEPH_LOCK_DN);
    bool need_issue = false;  // ignore this, no caps on dentries
    CDentry *dn = static_cast<CDentry *>(p);
    eval_any(&dn->lock, &need_issue);
  } else {
    CInode *in = static_cast<CInode *>(p);
    eval(in, mask);
  }
}

void Locker::try_eval(SimpleLock *lock, bool *pneed_issue)
{
  MDSCacheObject *p = lock->get_parent();

  // unstable and ambiguous auth?
  if (p->is_ambiguous_auth()) {
    dout(7) << "try_eval " << *lock << " ambiguousauth, waiting on " << *p << dendl;
    p->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_Eval(this, p, lock->get_type()));
    return;
  }

  if (!p->is_auth()) {
    dout(7) << "try_eval " << *lock << " not auth for " << *p << dendl;
    return;
  }

  if (p->is_frozen()) {
    dout(7) << "try_eval " << *lock << " frozen, waiting on " << *p << dendl;
    p->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_Eval(this, p, lock->get_type()));
    return;
  }

  /*
   * We could have a situation like:
   *
   * - mds A authpins item on mds B
   * - mds B starts to freeze tree containing item
   * - mds A tries wrlock_start on A, sends REQSCATTER to B
   * - mds B lock is unstable, sets scatter_wanted
   * - mds B lock stabilizes, calls try_eval.
   *
   * We can defer while freezing without causing a deadlock.  Honor
   * scatter_wanted flag here.  This will never get deferred by the
   * checks above due to the auth_pin held by the leader.
   */
  if (lock->is_scatterlock()) {
    ScatterLock *slock = static_cast<ScatterLock *>(lock);
    if (slock->get_scatter_wanted() &&
        slock->get_state() != LOCK_MIX) {
      scatter_mix(slock, pneed_issue);
      if (!lock->is_stable())
        return;
    } else if (slock->get_unscatter_wanted() &&
               slock->get_state() != LOCK_LOCK) {
      simple_lock(slock, pneed_issue);
      if (!lock->is_stable()) {
        return;
      }
    }
  }

  if (lock->get_type() != CEPH_LOCK_DN &&
      lock->get_type() != CEPH_LOCK_ISNAP &&
      lock->get_type() != CEPH_LOCK_IPOLICY &&
      p->is_freezing()) {
    dout(7) << "try_eval " << *lock << " freezing, waiting on " << *p << dendl;
    p->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_Eval(this, p, lock->get_type()));
    return;
  }

  eval(lock, pneed_issue);
}

void Locker::eval_cap_gather(CInode *in, set<CInode*> *issue_set)
{
  bool need_issue = false;
  MDSContext::vec finishers;

  // kick locks now
  if (!in->filelock.is_stable())
    eval_gather(&in->filelock, false, &need_issue, &finishers);
  if (!in->authlock.is_stable())
    eval_gather(&in->authlock, false, &need_issue, &finishers);
  if (!in->linklock.is_stable())
    eval_gather(&in->linklock, false, &need_issue, &finishers);
  if (!in->xattrlock.is_stable())
    eval_gather(&in->xattrlock, false, &need_issue, &finishers);

  if (need_issue && in->is_head()) {
    if (issue_set)
      issue_set->insert(in);
    else
      issue_caps(in);
  }

  finish_contexts(g_ceph_context, finishers);
}

void Locker::eval_scatter_gathers(CInode *in)
{
  bool need_issue = false;
  MDSContext::vec finishers;

  dout(10) << "eval_scatter_gathers " << *in << dendl;

  // kick locks now
  if (!in->filelock.is_stable())
    eval_gather(&in->filelock, false, &need_issue, &finishers);
  if (!in->nestlock.is_stable())
    eval_gather(&in->nestlock, false, &need_issue, &finishers);
  if (!in->dirfragtreelock.is_stable())
    eval_gather(&in->dirfragtreelock, false, &need_issue, &finishers);

  if (need_issue && in->is_head())
    issue_caps(in);

  finish_contexts(g_ceph_context, finishers);
}

void Locker::eval(SimpleLock *lock, bool *need_issue)
{
  switch (lock->get_type()) {
  case CEPH_LOCK_IFILE:
    return file_eval(static_cast<ScatterLock*>(lock), need_issue);
  case CEPH_LOCK_IDFT:
  case CEPH_LOCK_INEST:
    return scatter_eval(static_cast<ScatterLock*>(lock), need_issue);
  default:
    return simple_eval(lock, need_issue);
  }
}


// ------------------
// rdlock

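// Nudge a stable lock toward a readable state: on the auth, move it to SYNC
// (or XSYN for a loner-capable file lock); on a replica, ask the auth with
// LOCK_AC_REQRDLOCK. Returns true only when the state was kicked locally
// and is worth re-checking right away.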
1525bool Locker::_rdlock_kick(SimpleLock *lock, bool as_anon)
1526{
1527 // kick the lock
1528 if (lock->is_stable()) {
1529 if (lock->get_parent()->is_auth()) {
1530 if (lock->get_sm() == &sm_scatterlock) {
1531 // not until tempsync is fully implemented
1532 //if (lock->get_parent()->is_replicated())
1533 //scatter_tempsync((ScatterLock*)lock);
1534 //else
1535 simple_sync(lock);
1536 } else if (lock->get_sm() == &sm_filelock) {
1537 CInode *in = static_cast<CInode*>(lock->get_parent());
1538 if (lock->get_state() == LOCK_EXCL &&
1539 in->get_target_loner() >= 0 &&
1540 !in->is_dir() && !as_anon) // as_anon => caller wants SYNC, not XSYN
1541 file_xsyn(lock);
1542 else
1543 simple_sync(lock);
1544 } else
1545 simple_sync(lock);
1546 return true;
1547 } else {
1548 // request rdlock state change from auth
1549 mds_rank_t auth = lock->get_parent()->authority().first;
1550 if (!mds->is_cluster_degraded() ||
1551 mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
1552 dout(10) << "requesting rdlock from auth on "
1553 << *lock << " on " << *lock->get_parent() << dendl;
9f95a23c 1554 mds->send_message_mds(make_message<MLock>(lock, LOCK_AC_REQRDLOCK, mds->get_nodeid()), auth);
7c673cae
FG
1555 }
1556 return false;
1557 }
1558 }
1559 if (lock->get_type() == CEPH_LOCK_IFILE) {
1560 CInode *in = static_cast<CInode *>(lock->get_parent());
1561 if (in->state_test(CInode::STATE_RECOVERING)) {
1562 mds->mdcache->recovery_queue.prioritize(in);
1563 }
1564 }
1565
1566 return false;
1567}
1568
9f95a23c 1569bool Locker::rdlock_try(SimpleLock *lock, client_t client)
7c673cae
FG
1570{
1571 dout(7) << "rdlock_try on " << *lock << " on " << *lock->get_parent() << dendl;
1572
1573 // can read? grab ref.
1574 if (lock->can_rdlock(client))
1575 return true;
1576
1577 _rdlock_kick(lock, false);
1578
1579 if (lock->can_rdlock(client))
1580 return true;
1581
7c673cae
FG
1582 return false;
1583}
1584
1585bool Locker::rdlock_start(SimpleLock *lock, MDRequestRef& mut, bool as_anon)
1586{
1587 dout(7) << "rdlock_start on " << *lock << " on " << *lock->get_parent() << dendl;
1588
1589 // client may be allowed to rdlock the same item it has xlocked.
1590 // UNLESS someone passes in as_anon, or we're reading snapped version here.
1591 if (mut->snapid != CEPH_NOSNAP)
1592 as_anon = true;
1593 client_t client = as_anon ? -1 : mut->get_client();
1594
1595 CInode *in = 0;
1596 if (lock->get_type() != CEPH_LOCK_DN)
1597 in = static_cast<CInode *>(lock->get_parent());
1598
1599 /*
1600 if (!lock->get_parent()->is_auth() &&
1601 lock->fw_rdlock_to_auth()) {
1602 mdcache->request_forward(mut, lock->get_parent()->authority().first);
1603 return false;
1604 }
1605 */
1606
1607 while (1) {
1608 // can read? grab ref.
1609 if (lock->can_rdlock(client)) {
1610 lock->get_rdlock();
9f95a23c 1611 mut->emplace_lock(lock, MutationImpl::LockOp::RDLOCK);
7c673cae
FG
1612 return true;
1613 }
1614
1615 // hmm, wait a second.
1616 if (in && !in->is_head() && in->is_auth() &&
1617 lock->get_state() == LOCK_SNAP_SYNC) {
1618 // okay, we actually need to kick the head's lock to get ourselves synced up.
1619 CInode *head = mdcache->get_inode(in->ino());
11fdf7f2 1620 ceph_assert(head);
c07f9fc5
FG
1621 SimpleLock *hlock = head->get_lock(CEPH_LOCK_IFILE);
1622 if (hlock->get_state() == LOCK_SYNC)
1623 hlock = head->get_lock(lock->get_type());
1624
7c673cae
FG
1625 if (hlock->get_state() != LOCK_SYNC) {
1626 dout(10) << "rdlock_start trying head inode " << *head << dendl;
c07f9fc5 1627 if (!rdlock_start(hlock, mut, true)) // ** as_anon, no rdlock on EXCL **
7c673cae
FG
1628 return false;
1629 // oh, check our lock again then
1630 }
1631 }
1632
1633 if (!_rdlock_kick(lock, as_anon))
1634 break;
1635 }
1636
1637 // wait!
1638 int wait_on;
1639 if (lock->get_parent()->is_auth() && lock->is_stable())
1640 wait_on = SimpleLock::WAIT_RD;
1641 else
1642 wait_on = SimpleLock::WAIT_STABLE; // REQRDLOCK is ignored if lock is unstable, so we need to retry.
1643 dout(7) << "rdlock_start waiting on " << *lock << " on " << *lock->get_parent() << dendl;
1644 lock->add_waiter(wait_on, new C_MDS_RetryRequest(mdcache, mut));
1645 nudge_log(lock);
1646 return false;
1647}
1648
1649void Locker::nudge_log(SimpleLock *lock)
1650{
1651 dout(10) << "nudge_log " << *lock << " on " << *lock->get_parent() << dendl;
1652 if (lock->get_parent()->is_auth() && lock->is_unstable_and_locked()) // as with xlockdone, or cap flush
1653 mds->mdlog->flush();
1654}
1655
11fdf7f2 1656void Locker::rdlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue)
7c673cae 1657{
11fdf7f2
TL
1658 ceph_assert(it->is_rdlock());
1659 SimpleLock *lock = it->lock;
7c673cae
FG
1660 // drop ref
1661 lock->put_rdlock();
11fdf7f2
TL
1662 if (mut)
1663 mut->locks.erase(it);
7c673cae
FG
1664
1665 dout(7) << "rdlock_finish on " << *lock << " on " << *lock->get_parent() << dendl;
1666
1667 // last one?
1668 if (!lock->is_rdlocked()) {
1669 if (!lock->is_stable())
1670 eval_gather(lock, false, pneed_issue);
1671 else if (lock->get_parent()->is_auth())
1672 try_eval(lock, pneed_issue);
1673 }
1674}
1675
9f95a23c 1676bool Locker::rdlock_try_set(MutationImpl::LockOpVec& lov, MDRequestRef& mdr)
7c673cae 1677{
9f95a23c 1678 dout(10) << __func__ << dendl;
11fdf7f2 1679 for (const auto& p : lov) {
9f95a23c 1680 auto lock = p.lock;
11fdf7f2 1681 ceph_assert(p.is_rdlock());
9f95a23c
TL
1682 if (!mdr->is_rdlocked(lock) && !rdlock_try(lock, mdr->get_client())) {
1683 lock->add_waiter(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD,
1684 new C_MDS_RetryRequest(mdcache, mdr));
1685 goto failed;
7c673cae 1686 }
9f95a23c
TL
1687 lock->get_rdlock();
1688 mdr->emplace_lock(lock, MutationImpl::LockOp::RDLOCK);
1689 dout(20) << " got rdlock on " << *lock << " " << *lock->get_parent() << dendl;
11fdf7f2 1690 }
9f95a23c 1691
7c673cae 1692 return true;
9f95a23c
TL
1693failed:
1694 dout(10) << __func__ << " failed" << dendl;
1695 drop_locks(mdr.get(), nullptr);
1696 mdr->drop_local_auth_pins();
1697 return false;
7c673cae
FG
1698}
1699
9f95a23c 1700bool Locker::rdlock_try_set(MutationImpl::LockOpVec& lov, MutationRef& mut)
7c673cae 1701{
9f95a23c 1702 dout(10) << __func__ << dendl;
11fdf7f2 1703 for (const auto& p : lov) {
9f95a23c 1704 auto lock = p.lock;
11fdf7f2 1705 ceph_assert(p.is_rdlock());
9f95a23c
TL
1706 if (!lock->can_rdlock(mut->get_client()))
1707 return false;
11fdf7f2 1708 p.lock->get_rdlock();
9f95a23c 1709 mut->emplace_lock(p.lock, MutationImpl::LockOp::RDLOCK);
7c673cae 1710 }
9f95a23c 1711 return true;
7c673cae
FG
1712}
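The two rdlock_try_set() overloads above differ only in how they fail: the MDRequestRef variant queues a retry waiter and then drops the request's locks and auth pins, while the MutationRef variant just reports failure and leaves recovery to the caller. A minimal caller-side sketch (not from Locker.cc; it assumes the add_rdlock() helper that MutationImpl::LockOpVec offers to other MDS code):

MutationImpl::LockOpVec lov;
lov.add_rdlock(&in->authlock);      // batch the read locks this path needs
lov.add_rdlock(&in->xattrlock);
if (!rdlock_try_set(lov, mdr))
  return;                           // waiter already queued; bail and retry later
// all rdlocks are now recorded on mdr and get released via drop_locks()/rdlock_finish()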
1713
1714// ------------------
1715// wrlock
1716
1717void Locker::wrlock_force(SimpleLock *lock, MutationRef& mut)
1718{
1719 if (lock->get_type() == CEPH_LOCK_IVERSION ||
1720 lock->get_type() == CEPH_LOCK_DVERSION)
f67539c2 1721 return local_wrlock_grab(static_cast<LocalLockC*>(lock), mut);
7c673cae
FG
1722
1723 dout(7) << "wrlock_force on " << *lock
1724 << " on " << *lock->get_parent() << dendl;
1725 lock->get_wrlock(true);
9f95a23c
TL
1726 mut->emplace_lock(lock, MutationImpl::LockOp::WRLOCK);
1727}
1728
1729bool Locker::wrlock_try(SimpleLock *lock, const MutationRef& mut, client_t client)
1730{
1731 dout(10) << "wrlock_try " << *lock << " on " << *lock->get_parent() << dendl;
1732 if (client == -1)
1733 client = mut->get_client();
1734
1735 while (1) {
1736 if (lock->can_wrlock(client)) {
1737 lock->get_wrlock();
1738 auto it = mut->emplace_lock(lock, MutationImpl::LockOp::WRLOCK);
1739 it->flags |= MutationImpl::LockOp::WRLOCK; // may already remote_wrlocked
1740 return true;
1741 }
1742 if (!lock->is_stable())
1743 break;
1744 CInode *in = static_cast<CInode *>(lock->get_parent());
1745 if (!in->is_auth())
1746 break;
 1747 // The caller may already have a log entry open. To avoid calling
 1748 // scatter_writebehind or start_scatter, don't change the nest lock
 1749 // state if it has dirty scatter data.
1750 if (lock->is_dirty())
1751 break;
 1752 // To avoid calling scatter_writebehind or start_scatter, don't
 1753 // change the nest lock state to MIX.
1754 ScatterLock *slock = static_cast<ScatterLock*>(lock);
1755 if (slock->get_scatter_wanted() || in->has_subtree_or_exporting_dirfrag())
1756 break;
1757
1758 simple_lock(lock);
1759 }
1760 return false;
7c673cae
FG
1761}
1762
9f95a23c 1763bool Locker::wrlock_start(const MutationImpl::LockOp &op, MDRequestRef& mut)
7c673cae 1764{
9f95a23c 1765 SimpleLock *lock = op.lock;
7c673cae
FG
1766 if (lock->get_type() == CEPH_LOCK_IVERSION ||
1767 lock->get_type() == CEPH_LOCK_DVERSION)
f67539c2 1768 return local_wrlock_start(static_cast<LocalLockC*>(lock), mut);
7c673cae
FG
1769
1770 dout(10) << "wrlock_start " << *lock << " on " << *lock->get_parent() << dendl;
1771
1772 CInode *in = static_cast<CInode *>(lock->get_parent());
9f95a23c
TL
1773 client_t client = op.is_state_pin() ? lock->get_excl_client() : mut->get_client();
1774 bool want_scatter = lock->get_parent()->is_auth() &&
7c673cae
FG
1775 (in->has_subtree_or_exporting_dirfrag() ||
1776 static_cast<ScatterLock*>(lock)->get_scatter_wanted());
1777
1778 while (1) {
1779 // wrlock?
1780 if (lock->can_wrlock(client) &&
1781 (!want_scatter || lock->get_state() == LOCK_MIX)) {
1782 lock->get_wrlock();
9f95a23c 1783 auto it = mut->emplace_lock(lock, MutationImpl::LockOp::WRLOCK);
11fdf7f2 1784 it->flags |= MutationImpl::LockOp::WRLOCK; // may already remote_wrlocked
7c673cae
FG
1785 return true;
1786 }
1787
1788 if (lock->get_type() == CEPH_LOCK_IFILE &&
1789 in->state_test(CInode::STATE_RECOVERING)) {
1790 mds->mdcache->recovery_queue.prioritize(in);
1791 }
1792
1793 if (!lock->is_stable())
1794 break;
1795
1796 if (in->is_auth()) {
7c673cae
FG
1797 if (want_scatter)
1798 scatter_mix(static_cast<ScatterLock*>(lock));
1799 else
1800 simple_lock(lock);
7c673cae
FG
1801 } else {
1802 // replica.
1803 // auth should be auth_pinned (see acquire_locks wrlock weird mustpin case).
1804 mds_rank_t auth = lock->get_parent()->authority().first;
1805 if (!mds->is_cluster_degraded() ||
1806 mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
1807 dout(10) << "requesting scatter from auth on "
1808 << *lock << " on " << *lock->get_parent() << dendl;
9f95a23c 1809 mds->send_message_mds(make_message<MLock>(lock, LOCK_AC_REQSCATTER, mds->get_nodeid()), auth);
7c673cae
FG
1810 }
1811 break;
1812 }
1813 }
1814
9f95a23c
TL
1815 dout(7) << "wrlock_start waiting on " << *lock << " on " << *lock->get_parent() << dendl;
1816 lock->add_waiter(SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mut));
1817 nudge_log(lock);
7c673cae
FG
1818
1819 return false;
1820}
1821
11fdf7f2 1822void Locker::wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue)
7c673cae 1823{
11fdf7f2
TL
1824 ceph_assert(it->is_wrlock());
1825 SimpleLock* lock = it->lock;
1826
7c673cae
FG
1827 if (lock->get_type() == CEPH_LOCK_IVERSION ||
1828 lock->get_type() == CEPH_LOCK_DVERSION)
11fdf7f2 1829 return local_wrlock_finish(it, mut);
7c673cae
FG
1830
1831 dout(7) << "wrlock_finish on " << *lock << " on " << *lock->get_parent() << dendl;
1832 lock->put_wrlock();
11fdf7f2
TL
1833
1834 if (it->is_remote_wrlock())
1835 it->clear_wrlock();
1836 else
1837 mut->locks.erase(it);
7c673cae 1838
9f95a23c
TL
1839 if (lock->is_wrlocked()) {
1840 // Evaluate unstable lock after scatter_writebehind_finish(). Because
1841 // eval_gather() does not change lock's state when lock is flushing.
1842 if (!lock->is_stable() && lock->is_flushed() &&
1843 lock->get_parent()->is_auth())
1844 eval_gather(lock, false, pneed_issue);
1845 } else {
7c673cae
FG
1846 if (!lock->is_stable())
1847 eval_gather(lock, false, pneed_issue);
1848 else if (lock->get_parent()->is_auth())
1849 try_eval(lock, pneed_issue);
1850 }
1851}
1852
1853
1854// remote wrlock
1855
1856void Locker::remote_wrlock_start(SimpleLock *lock, mds_rank_t target, MDRequestRef& mut)
1857{
1858 dout(7) << "remote_wrlock_start mds." << target << " on " << *lock << " on " << *lock->get_parent() << dendl;
1859
1860 // wait for active target
1861 if (mds->is_cluster_degraded() &&
1862 !mds->mdsmap->is_clientreplay_or_active_or_stopping(target)) {
1863 dout(7) << " mds." << target << " is not active" << dendl;
f67539c2 1864 if (mut->more()->waiting_on_peer.empty())
7c673cae
FG
1865 mds->wait_for_active_peer(target, new C_MDS_RetryRequest(mdcache, mut));
1866 return;
1867 }
1868
1869 // send lock request
1870 mut->start_locking(lock, target);
f67539c2
TL
1871 mut->more()->peers.insert(target);
1872 auto r = make_message<MMDSPeerRequest>(mut->reqid, mut->attempt, MMDSPeerRequest::OP_WRLOCK);
7c673cae
FG
1873 r->set_lock_type(lock->get_type());
1874 lock->get_parent()->set_object_info(r->get_object_info());
1875 mds->send_message_mds(r, target);
1876
f67539c2
TL
1877 ceph_assert(mut->more()->waiting_on_peer.count(target) == 0);
1878 mut->more()->waiting_on_peer.insert(target);
7c673cae
FG
1879}
1880
11fdf7f2 1881void Locker::remote_wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut)
7c673cae 1882{
11fdf7f2
TL
1883 ceph_assert(it->is_remote_wrlock());
1884 SimpleLock *lock = it->lock;
1885 mds_rank_t target = it->wrlock_target;
1886
1887 if (it->is_wrlock())
1888 it->clear_remote_wrlock();
1889 else
1890 mut->locks.erase(it);
7c673cae
FG
1891
1892 dout(7) << "remote_wrlock_finish releasing remote wrlock on mds." << target
1893 << " " << *lock->get_parent() << dendl;
1894 if (!mds->is_cluster_degraded() ||
1895 mds->mdsmap->get_state(target) >= MDSMap::STATE_REJOIN) {
f67539c2
TL
1896 auto peerreq = make_message<MMDSPeerRequest>(mut->reqid, mut->attempt, MMDSPeerRequest::OP_UNWRLOCK);
1897 peerreq->set_lock_type(lock->get_type());
1898 lock->get_parent()->set_object_info(peerreq->get_object_info());
1899 mds->send_message_mds(peerreq, target);
7c673cae
FG
1900 }
1901}
1902
1903
1904// ------------------
1905// xlock
1906
1907bool Locker::xlock_start(SimpleLock *lock, MDRequestRef& mut)
1908{
1909 if (lock->get_type() == CEPH_LOCK_IVERSION ||
1910 lock->get_type() == CEPH_LOCK_DVERSION)
f67539c2 1911 return local_xlock_start(static_cast<LocalLockC*>(lock), mut);
7c673cae
FG
1912
1913 dout(7) << "xlock_start on " << *lock << " on " << *lock->get_parent() << dendl;
1914 client_t client = mut->get_client();
1915
a8e16298
TL
1916 CInode *in = nullptr;
1917 if (lock->get_cap_shift())
1918 in = static_cast<CInode *>(lock->get_parent());
1919
7c673cae
FG
1920 // auth?
1921 if (lock->get_parent()->is_auth()) {
1922 // auth
1923 while (1) {
e306af50
TL
1924 if (mut->locking && // started xlock (not preempt other request)
1925 lock->can_xlock(client) &&
a8e16298
TL
1926 !(lock->get_state() == LOCK_LOCK_XLOCK && // client is not xlocker or
1927 in && in->issued_caps_need_gather(lock))) { // xlocker does not hold shared cap
7c673cae
FG
1928 lock->set_state(LOCK_XLOCK);
1929 lock->get_xlock(mut, client);
9f95a23c 1930 mut->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
7c673cae
FG
1931 mut->finish_locking(lock);
1932 return true;
1933 }
1934
a8e16298
TL
1935 if (lock->get_type() == CEPH_LOCK_IFILE &&
1936 in->state_test(CInode::STATE_RECOVERING)) {
1937 mds->mdcache->recovery_queue.prioritize(in);
7c673cae
FG
1938 }
1939
1940 if (!lock->is_stable() && (lock->get_state() != LOCK_XLOCKDONE ||
39ae355f
TL
1941 lock->get_xlock_by_client() != client ||
1942 lock->is_waiter_for(SimpleLock::WAIT_STABLE)))
1943 break;
1944 // Avoid unstable XLOCKDONE state reset,
1945 // see: https://tracker.ceph.com/issues/49132
1946 if (lock->get_state() == LOCK_XLOCKDONE &&
1947 lock->get_type() == CEPH_LOCK_IFILE)
7c673cae
FG
1948 break;
1949
1950 if (lock->get_state() == LOCK_LOCK || lock->get_state() == LOCK_XLOCKDONE) {
1951 mut->start_locking(lock);
1952 simple_xlock(lock);
1953 } else {
1954 simple_lock(lock);
1955 }
1956 }
1957
1958 lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mut));
1959 nudge_log(lock);
1960 return false;
1961 } else {
1962 // replica
11fdf7f2 1963 ceph_assert(lock->get_sm()->can_remote_xlock);
f67539c2 1964 ceph_assert(!mut->peer_request);
7c673cae
FG
1965
1966 // wait for single auth
1967 if (lock->get_parent()->is_ambiguous_auth()) {
1968 lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH,
1969 new C_MDS_RetryRequest(mdcache, mut));
1970 return false;
1971 }
1972
1973 // wait for active auth
1974 mds_rank_t auth = lock->get_parent()->authority().first;
1975 if (mds->is_cluster_degraded() &&
1976 !mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
1977 dout(7) << " mds." << auth << " is not active" << dendl;
f67539c2 1978 if (mut->more()->waiting_on_peer.empty())
7c673cae
FG
1979 mds->wait_for_active_peer(auth, new C_MDS_RetryRequest(mdcache, mut));
1980 return false;
1981 }
1982
1983 // send lock request
f67539c2 1984 mut->more()->peers.insert(auth);
7c673cae 1985 mut->start_locking(lock, auth);
f67539c2 1986 auto r = make_message<MMDSPeerRequest>(mut->reqid, mut->attempt, MMDSPeerRequest::OP_XLOCK);
7c673cae
FG
1987 r->set_lock_type(lock->get_type());
1988 lock->get_parent()->set_object_info(r->get_object_info());
1989 mds->send_message_mds(r, auth);
1990
f67539c2
TL
1991 ceph_assert(mut->more()->waiting_on_peer.count(auth) == 0);
1992 mut->more()->waiting_on_peer.insert(auth);
7c673cae
FG
1993
1994 return false;
1995 }
1996}
1997
1998void Locker::_finish_xlock(SimpleLock *lock, client_t xlocker, bool *pneed_issue)
1999{
11fdf7f2
TL
2000 if (lock->get_type() != CEPH_LOCK_DN &&
2001 lock->get_type() != CEPH_LOCK_ISNAP &&
9f95a23c 2002 lock->get_type() != CEPH_LOCK_IPOLICY &&
11fdf7f2 2003 lock->get_num_rdlocks() == 0 &&
7c673cae 2004 lock->get_num_wrlocks() == 0 &&
b32b8144 2005 !lock->is_leased() &&
11fdf7f2 2006 lock->get_state() != LOCK_XLOCKSNAP) {
7c673cae
FG
2007 CInode *in = static_cast<CInode*>(lock->get_parent());
2008 client_t loner = in->get_target_loner();
2009 if (loner >= 0 && (xlocker < 0 || xlocker == loner)) {
2010 lock->set_state(LOCK_EXCL);
2011 lock->get_parent()->auth_unpin(lock);
2012 lock->finish_waiters(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_WR|SimpleLock::WAIT_RD);
2013 if (lock->get_cap_shift())
2014 *pneed_issue = true;
2015 if (lock->get_parent()->is_auth() &&
2016 lock->is_stable())
2017 try_eval(lock, pneed_issue);
2018 return;
2019 }
2020 }
 2021 // the xlocker may hold CEPH_CAP_GSHARED; revoke it if the next state is LOCK_LOCK
2022 eval_gather(lock, lock->get_state() != LOCK_XLOCKSNAP, pneed_issue);
2023}
2024
11fdf7f2 2025void Locker::xlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut, bool *pneed_issue)
7c673cae 2026{
11fdf7f2
TL
2027 ceph_assert(it->is_xlock());
2028 SimpleLock *lock = it->lock;
2029
7c673cae
FG
2030 if (lock->get_type() == CEPH_LOCK_IVERSION ||
2031 lock->get_type() == CEPH_LOCK_DVERSION)
11fdf7f2 2032 return local_xlock_finish(it, mut);
7c673cae
FG
2033
2034 dout(10) << "xlock_finish on " << *lock << " " << *lock->get_parent() << dendl;
2035
2036 client_t xlocker = lock->get_xlock_by_client();
2037
2038 // drop ref
2039 lock->put_xlock();
11fdf7f2
TL
2040 ceph_assert(mut);
2041 mut->locks.erase(it);
7c673cae
FG
2042
2043 bool do_issue = false;
2044
2045 // remote xlock?
2046 if (!lock->get_parent()->is_auth()) {
11fdf7f2 2047 ceph_assert(lock->get_sm()->can_remote_xlock);
7c673cae
FG
2048
2049 // tell auth
2050 dout(7) << "xlock_finish releasing remote xlock on " << *lock->get_parent() << dendl;
2051 mds_rank_t auth = lock->get_parent()->authority().first;
2052 if (!mds->is_cluster_degraded() ||
2053 mds->mdsmap->get_state(auth) >= MDSMap::STATE_REJOIN) {
f67539c2
TL
2054 auto peerreq = make_message<MMDSPeerRequest>(mut->reqid, mut->attempt, MMDSPeerRequest::OP_UNXLOCK);
2055 peerreq->set_lock_type(lock->get_type());
2056 lock->get_parent()->set_object_info(peerreq->get_object_info());
2057 mds->send_message_mds(peerreq, auth);
7c673cae
FG
2058 }
2059 // others waiting?
2060 lock->finish_waiters(SimpleLock::WAIT_STABLE |
2061 SimpleLock::WAIT_WR |
2062 SimpleLock::WAIT_RD, 0);
2063 } else {
a8e16298
TL
2064 if (lock->get_num_xlocks() == 0 &&
2065 lock->get_state() != LOCK_LOCK_XLOCK) { // no one is taking xlock
7c673cae
FG
2066 _finish_xlock(lock, xlocker, &do_issue);
2067 }
2068 }
2069
2070 if (do_issue) {
2071 CInode *in = static_cast<CInode*>(lock->get_parent());
2072 if (in->is_head()) {
2073 if (pneed_issue)
2074 *pneed_issue = true;
2075 else
2076 issue_caps(in);
2077 }
2078 }
2079}
2080
11fdf7f2 2081void Locker::xlock_export(const MutationImpl::lock_iterator& it, MutationImpl *mut)
7c673cae 2082{
11fdf7f2
TL
2083 ceph_assert(it->is_xlock());
2084 SimpleLock *lock = it->lock;
7c673cae
FG
2085 dout(10) << "xlock_export on " << *lock << " " << *lock->get_parent() << dendl;
2086
2087 lock->put_xlock();
11fdf7f2 2088 mut->locks.erase(it);
7c673cae
FG
2089
2090 MDSCacheObject *p = lock->get_parent();
11fdf7f2 2091 ceph_assert(p->state_test(CInode::STATE_AMBIGUOUSAUTH)); // we are exporting this (inode)
7c673cae
FG
2092
2093 if (!lock->is_stable())
2094 lock->get_parent()->auth_unpin(lock);
2095
2096 lock->set_state(LOCK_LOCK);
2097}
2098
2099void Locker::xlock_import(SimpleLock *lock)
2100{
2101 dout(10) << "xlock_import on " << *lock << " " << *lock->get_parent() << dendl;
2102 lock->get_parent()->auth_pin(lock);
2103}
2104
9f95a23c
TL
2105void Locker::xlock_downgrade(SimpleLock *lock, MutationImpl *mut)
2106{
2107 dout(10) << "xlock_downgrade on " << *lock << " " << *lock->get_parent() << dendl;
2108 auto it = mut->locks.find(lock);
2109 if (it->is_rdlock())
2110 return; // already downgraded
2111
2112 ceph_assert(lock->get_parent()->is_auth());
2113 ceph_assert(it != mut->locks.end());
2114 ceph_assert(it->is_xlock());
2115
2116 lock->set_xlock_done();
2117 lock->get_rdlock();
2118 xlock_finish(it, mut, nullptr);
2119 mut->emplace_lock(lock, MutationImpl::LockOp::RDLOCK);
2120}
7c673cae
FG
2121
2122
2123// file i/o -----------------------------------------
2124
2125version_t Locker::issue_file_data_version(CInode *in)
2126{
2127 dout(7) << "issue_file_data_version on " << *in << dendl;
f67539c2 2128 return in->get_inode()->file_data_version;
7c673cae
FG
2129}
2130
2131class C_Locker_FileUpdate_finish : public LockerLogContext {
2132 CInode *in;
2133 MutationRef mut;
11fdf7f2 2134 unsigned flags;
7c673cae 2135 client_t client;
9f95a23c 2136 ref_t<MClientCaps> ack;
7c673cae 2137public:
11fdf7f2 2138 C_Locker_FileUpdate_finish(Locker *l, CInode *i, MutationRef& m, unsigned f,
9f95a23c 2139 const ref_t<MClientCaps> &ack, client_t c=-1)
11fdf7f2 2140 : LockerLogContext(l), in(i), mut(m), flags(f), client(c), ack(ack) {
7c673cae
FG
2141 in->get(CInode::PIN_PTRWAITER);
2142 }
2143 void finish(int r) override {
11fdf7f2 2144 locker->file_update_finish(in, mut, flags, client, ack);
7c673cae
FG
2145 in->put(CInode::PIN_PTRWAITER);
2146 }
2147};
2148
11fdf7f2
TL
2149enum {
2150 UPDATE_SHAREMAX = 1,
2151 UPDATE_NEEDSISSUE = 2,
2152 UPDATE_SNAPFLUSH = 4,
2153};
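The UPDATE_* values are single bits so callers can OR them together; file_update_finish() below tests each with '&'. A sketch only (the combination shown is hypothetical; check_inode_max_size() later in this file passes UPDATE_SHAREMAX by itself):

mds->mdlog->submit_entry(le,
    new C_Locker_FileUpdate_finish(this, in, mut,
                                   UPDATE_SHAREMAX | UPDATE_NEEDSISSUE,
                                   ref_t<MClientCaps>(), client));
// on journal commit, file_update_finish() both shares the new max_size and
// re-issues caps to the named client if it still wants more than it has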
2154
2155void Locker::file_update_finish(CInode *in, MutationRef& mut, unsigned flags,
9f95a23c 2156 client_t client, const ref_t<MClientCaps> &ack)
7c673cae
FG
2157{
2158 dout(10) << "file_update_finish on " << *in << dendl;
7c673cae
FG
2159
2160 mut->apply();
11fdf7f2 2161
7c673cae
FG
2162 if (ack) {
2163 Session *session = mds->get_session(client);
92f5a8d4 2164 if (session && !session->is_closed()) {
7c673cae
FG
2165 // "oldest flush tid" > 0 means client uses unique TID for each flush
2166 if (ack->get_oldest_flush_tid() > 0)
11fdf7f2 2167 session->add_completed_flush(ack->get_client_tid());
7c673cae
FG
2168 mds->send_message_client_counted(ack, session);
2169 } else {
2170 dout(10) << " no session for client." << client << " " << *ack << dendl;
7c673cae
FG
2171 }
2172 }
2173
2174 set<CInode*> need_issue;
2175 drop_locks(mut.get(), &need_issue);
2176
11fdf7f2
TL
2177 if (in->is_head()) {
2178 if ((flags & UPDATE_NEEDSISSUE) && need_issue.count(in) == 0) {
2179 Capability *cap = in->get_client_cap(client);
2180 if (cap && (cap->wanted() & ~cap->pending()))
2181 issue_caps(in, cap);
2182 }
2183
2184 if ((flags & UPDATE_SHAREMAX) && in->is_auth() &&
2185 (in->filelock.gcaps_allowed(CAP_LONER) & (CEPH_CAP_GWR|CEPH_CAP_GBUFFER)))
2186 share_inode_max_size(in);
2187
2188 } else if ((flags & UPDATE_SNAPFLUSH) && !in->client_snap_caps.empty()) {
7c673cae
FG
2189 dout(10) << " client_snap_caps " << in->client_snap_caps << dendl;
2190 // check for snap writeback completion
eafe8130
TL
2191 in->client_snap_caps.erase(client);
2192 if (in->client_snap_caps.empty()) {
2193 for (int i = 0; i < num_cinode_locks; i++) {
2194 SimpleLock *lock = in->get_lock(cinode_lock_info[i].lock);
11fdf7f2 2195 ceph_assert(lock);
11fdf7f2 2196 lock->put_wrlock();
11fdf7f2 2197 }
eafe8130
TL
2198 in->item_open_file.remove_myself();
2199 in->item_caps.remove_myself();
7c673cae
FG
2200 eval_cap_gather(in, &need_issue);
2201 }
7c673cae
FG
2202 }
2203 issue_caps_set(need_issue);
2204
11fdf7f2 2205 mds->balancer->hit_inode(in, META_POP_IWR);
28e407b8 2206
7c673cae
FG
2207 // auth unpin after issuing caps
2208 mut->cleanup();
2209}
2210
2211Capability* Locker::issue_new_caps(CInode *in,
2212 int mode,
9f95a23c
TL
2213 MDRequestRef& mdr,
2214 SnapRealm *realm)
7c673cae
FG
2215{
2216 dout(7) << "issue_new_caps for mode " << mode << " on " << *in << dendl;
9f95a23c
TL
2217 Session *session = mdr->session;
2218 bool new_inode = (mdr->alloc_ino || mdr->used_prealloc_ino);
7c673cae 2219
9f95a23c
TL
2220 // if replay or async, try to reconnect cap, and otherwise do nothing.
2221 if (new_inode && mdr->client_request->is_queued_for_replay())
a8e16298
TL
2222 return mds->mdcache->try_reconnect_cap(in, session);
2223
7c673cae 2224 // my needs
11fdf7f2
TL
2225 ceph_assert(session->info.inst.name.is_client());
2226 client_t my_client = session->get_client();
7c673cae
FG
2227 int my_want = ceph_caps_for_mode(mode);
2228
2229 // register a capability
2230 Capability *cap = in->get_client_cap(my_client);
2231 if (!cap) {
2232 // new cap
9f95a23c 2233 cap = in->add_client_cap(my_client, session, realm, new_inode);
7c673cae
FG
2234 cap->set_wanted(my_want);
2235 cap->mark_new();
7c673cae 2236 } else {
7c673cae
FG
2237 // make sure it wants sufficient caps
2238 if (my_want & ~cap->wanted()) {
2239 // augment wanted caps for this client
2240 cap->set_wanted(cap->wanted() | my_want);
2241 }
2242 }
9f95a23c 2243 cap->inc_suppress(); // suppress file cap messages (we'll bundle with the request reply)
7c673cae
FG
2244
2245 if (in->is_auth()) {
2246 // [auth] twiddle mode?
2247 eval(in, CEPH_CAP_LOCKS);
2248
2a845540
TL
2249 int all_allowed = -1, loner_allowed = -1, xlocker_allowed = -1;
2250 int allowed = get_allowed_caps(in, cap, all_allowed, loner_allowed,
2251 xlocker_allowed);
2252
2253 if (_need_flush_mdlog(in, my_want & ~allowed, true))
7c673cae
FG
2254 mds->mdlog->flush();
2255
2256 } else {
2257 // [replica] tell auth about any new caps wanted
2258 request_inode_file_caps(in);
2259 }
2260
2261 // issue caps (pot. incl new one)
2262 //issue_caps(in); // note: _eval above may have done this already...
2263
2264 // re-issue whatever we can
2265 //cap->issue(cap->pending());
2266
9f95a23c 2267 cap->dec_suppress();
7c673cae
FG
2268
2269 return cap;
2270}
2271
7c673cae
FG
2272void Locker::issue_caps_set(set<CInode*>& inset)
2273{
2274 for (set<CInode*>::iterator p = inset.begin(); p != inset.end(); ++p)
2275 issue_caps(*p);
2276}
2277
494da23a
TL
2278class C_Locker_RevokeStaleCap : public LockerContext {
2279 CInode *in;
2280 client_t client;
2281public:
2282 C_Locker_RevokeStaleCap(Locker *l, CInode *i, client_t c) :
2283 LockerContext(l), in(i), client(c) {
2284 in->get(CInode::PIN_PTRWAITER);
2285 }
2286 void finish(int r) override {
2287 locker->revoke_stale_cap(in, client);
2288 in->put(CInode::PIN_PTRWAITER);
2289 }
2290};
2291
2a845540
TL
2292int Locker::get_allowed_caps(CInode *in, Capability *cap,
2293 int &all_allowed, int &loner_allowed,
2294 int &xlocker_allowed)
7c673cae 2295{
2a845540
TL
2296 client_t client = cap->get_client();
2297
7c673cae 2298 // allowed caps are determined by the lock mode.
2a845540
TL
2299 if (all_allowed == -1)
2300 all_allowed = in->get_caps_allowed_by_type(CAP_ANY);
2301 if (loner_allowed == -1)
2302 loner_allowed = in->get_caps_allowed_by_type(CAP_LONER);
2303 if (xlocker_allowed == -1)
2304 xlocker_allowed = in->get_caps_allowed_by_type(CAP_XLOCKER);
7c673cae
FG
2305
2306 client_t loner = in->get_loner();
2307 if (loner >= 0) {
2a845540 2308 dout(7) << "get_allowed_caps loner client." << loner
7c673cae
FG
2309 << " allowed=" << ccap_string(loner_allowed)
2310 << ", xlocker allowed=" << ccap_string(xlocker_allowed)
2311 << ", others allowed=" << ccap_string(all_allowed)
2312 << " on " << *in << dendl;
2313 } else {
2a845540 2314 dout(7) << "get_allowed_caps allowed=" << ccap_string(all_allowed)
7c673cae
FG
2315 << ", xlocker allowed=" << ccap_string(xlocker_allowed)
2316 << " on " << *in << dendl;
2317 }
2318
2a845540
TL
2319 // do not issue _new_ bits when size|mtime is projected
2320 int allowed;
2321 if (loner == client)
2322 allowed = loner_allowed;
2323 else
2324 allowed = all_allowed;
7c673cae 2325
2a845540
TL
2326 // add in any xlocker-only caps (for locks this client is the xlocker for)
2327 allowed |= xlocker_allowed & in->get_xlocker_mask(client);
2328 if (in->is_dir()) {
2329 allowed &= ~CEPH_CAP_ANY_DIR_OPS;
2330 if (allowed & CEPH_CAP_FILE_EXCL)
2331 allowed |= cap->get_lock_cache_allowed();
2332 }
2333
2334 if ((in->get_inode()->inline_data.version != CEPH_INLINE_NONE &&
2335 cap->is_noinline()) ||
2336 (!in->get_inode()->layout.pool_ns.empty() &&
2337 cap->is_nopoolns()))
2338 allowed &= ~(CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR);
2339
2340 return allowed;
2341}
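The three allowed ints are passed by reference and start out as -1 so the per-type computation runs at most once per inode even when a caller loops over every cap; issue_caps() below relies on this. A caller-side sketch of that pattern:

int all_allowed = -1, loner_allowed = -1, xlocker_allowed = -1;
for (auto& [client, cap] : in->client_caps) {
  // the first iteration fills the three caches; later iterations reuse them
  int allowed = get_allowed_caps(in, &cap, all_allowed, loner_allowed,
                                 xlocker_allowed);
  dout(20) << " client." << client << " allowed " << ccap_string(allowed) << dendl;
}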
2342
2343int Locker::issue_caps(CInode *in, Capability *only_cap)
2344{
7c673cae 2345 // count conflicts with
2a845540
TL
2346 int nissued = 0;
2347 int all_allowed = -1, loner_allowed = -1, xlocker_allowed = -1;
2348
2349 ceph_assert(in->is_head());
7c673cae
FG
2350
2351 // client caps
11fdf7f2 2352 map<client_t, Capability>::iterator it;
7c673cae
FG
2353 if (only_cap)
2354 it = in->client_caps.find(only_cap->get_client());
2355 else
2356 it = in->client_caps.begin();
2357 for (; it != in->client_caps.end(); ++it) {
11fdf7f2 2358 Capability *cap = &it->second;
2a845540
TL
2359 int allowed = get_allowed_caps(in, cap, all_allowed, loner_allowed,
2360 xlocker_allowed);
7c673cae
FG
2361 int pending = cap->pending();
2362 int wanted = cap->wanted();
2363
2364 dout(20) << " client." << it->first
2365 << " pending " << ccap_string(pending)
2366 << " allowed " << ccap_string(allowed)
2367 << " wanted " << ccap_string(wanted)
2368 << dendl;
2369
2370 if (!(pending & ~allowed)) {
2371 // skip if suppress or new, and not revocation
494da23a
TL
2372 if (cap->is_new() || cap->is_suppress() || cap->is_stale()) {
2373 dout(20) << " !revoke and new|suppressed|stale, skipping client." << it->first << dendl;
2374 continue;
2375 }
2376 } else {
2377 ceph_assert(!cap->is_new());
2378 if (cap->is_stale()) {
2379 dout(20) << " revoke stale cap from client." << it->first << dendl;
2380 ceph_assert(!cap->is_valid());
2381 cap->issue(allowed & pending, false);
2382 mds->queue_waiter_front(new C_Locker_RevokeStaleCap(this, in, it->first));
7c673cae
FG
2383 continue;
2384 }
92f5a8d4
TL
2385
2386 if (!cap->is_valid() && (pending & ~CEPH_CAP_PIN)) {
2387 // After stale->resume circle, client thinks it only has CEPH_CAP_PIN.
2388 // mds needs to re-issue caps, then do revocation.
2389 long seq = cap->issue(pending, true);
2390
2391 dout(7) << " sending MClientCaps to client." << it->first
2392 << " seq " << seq << " re-issue " << ccap_string(pending) << dendl;
2393
f67539c2
TL
2394 if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_grant);
2395
9f95a23c
TL
2396 auto m = make_message<MClientCaps>(CEPH_CAP_OP_GRANT, in->ino(),
2397 in->find_snaprealm()->inode->ino(),
2398 cap->get_cap_id(), cap->get_last_seq(),
2399 pending, wanted, 0, cap->get_mseq(),
2400 mds->get_osd_epoch_barrier());
92f5a8d4
TL
2401 in->encode_cap_message(m, cap);
2402
2403 mds->send_message_client_counted(m, cap->get_session());
2404 }
7c673cae
FG
2405 }
2406
2407 // notify clients about deleted inode, to make sure they release caps ASAP.
f67539c2 2408 if (in->get_inode()->nlink == 0)
7c673cae
FG
2409 wanted |= CEPH_CAP_LINK_SHARED;
2410
2411 // are there caps that the client _wants_ and can have, but aren't pending?
2412 // or do we need to revoke?
494da23a
TL
2413 if ((pending & ~allowed) || // need to revoke ~allowed caps.
2414 ((wanted & allowed) & ~pending) || // missing wanted+allowed caps
2415 !cap->is_valid()) { // after stale->resume circle
7c673cae
FG
2416 // issue
2417 nissued++;
2418
2419 // include caps that clients generally like, while we're at it.
2420 int likes = in->get_caps_liked();
2421 int before = pending;
2422 long seq;
2423 if (pending & ~allowed)
494da23a 2424 seq = cap->issue((wanted|likes) & allowed & pending, true); // if revoking, don't issue anything new.
7c673cae 2425 else
494da23a 2426 seq = cap->issue((wanted|likes) & allowed, true);
7c673cae
FG
2427 int after = cap->pending();
2428
494da23a
TL
2429 dout(7) << " sending MClientCaps to client." << it->first
2430 << " seq " << seq << " new pending " << ccap_string(after)
2431 << " was " << ccap_string(before) << dendl;
7c673cae 2432
494da23a
TL
2433 int op = (before & ~after) ? CEPH_CAP_OP_REVOKE : CEPH_CAP_OP_GRANT;
2434 if (op == CEPH_CAP_OP_REVOKE) {
f67539c2 2435 if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_revoke);
494da23a
TL
2436 revoking_caps.push_back(&cap->item_revoking_caps);
2437 revoking_caps_by_client[cap->get_client()].push_back(&cap->item_client_revoking_caps);
2438 cap->set_last_revoke_stamp(ceph_clock_now());
2439 cap->reset_num_revoke_warnings();
f67539c2
TL
2440 } else {
2441 if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_grant);
7c673cae 2442 }
494da23a 2443
9f95a23c
TL
2444 auto m = make_message<MClientCaps>(op, in->ino(),
2445 in->find_snaprealm()->inode->ino(),
2446 cap->get_cap_id(), cap->get_last_seq(),
2447 after, wanted, 0, cap->get_mseq(),
2448 mds->get_osd_epoch_barrier());
494da23a
TL
2449 in->encode_cap_message(m, cap);
2450
2451 mds->send_message_client_counted(m, cap->get_session());
7c673cae
FG
2452 }
2453
2454 if (only_cap)
2455 break;
2456 }
2457
81eedcae 2458 return nissued;
7c673cae
FG
2459}
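For orientation, a worked example of the revoke-vs-grant choice above, with illustrative cap values:

// before: the client had Fscr pending (FILE shared, cache, read)
int before = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD;
// after cap->issue(): only Fsc remains pending
int after  = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_CACHE;
int op = (before & ~after) ? CEPH_CAP_OP_REVOKE : CEPH_CAP_OP_GRANT;
// before & ~after == CEPH_CAP_FILE_RD, so this message goes out as a REVOKE and
// the cap is tracked on the revoking lists; had the issue only added bits,
// op would be CEPH_CAP_OP_GRANT.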
2460
2461void Locker::issue_truncate(CInode *in)
2462{
2463 dout(7) << "issue_truncate on " << *in << dendl;
2464
11fdf7f2 2465 for (auto &p : in->client_caps) {
f67539c2 2466 if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_trunc);
11fdf7f2 2467 Capability *cap = &p.second;
9f95a23c 2468 auto m = make_message<MClientCaps>(CEPH_CAP_OP_TRUNC,
11fdf7f2
TL
2469 in->ino(),
2470 in->find_snaprealm()->inode->ino(),
2471 cap->get_cap_id(), cap->get_last_seq(),
2472 cap->pending(), cap->wanted(), 0,
2473 cap->get_mseq(),
2474 mds->get_osd_epoch_barrier());
7c673cae 2475 in->encode_cap_message(m, cap);
11fdf7f2 2476 mds->send_message_client_counted(m, p.first);
7c673cae
FG
2477 }
2478
2479 // should we increase max_size?
2480 if (in->is_auth() && in->is_file())
2481 check_inode_max_size(in);
2482}
2483
494da23a
TL
2484
2485void Locker::revoke_stale_cap(CInode *in, client_t client)
2486{
2487 dout(7) << __func__ << " client." << client << " on " << *in << dendl;
2488 Capability *cap = in->get_client_cap(client);
2489 if (!cap)
2490 return;
2491
2492 if (cap->revoking() & CEPH_CAP_ANY_WR) {
f67539c2
TL
2493 CachedStackStringStream css;
2494 mds->evict_client(client.v, false, g_conf()->mds_session_blocklist_on_timeout, *css, nullptr);
494da23a
TL
2495 return;
2496 }
2497
2498 cap->revoke();
2499
f67539c2 2500 if (in->is_auth() && in->get_inode()->client_ranges.count(cap->get_client()))
494da23a
TL
2501 in->state_set(CInode::STATE_NEEDSRECOVER);
2502
2503 if (in->state_test(CInode::STATE_EXPORTINGCAPS))
2504 return;
2505
2506 if (!in->filelock.is_stable())
2507 eval_gather(&in->filelock);
2508 if (!in->linklock.is_stable())
2509 eval_gather(&in->linklock);
2510 if (!in->authlock.is_stable())
2511 eval_gather(&in->authlock);
2512 if (!in->xattrlock.is_stable())
2513 eval_gather(&in->xattrlock);
2514
2515 if (in->is_auth())
2516 try_eval(in, CEPH_CAP_LOCKS);
2517 else
2518 request_inode_file_caps(in);
2519}
2520
2521bool Locker::revoke_stale_caps(Session *session)
7c673cae 2522{
a8e16298 2523 dout(10) << "revoke_stale_caps for " << session->info.inst.name << dendl;
7c673cae 2524
494da23a
TL
2525 // invalidate all caps
2526 session->inc_cap_gen();
2527
92f5a8d4 2528 bool ret = true;
a8e16298
TL
2529 std::vector<CInode*> to_eval;
2530
2531 for (auto p = session->caps.begin(); !p.end(); ) {
2532 Capability *cap = *p;
2533 ++p;
2534 if (!cap->is_notable()) {
 2535 // the remaining caps are not being revoked, have no writeable range,
 2536 // and want neither exclusive caps nor file read/write. They don't need
 2537 // recovery and don't affect eval_gather()/try_eval()
2538 break;
2539 }
2540
494da23a
TL
2541 int revoking = cap->revoking();
2542 if (!revoking)
a8e16298
TL
2543 continue;
2544
92f5a8d4
TL
2545 if (revoking & CEPH_CAP_ANY_WR) {
2546 ret = false;
2547 break;
2548 }
494da23a
TL
2549
2550 int issued = cap->issued();
a8e16298 2551 CInode *in = cap->get_inode();
7c673cae 2552 dout(10) << " revoking " << ccap_string(issued) << " on " << *in << dendl;
9f95a23c
TL
2553 int revoked = cap->revoke();
2554 if (revoked & CEPH_CAP_ANY_DIR_OPS)
2555 eval_lock_caches(cap);
7c673cae
FG
2556
2557 if (in->is_auth() &&
f67539c2 2558 in->get_inode()->client_ranges.count(cap->get_client()))
7c673cae
FG
2559 in->state_set(CInode::STATE_NEEDSRECOVER);
2560
a8e16298
TL
2561 // eval lock/inode may finish contexts, which may modify other cap's position
2562 // in the session->caps.
2563 to_eval.push_back(in);
7c673cae 2564 }
7c673cae 2565
a8e16298
TL
2566 for (auto in : to_eval) {
2567 if (in->state_test(CInode::STATE_EXPORTINGCAPS))
2568 continue;
2569
2570 if (!in->filelock.is_stable())
2571 eval_gather(&in->filelock);
2572 if (!in->linklock.is_stable())
2573 eval_gather(&in->linklock);
2574 if (!in->authlock.is_stable())
2575 eval_gather(&in->authlock);
2576 if (!in->xattrlock.is_stable())
2577 eval_gather(&in->xattrlock);
2578
2579 if (in->is_auth())
2580 try_eval(in, CEPH_CAP_LOCKS);
2581 else
2582 request_inode_file_caps(in);
7c673cae 2583 }
494da23a 2584
92f5a8d4 2585 return ret;
7c673cae
FG
2586}
2587
2588void Locker::resume_stale_caps(Session *session)
2589{
2590 dout(10) << "resume_stale_caps for " << session->info.inst.name << dendl;
2591
11fdf7f2 2592 bool lazy = session->info.has_feature(CEPHFS_FEATURE_LAZY_CAP_WANTED);
a8e16298 2593 for (xlist<Capability*>::iterator p = session->caps.begin(); !p.end(); ) {
7c673cae 2594 Capability *cap = *p;
a8e16298 2595 ++p;
11fdf7f2 2596 if (lazy && !cap->is_notable())
a8e16298
TL
2597 break; // see revoke_stale_caps()
2598
7c673cae 2599 CInode *in = cap->get_inode();
a8e16298
TL
2600 ceph_assert(in->is_head());
2601 dout(10) << " clearing stale flag on " << *in << dendl;
7c673cae 2602
a8e16298
TL
2603 if (in->state_test(CInode::STATE_EXPORTINGCAPS)) {
2604 // if export succeeds, the cap will be removed. if export fails,
2605 // we need to re-issue the cap if it's not stale.
2606 in->state_set(CInode::STATE_EVALSTALECAPS);
2607 continue;
7c673cae 2608 }
a8e16298
TL
2609
2610 if (!in->is_auth() || !eval(in, CEPH_CAP_LOCKS))
2611 issue_caps(in, cap);
7c673cae
FG
2612 }
2613}
2614
2615void Locker::remove_stale_leases(Session *session)
2616{
2617 dout(10) << "remove_stale_leases for " << session->info.inst.name << dendl;
2618 xlist<ClientLease*>::iterator p = session->leases.begin();
2619 while (!p.end()) {
2620 ClientLease *l = *p;
2621 ++p;
2622 CDentry *parent = static_cast<CDentry*>(l->parent);
2623 dout(15) << " removing lease on " << *parent << dendl;
2624 parent->remove_client_lease(l, this);
2625 }
2626}
2627
2628
2629class C_MDL_RequestInodeFileCaps : public LockerContext {
2630 CInode *in;
2631public:
2632 C_MDL_RequestInodeFileCaps(Locker *l, CInode *i) : LockerContext(l), in(i) {
2633 in->get(CInode::PIN_PTRWAITER);
2634 }
2635 void finish(int r) override {
2636 if (!in->is_auth())
2637 locker->request_inode_file_caps(in);
2638 in->put(CInode::PIN_PTRWAITER);
2639 }
2640};
2641
2642void Locker::request_inode_file_caps(CInode *in)
2643{
11fdf7f2 2644 ceph_assert(!in->is_auth());
7c673cae 2645
f64942e4 2646 int wanted = in->get_caps_wanted() & in->get_caps_allowed_ever() & ~CEPH_CAP_PIN;
7c673cae
FG
2647 if (wanted != in->replica_caps_wanted) {
2648 // wait for single auth
2649 if (in->is_ambiguous_auth()) {
2650 in->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH,
2651 new C_MDL_RequestInodeFileCaps(this, in));
2652 return;
2653 }
2654
2655 mds_rank_t auth = in->authority().first;
2656 if (mds->is_cluster_degraded() &&
2657 mds->mdsmap->get_state(auth) == MDSMap::STATE_REJOIN) {
2658 mds->wait_for_active_peer(auth, new C_MDL_RequestInodeFileCaps(this, in));
2659 return;
2660 }
2661
2662 dout(7) << "request_inode_file_caps " << ccap_string(wanted)
2663 << " was " << ccap_string(in->replica_caps_wanted)
2664 << " on " << *in << " to mds." << auth << dendl;
2665
2666 in->replica_caps_wanted = wanted;
2667
2668 if (!mds->is_cluster_degraded() ||
2669 mds->mdsmap->is_clientreplay_or_active_or_stopping(auth))
9f95a23c 2670 mds->send_message_mds(make_message<MInodeFileCaps>(in->ino(), in->replica_caps_wanted), auth);
7c673cae
FG
2671 }
2672}
2673
9f95a23c 2674void Locker::handle_inode_file_caps(const cref_t<MInodeFileCaps> &m)
7c673cae
FG
2675{
2676 // nobody should be talking to us during recovery.
a8e16298
TL
2677 if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
2678 if (mds->get_want_state() >= MDSMap::STATE_CLIENTREPLAY) {
2679 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2680 return;
2681 }
11fdf7f2 2682 ceph_abort_msg("got unexpected message during recovery");
a8e16298 2683 }
7c673cae
FG
2684
2685 // ok
2686 CInode *in = mdcache->get_inode(m->get_ino());
2687 mds_rank_t from = mds_rank_t(m->get_source().num());
2688
11fdf7f2
TL
2689 ceph_assert(in);
2690 ceph_assert(in->is_auth());
7c673cae
FG
2691
2692 dout(7) << "handle_inode_file_caps replica mds." << from << " wants caps " << ccap_string(m->get_caps()) << " on " << *in << dendl;
2693
f67539c2
TL
2694 if (mds->logger) mds->logger->inc(l_mdss_handle_inode_file_caps);
2695
11fdf7f2 2696 in->set_mds_caps_wanted(from, m->get_caps());
7c673cae
FG
2697
2698 try_eval(in, CEPH_CAP_LOCKS);
7c673cae
FG
2699}
2700
2701
2702class C_MDL_CheckMaxSize : public LockerContext {
2703 CInode *in;
2704 uint64_t new_max_size;
2705 uint64_t newsize;
2706 utime_t mtime;
2707
2708public:
2709 C_MDL_CheckMaxSize(Locker *l, CInode *i, uint64_t _new_max_size,
2710 uint64_t _newsize, utime_t _mtime) :
2711 LockerContext(l), in(i),
2712 new_max_size(_new_max_size), newsize(_newsize), mtime(_mtime)
2713 {
2714 in->get(CInode::PIN_PTRWAITER);
2715 }
2716 void finish(int r) override {
2717 if (in->is_auth())
2718 locker->check_inode_max_size(in, false, new_max_size, newsize, mtime);
2719 in->put(CInode::PIN_PTRWAITER);
2720 }
2721};
2722
f67539c2 2723uint64_t Locker::calc_new_max_size(const CInode::inode_const_ptr &pi, uint64_t size)
31f18b77
FG
2724{
2725 uint64_t new_max = (size + 1) << 1;
11fdf7f2 2726 uint64_t max_inc = g_conf()->mds_client_writeable_range_max_inc_objs;
31f18b77 2727 if (max_inc > 0) {
b32b8144 2728 max_inc *= pi->layout.object_size;
11fdf7f2 2729 new_max = std::min(new_max, size + max_inc);
31f18b77 2730 }
11fdf7f2 2731 return round_up_to(new_max, pi->get_layout_size_increment());
31f18b77 2732}
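A worked example of the growth rule above (a sketch that assumes a 4 MiB object size and a 4 MiB layout size increment, and that mds_client_writeable_range_max_inc_objs is large enough not to clamp the first case):

//   size    = 10 MiB                       = 10485760
//   doubled = (10485760 + 1) << 1          = 20971522
//   new_max = round_up_to(20971522, 4 MiB) = 25165824   (24 MiB)
// If mds_client_writeable_range_max_inc_objs were 1, the increase would first
// be clamped to size + 1 * 4 MiB = 14680064, then rounded up to 16777216 (16 MiB).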
7c673cae 2733
f91f0fd5 2734bool Locker::check_client_ranges(CInode *in, uint64_t size)
7c673cae 2735{
f67539c2 2736 const auto& latest = in->get_projected_inode();
7c673cae 2737 uint64_t ms;
a8e16298 2738 if (latest->has_layout()) {
31f18b77 2739 ms = calc_new_max_size(latest, size);
7c673cae
FG
2740 } else {
2741 // Layout-less directories like ~mds0/, have zero size
2742 ms = 0;
2743 }
2744
f91f0fd5
TL
2745 auto it = latest->client_ranges.begin();
2746 for (auto &p : in->client_caps) {
2747 if ((p.second.issued() | p.second.wanted()) & CEPH_CAP_ANY_FILE_WR) {
2748 if (it == latest->client_ranges.end())
2749 return true;
2750 if (it->first != p.first)
2751 return true;
2752 if (ms > it->second.range.last)
2753 return true;
2754 ++it;
2755 }
2756 }
2757 return it != latest->client_ranges.end();
2758}
2759
2760bool Locker::calc_new_client_ranges(CInode *in, uint64_t size, bool *max_increased)
2761{
f67539c2 2762 const auto& latest = in->get_projected_inode();
f91f0fd5 2763 uint64_t ms;
f67539c2
TL
2764 if (latest->has_layout()) {
2765 ms = calc_new_max_size(latest, size);
f91f0fd5
TL
2766 } else {
2767 // Layout-less directories like ~mds0/, have zero size
2768 ms = 0;
2769 }
2770
f67539c2 2771 auto pi = in->_get_projected_inode();
f91f0fd5
TL
2772 bool updated = false;
2773
7c673cae
FG
2774 // increase ranges as appropriate.
2775 // shrink to 0 if no WR|BUFFER caps issued.
f91f0fd5 2776 auto it = pi->client_ranges.begin();
11fdf7f2
TL
2777 for (auto &p : in->client_caps) {
2778 if ((p.second.issued() | p.second.wanted()) & CEPH_CAP_ANY_FILE_WR) {
f91f0fd5
TL
2779 while (it != pi->client_ranges.end() && it->first < p.first) {
2780 it = pi->client_ranges.erase(it);
2781 updated = true;
2782 }
2783
2784 if (it != pi->client_ranges.end() && it->first == p.first) {
2785 if (ms > it->second.range.last) {
2786 it->second.range.last = ms;
2787 updated = true;
2788 if (max_increased)
2789 *max_increased = true;
2790 }
7c673cae 2791 } else {
f91f0fd5
TL
2792 it = pi->client_ranges.emplace_hint(it, std::piecewise_construct,
2793 std::forward_as_tuple(p.first),
2794 std::forward_as_tuple());
2795 it->second.range.last = ms;
2796 it->second.follows = in->first - 1;
2797 updated = true;
2798 if (max_increased)
2799 *max_increased = true;
7c673cae 2800 }
f91f0fd5
TL
2801 p.second.mark_clientwriteable();
2802 ++it;
a8e16298 2803 } else {
f91f0fd5 2804 p.second.clear_clientwriteable();
7c673cae
FG
2805 }
2806 }
f91f0fd5
TL
2807 while (it != pi->client_ranges.end()) {
2808 it = pi->client_ranges.erase(it);
2809 updated = true;
2810 }
2811 if (updated) {
2812 if (pi->client_ranges.empty())
2813 in->clear_clientwriteable();
2814 else
2815 in->mark_clientwriteable();
2816 }
2817 return updated;
7c673cae
FG
2818}
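Both in->client_caps and pi->client_ranges are std::maps keyed by client_t, so the loop above is a sorted merge: drop ranges for clients that no longer hold writeable caps, grow or insert a range at each match, and erase whatever trails past the last writer. A generic sketch of that reconciliation pattern (the helper and its names are illustrative, not MDS code):

#include <map>

// Keep exactly the keys of `wanted` in `current`, updating values in place.
template <typename K, typename V>
bool reconcile(std::map<K, V>& current, const std::map<K, V>& wanted) {
  bool updated = false;
  auto it = current.begin();
  for (const auto& [key, value] : wanted) {
    while (it != current.end() && it->first < key) {   // stale entry: drop it
      it = current.erase(it);
      updated = true;
    }
    if (it != current.end() && it->first == key) {
      if (it->second != value) { it->second = value; updated = true; }
    } else {
      it = current.emplace_hint(it, key, value);        // new entry
      updated = true;
    }
    ++it;
  }
  while (it != current.end()) {                         // entries past the last key
    it = current.erase(it);
    updated = true;
  }
  return updated;
}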
2819
2820bool Locker::check_inode_max_size(CInode *in, bool force_wrlock,
2821 uint64_t new_max_size, uint64_t new_size,
2822 utime_t new_mtime)
2823{
11fdf7f2
TL
2824 ceph_assert(in->is_auth());
2825 ceph_assert(in->is_file());
7c673cae 2826
f67539c2 2827 const auto& latest = in->get_projected_inode();
7c673cae
FG
2828 uint64_t size = latest->size;
2829 bool update_size = new_size > 0;
7c673cae
FG
2830
2831 if (update_size) {
11fdf7f2
TL
2832 new_size = size = std::max(size, new_size);
2833 new_mtime = std::max(new_mtime, latest->mtime);
7c673cae
FG
2834 if (latest->size == new_size && latest->mtime == new_mtime)
2835 update_size = false;
2836 }
2837
f91f0fd5
TL
2838 bool new_ranges = check_client_ranges(in, std::max(new_max_size, size));
2839 if (!update_size && !new_ranges) {
2840 dout(20) << "check_inode_max_size no-op on " << *in << dendl;
2841 return false;
2842 }
2843
2844 dout(10) << "check_inode_max_size new_ranges " << new_ranges
2845 << " update_size " << update_size
2846 << " on " << *in << dendl;
2847
a8e16298 2848 if (in->is_frozen()) {
f91f0fd5
TL
2849 dout(10) << "check_inode_max_size frozen, waiting on " << *in << dendl;
2850 in->add_waiter(CInode::WAIT_UNFREEZE,
2851 new C_MDL_CheckMaxSize(this, in, new_max_size, new_size, new_mtime));
2852 return false;
a8e16298
TL
2853 } else if (!force_wrlock && !in->filelock.can_wrlock(in->get_loner())) {
2854 // lock?
2855 if (in->filelock.is_stable()) {
2856 if (in->get_target_loner() >= 0)
2857 file_excl(&in->filelock);
2858 else
2859 simple_lock(&in->filelock);
2860 }
f91f0fd5 2861 if (!in->filelock.can_wrlock(in->get_loner())) {
7c673cae 2862 dout(10) << "check_inode_max_size can't wrlock, waiting on " << *in << dendl;
f91f0fd5
TL
2863 in->filelock.add_waiter(SimpleLock::WAIT_STABLE,
2864 new C_MDL_CheckMaxSize(this, in, new_max_size, new_size, new_mtime));
2865 return false;
7c673cae
FG
2866 }
2867 }
2868
2869 MutationRef mut(new MutationImpl());
2870 mut->ls = mds->mdlog->get_current_segment();
2871
f67539c2
TL
2872 auto pi = in->project_inode(mut);
2873 pi.inode->version = in->pre_dirty();
7c673cae 2874
f91f0fd5
TL
2875 bool max_increased = false;
2876 if (new_ranges &&
2877 calc_new_client_ranges(in, std::max(new_max_size, size), &max_increased)) {
2878 dout(10) << "check_inode_max_size client_ranges "
2879 << in->get_previous_projected_inode()->client_ranges
f67539c2 2880 << " -> " << pi.inode->client_ranges << dendl;
7c673cae
FG
2881 }
2882
2883 if (update_size) {
f67539c2
TL
2884 dout(10) << "check_inode_max_size size " << pi.inode->size << " -> " << new_size << dendl;
2885 pi.inode->size = new_size;
2886 pi.inode->rstat.rbytes = new_size;
2887 dout(10) << "check_inode_max_size mtime " << pi.inode->mtime << " -> " << new_mtime << dendl;
2888 pi.inode->mtime = new_mtime;
2889 if (new_mtime > pi.inode->ctime) {
2890 pi.inode->ctime = new_mtime;
2891 if (new_mtime > pi.inode->rstat.rctime)
2892 pi.inode->rstat.rctime = new_mtime;
91327a77 2893 }
7c673cae
FG
2894 }
2895
2896 // use EOpen if the file is still open; otherwise, use EUpdate.
2897 // this is just an optimization to push open files forward into
2898 // newer log segments.
2899 LogEvent *le;
2900 EMetaBlob *metablob;
2901 if (in->is_any_caps_wanted() && in->last == CEPH_NOSNAP) {
2902 EOpen *eo = new EOpen(mds->mdlog);
2903 eo->add_ino(in->ino());
2904 metablob = &eo->metablob;
2905 le = eo;
7c673cae
FG
2906 } else {
2907 EUpdate *eu = new EUpdate(mds->mdlog, "check_inode_max_size");
2908 metablob = &eu->metablob;
2909 le = eu;
2910 }
2911 mds->mdlog->start_entry(le);
f67539c2
TL
2912
2913 mdcache->predirty_journal_parents(mut, metablob, in, 0, PREDIRTY_PRIMARY);
2914 // no cow, here!
2915 CDentry *parent = in->get_projected_parent_dn();
2916 metablob->add_primary_dentry(parent, in, true);
2917 mdcache->journal_dirty_inode(mut.get(), metablob, in);
2918
11fdf7f2 2919 mds->mdlog->submit_entry(le, new C_Locker_FileUpdate_finish(this, in, mut,
9f95a23c 2920 UPDATE_SHAREMAX, ref_t<MClientCaps>()));
7c673cae
FG
2921 wrlock_force(&in->filelock, mut); // wrlock for duration of journal
2922 mut->auth_pin(in);
2923
2924 // make max_size _increase_ timely
2925 if (max_increased)
2926 mds->mdlog->flush();
2927
2928 return true;
2929}
2930
2931
2932void Locker::share_inode_max_size(CInode *in, Capability *only_cap)
2933{
2934 /*
2935 * only share if currently issued a WR cap. if client doesn't have it,
2936 * file_max doesn't matter, and the client will get it if/when they get
2937 * the cap later.
2938 */
2939 dout(10) << "share_inode_max_size on " << *in << dendl;
11fdf7f2 2940 map<client_t, Capability>::iterator it;
7c673cae
FG
2941 if (only_cap)
2942 it = in->client_caps.find(only_cap->get_client());
2943 else
2944 it = in->client_caps.begin();
2945 for (; it != in->client_caps.end(); ++it) {
2946 const client_t client = it->first;
11fdf7f2 2947 Capability *cap = &it->second;
7c673cae
FG
2948 if (cap->is_suppress())
2949 continue;
2950 if (cap->pending() & (CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER)) {
2951 dout(10) << "share_inode_max_size with client." << client << dendl;
f67539c2 2952 if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_grant);
7c673cae 2953 cap->inc_last_seq();
9f95a23c 2954 auto m = make_message<MClientCaps>(CEPH_CAP_OP_GRANT,
11fdf7f2
TL
2955 in->ino(),
2956 in->find_snaprealm()->inode->ino(),
2957 cap->get_cap_id(),
2958 cap->get_last_seq(),
2959 cap->pending(),
2960 cap->wanted(), 0,
2961 cap->get_mseq(),
2962 mds->get_osd_epoch_barrier());
7c673cae
FG
2963 in->encode_cap_message(m, cap);
2964 mds->send_message_client_counted(m, client);
2965 }
2966 if (only_cap)
2967 break;
2968 }
2969}
2970
2a845540 2971bool Locker::_need_flush_mdlog(CInode *in, int wanted, bool lock_state_any)
7c673cae
FG
2972{
2973 /* flush log if caps are wanted by client but corresponding lock is unstable and locked by
2974 * pending mutations. */
2975 if (((wanted & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR|CEPH_CAP_FILE_SHARED|CEPH_CAP_FILE_EXCL)) &&
2a845540 2976 (lock_state_any ? in->filelock.is_locked() : in->filelock.is_unstable_and_locked())) ||
7c673cae 2977 ((wanted & (CEPH_CAP_AUTH_SHARED|CEPH_CAP_AUTH_EXCL)) &&
2a845540 2978 (lock_state_any ? in->authlock.is_locked() : in->authlock.is_unstable_and_locked())) ||
7c673cae 2979 ((wanted & (CEPH_CAP_LINK_SHARED|CEPH_CAP_LINK_EXCL)) &&
2a845540 2980 (lock_state_any ? in->linklock.is_locked() : in->linklock.is_unstable_and_locked())) ||
7c673cae 2981 ((wanted & (CEPH_CAP_XATTR_SHARED|CEPH_CAP_XATTR_EXCL)) &&
2a845540 2982 (lock_state_any ? in->xattrlock.is_locked() : in->xattrlock.is_unstable_and_locked())))
7c673cae
FG
2983 return true;
2984 return false;
2985}
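Call-site sketch, mirroring issue_new_caps() earlier in this file: the caller passes only the cap bits the client wants but cannot be issued yet, so the journal is flushed exactly when a pending log entry is what keeps the corresponding lock (and therefore those caps) unavailable. With lock_state_any=true any locked state counts; with false the lock must also be unstable.

int unissued_wanted = my_want & ~allowed;   // allowed comes from get_allowed_caps()
if (_need_flush_mdlog(in, unissued_wanted, true))
  mds->mdlog->flush();                      // speed up the lock transition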
2986
2987void Locker::adjust_cap_wanted(Capability *cap, int wanted, int issue_seq)
2988{
2989 if (ceph_seq_cmp(issue_seq, cap->get_last_issue()) == 0) {
2990 dout(10) << " wanted " << ccap_string(cap->wanted())
2991 << " -> " << ccap_string(wanted) << dendl;
2992 cap->set_wanted(wanted);
2993 } else if (wanted & ~cap->wanted()) {
2994 dout(10) << " wanted " << ccap_string(cap->wanted())
2995 << " -> " << ccap_string(wanted)
2996 << " (added caps even though we had seq mismatch!)" << dendl;
2997 cap->set_wanted(wanted | cap->wanted());
2998 } else {
2999 dout(10) << " NOT changing wanted " << ccap_string(cap->wanted())
3000 << " -> " << ccap_string(wanted)
3001 << " (issue_seq " << issue_seq << " != last_issue "
3002 << cap->get_last_issue() << ")" << dendl;
3003 return;
3004 }
3005
3006 CInode *cur = cap->get_inode();
3007 if (!cur->is_auth()) {
3008 request_inode_file_caps(cur);
3009 return;
3010 }
3011
11fdf7f2 3012 if (cap->wanted()) {
7c673cae
FG
3013 if (cur->state_test(CInode::STATE_RECOVERING) &&
3014 (cap->wanted() & (CEPH_CAP_FILE_RD |
3015 CEPH_CAP_FILE_WR))) {
3016 mds->mdcache->recovery_queue.prioritize(cur);
3017 }
3018
11fdf7f2
TL
3019 if (mdcache->open_file_table.should_log_open(cur)) {
3020 ceph_assert(cur->last == CEPH_NOSNAP);
7c673cae
FG
3021 EOpen *le = new EOpen(mds->mdlog);
3022 mds->mdlog->start_entry(le);
3023 le->add_clean_inode(cur);
7c673cae
FG
3024 mds->mdlog->submit_entry(le);
3025 }
3026 }
3027}
3028
11fdf7f2
TL
3029void Locker::snapflush_nudge(CInode *in)
3030{
3031 ceph_assert(in->last != CEPH_NOSNAP);
3032 if (in->client_snap_caps.empty())
3033 return;
3034
3035 CInode *head = mdcache->get_inode(in->ino());
3036 // head inode gets unpinned when snapflush starts. It might get trimmed
3037 // before snapflush finishes.
3038 if (!head)
3039 return;
7c673cae 3040
11fdf7f2
TL
3041 ceph_assert(head->is_auth());
3042 if (head->client_need_snapflush.empty())
3043 return;
3044
3045 SimpleLock *hlock = head->get_lock(CEPH_LOCK_IFILE);
3046 if (hlock->get_state() == LOCK_SYNC || !hlock->is_stable()) {
3047 hlock = NULL;
3048 for (int i = 0; i < num_cinode_locks; i++) {
3049 SimpleLock *lock = head->get_lock(cinode_lock_info[i].lock);
3050 if (lock->get_state() != LOCK_SYNC && lock->is_stable()) {
3051 hlock = lock;
3052 break;
3053 }
3054 }
3055 }
3056 if (hlock) {
3057 _rdlock_kick(hlock, true);
3058 } else {
3059 // also, requeue, in case of unstable lock
3060 need_snapflush_inodes.push_back(&in->item_caps);
3061 }
3062}
3063
3064void Locker::mark_need_snapflush_inode(CInode *in)
3065{
3066 ceph_assert(in->last != CEPH_NOSNAP);
3067 if (!in->item_caps.is_on_list()) {
3068 need_snapflush_inodes.push_back(&in->item_caps);
3069 utime_t now = ceph_clock_now();
3070 in->last_dirstat_prop = now;
3071 dout(10) << "mark_need_snapflush_inode " << *in << " - added at " << now << dendl;
3072 }
3073}
7c673cae 3074
494da23a
TL
3075bool Locker::is_revoking_any_caps_from(client_t client)
3076{
3077 auto it = revoking_caps_by_client.find(client);
3078 if (it == revoking_caps_by_client.end())
3079 return false;
3080 return !it->second.empty();
3081}
3082
c07f9fc5 3083void Locker::_do_null_snapflush(CInode *head_in, client_t client, snapid_t last)
7c673cae
FG
3084{
3085 dout(10) << "_do_null_snapflush client." << client << " on " << *head_in << dendl;
c07f9fc5
FG
3086 for (auto p = head_in->client_need_snapflush.begin();
3087 p != head_in->client_need_snapflush.end() && p->first < last; ) {
7c673cae 3088 snapid_t snapid = p->first;
94b18763 3089 auto &clients = p->second;
7c673cae
FG
3090 ++p; // be careful, q loop below depends on this
3091
3092 if (clients.count(client)) {
3093 dout(10) << " doing async NULL snapflush on " << snapid << " from client." << client << dendl;
b32b8144 3094 CInode *sin = mdcache->pick_inode_snap(head_in, snapid - 1);
11fdf7f2
TL
3095 ceph_assert(sin);
3096 ceph_assert(sin->first <= snapid);
9f95a23c 3097 _do_snap_update(sin, snapid, 0, sin->first - 1, client, ref_t<MClientCaps>(), ref_t<MClientCaps>());
7c673cae
FG
3098 head_in->remove_need_snapflush(sin, snapid, client);
3099 }
3100 }
3101}
3102
3103
3104bool Locker::should_defer_client_cap_frozen(CInode *in)
3105{
9f95a23c
TL
3106 if (in->is_frozen())
3107 return true;
3108
7c673cae 3109 /*
9f95a23c
TL
 3110 * This policy needs to be AT LEAST as permissive as the one that lets a
 3111 * client request go forward; otherwise a client request can release
 3112 * something, the release gets deferred, but the request is still processed
 3113 * and then deadlocks because the caps can't be revoked.
7c673cae 3114 *
9f95a23c
TL
3115 * No auth_pin implies that there is no unstable lock and @in is not auth
3116 * pinnned by client request. If parent dirfrag is auth pinned by a lock
3117 * cache, later request from lock cache owner may forcibly auth pin the @in.
7c673cae 3118 */
9f95a23c
TL
3119 if (in->is_freezing() && in->get_num_auth_pins() == 0) {
3120 CDir* dir = in->get_parent_dir();
3121 if (!dir || !dir->is_auth_pinned_by_lock_cache())
3122 return true;
3123 }
3124 return false;
7c673cae
FG
3125}
3126
9f95a23c 3127void Locker::handle_client_caps(const cref_t<MClientCaps> &m)
7c673cae 3128{
7c673cae 3129 client_t client = m->get_source().num();
7c673cae 3130 snapid_t follows = m->get_snap_follows();
11fdf7f2
TL
3131 auto op = m->get_op();
3132 auto dirty = m->get_dirty();
7c673cae 3133 dout(7) << "handle_client_caps "
7c673cae
FG
3134 << " on " << m->get_ino()
3135 << " tid " << m->get_client_tid() << " follows " << follows
11fdf7f2
TL
3136 << " op " << ceph_cap_op_name(op)
3137 << " flags 0x" << std::hex << m->flags << std::dec << dendl;
7c673cae 3138
94b18763 3139 Session *session = mds->get_session(m);
7c673cae
FG
3140 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
3141 if (!session) {
3142 dout(5) << " no session, dropping " << *m << dendl;
7c673cae
FG
3143 return;
3144 }
3145 if (session->is_closed() ||
3146 session->is_closing() ||
3147 session->is_killing()) {
3148 dout(7) << " session closed|closing|killing, dropping " << *m << dendl;
7c673cae
FG
3149 return;
3150 }
11fdf7f2
TL
3151 if ((mds->is_reconnect() || mds->get_want_state() == MDSMap::STATE_RECONNECT) &&
3152 dirty && m->get_client_tid() > 0 &&
7c673cae 3153 !session->have_completed_flush(m->get_client_tid())) {
11fdf7f2
TL
3154 mdcache->set_reconnected_dirty_caps(client, m->get_ino(), dirty,
3155 op == CEPH_CAP_OP_FLUSHSNAP);
7c673cae
FG
3156 }
3157 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
3158 return;
3159 }
3160
f67539c2
TL
3161 if (mds->logger) mds->logger->inc(l_mdss_handle_client_caps);
3162 if (dirty) {
3163 if (mds->logger) mds->logger->inc(l_mdss_handle_client_caps_dirty);
3164 }
3165
7c673cae
FG
3166 if (m->get_client_tid() > 0 && session &&
3167 session->have_completed_flush(m->get_client_tid())) {
3168 dout(7) << "handle_client_caps already flushed tid " << m->get_client_tid()
3169 << " for client." << client << dendl;
9f95a23c 3170 ref_t<MClientCaps> ack;
11fdf7f2 3171 if (op == CEPH_CAP_OP_FLUSHSNAP) {
f67539c2 3172 if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_flushsnap_ack);
9f95a23c 3173 ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP_ACK, m->get_ino(), 0, 0, 0, 0, 0, dirty, 0, mds->get_osd_epoch_barrier());
7c673cae 3174 } else {
f67539c2 3175 if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_flush_ack);
9f95a23c 3176 ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSH_ACK, m->get_ino(), 0, m->get_cap_id(), m->get_seq(), m->get_caps(), 0, dirty, 0, mds->get_osd_epoch_barrier());
7c673cae
FG
3177 }
3178 ack->set_snap_follows(follows);
3179 ack->set_client_tid(m->get_client_tid());
3180 mds->send_message_client_counted(ack, m->get_connection());
11fdf7f2 3181 if (op == CEPH_CAP_OP_FLUSHSNAP) {
7c673cae
FG
3182 return;
3183 } else {
3184 // fall-thru because the message may release some caps
11fdf7f2
TL
3185 dirty = false;
3186 op = CEPH_CAP_OP_UPDATE;
7c673cae
FG
3187 }
3188 }
3189
3190 // "oldest flush tid" > 0 means client uses unique TID for each flush
3191 if (m->get_oldest_flush_tid() > 0 && session) {
3192 if (session->trim_completed_flushes(m->get_oldest_flush_tid())) {
3193 mds->mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
3194
3195 if (session->get_num_trim_flushes_warnings() > 0 &&
11fdf7f2 3196 session->get_num_completed_flushes() * 2 < g_conf()->mds_max_completed_flushes)
7c673cae
FG
3197 session->reset_num_trim_flushes_warnings();
3198 } else {
3199 if (session->get_num_completed_flushes() >=
11fdf7f2 3200 (g_conf()->mds_max_completed_flushes << session->get_num_trim_flushes_warnings())) {
7c673cae 3201 session->inc_num_trim_flushes_warnings();
f67539c2
TL
3202 CachedStackStringStream css;
3203 *css << "client." << session->get_client() << " does not advance its oldest_flush_tid ("
3204 << m->get_oldest_flush_tid() << "), "
3205 << session->get_num_completed_flushes()
3206 << " completed flushes recorded in session";
3207 mds->clog->warn() << css->strv();
3208 dout(20) << __func__ << " " << css->strv() << dendl;
7c673cae
FG
3209 }
3210 }
3211 }
3212
3213 CInode *head_in = mdcache->get_inode(m->get_ino());
3214 if (!head_in) {
3215 if (mds->is_clientreplay()) {
3216 dout(7) << "handle_client_caps on unknown ino " << m->get_ino()
3217 << ", will try again after replayed client requests" << dendl;
3218 mdcache->wait_replay_cap_reconnect(m->get_ino(), new C_MDS_RetryMessage(mds, m));
3219 return;
3220 }
1adf2230
AA
3221
3222 /*
3223 * "handle_client_caps on unknown ino xxx” is normal after migrating a subtree
3224 * Sequence of events that cause this are:
3225 * - client sends caps message to mds.a
3226 * - mds finishes subtree migration, send cap export to client
3227 * - mds trim its cache
3228 * - mds receives cap messages from client
3229 */
3230 dout(7) << "handle_client_caps on unknown ino " << m->get_ino() << ", dropping" << dendl;
7c673cae
FG
3231 return;
3232 }
3233
3234 if (m->osd_epoch_barrier && !mds->objecter->have_map(m->osd_epoch_barrier)) {
3235 // Pause RADOS operations until we see the required epoch
3236 mds->objecter->set_epoch_barrier(m->osd_epoch_barrier);
3237 }
3238
3239 if (mds->get_osd_epoch_barrier() < m->osd_epoch_barrier) {
3240 // Record the barrier so that we will retransmit it to clients
3241 mds->set_osd_epoch_barrier(m->osd_epoch_barrier);
3242 }
3243
b32b8144 3244 dout(10) << " head inode " << *head_in << dendl;
7c673cae
FG
3245
3246 Capability *cap = 0;
b32b8144 3247 cap = head_in->get_client_cap(client);
7c673cae 3248 if (!cap) {
b32b8144 3249 dout(7) << "handle_client_caps no cap for client." << client << " on " << *head_in << dendl;
7c673cae
FG
3250 return;
3251 }
11fdf7f2 3252 ceph_assert(cap);
7c673cae
FG
3253
3254 // freezing|frozen?
b32b8144
FG
3255 if (should_defer_client_cap_frozen(head_in)) {
3256 dout(7) << "handle_client_caps freezing|frozen on " << *head_in << dendl;
3257 head_in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, m));
7c673cae
FG
3258 return;
3259 }
3260 if (ceph_seq_cmp(m->get_mseq(), cap->get_mseq()) < 0) {
3261 dout(7) << "handle_client_caps mseq " << m->get_mseq() << " < " << cap->get_mseq()
3262 << ", dropping" << dendl;
7c673cae
FG
3263 return;
3264 }
3265
11fdf7f2 3266 bool need_unpin = false;
7c673cae
FG
3267
3268 // flushsnap?
3269 if (op == CEPH_CAP_OP_FLUSHSNAP) {
b32b8144
FG
3270 if (!head_in->is_auth()) {
3271 dout(7) << " not auth, ignoring flushsnap on " << *head_in << dendl;
7c673cae
FG
3272 goto out;
3273 }
3274
b32b8144 3275 SnapRealm *realm = head_in->find_snaprealm();
7c673cae
FG
3276 snapid_t snap = realm->get_snap_following(follows);
3277 dout(10) << " flushsnap follows " << follows << " -> snap " << snap << dendl;
3278
11fdf7f2
TL
3279 auto p = head_in->client_need_snapflush.begin();
3280 if (p != head_in->client_need_snapflush.end() && p->first < snap) {
3281 head_in->auth_pin(this); // prevent the subtree from being frozen
3282 need_unpin = true;
3283 _do_null_snapflush(head_in, client, snap);
3284 }
3285
b32b8144
FG
3286 CInode *in = head_in;
3287 if (snap != CEPH_NOSNAP) {
3288 in = mdcache->pick_inode_snap(head_in, snap - 1);
3289 if (in != head_in)
3290 dout(10) << " snapped inode " << *in << dendl;
3291 }
3292
7c673cae
FG
3293 // we can prepare the ack now, since this FLUSHEDSNAP is independent of any
3294 // other cap ops. (except possibly duplicate FLUSHSNAP requests, but worst
3295 // case we get a dup response, so whatever.)
9f95a23c 3296 ref_t<MClientCaps> ack;
11fdf7f2 3297 if (dirty) {
9f95a23c 3298 ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP_ACK, in->ino(), 0, 0, 0, 0, 0, dirty, 0, mds->get_osd_epoch_barrier());
7c673cae
FG
3299 ack->set_snap_follows(follows);
3300 ack->set_client_tid(m->get_client_tid());
3301 ack->set_oldest_flush_tid(m->get_oldest_flush_tid());
3302 }
3303
3304 if (in == head_in ||
3305 (head_in->client_need_snapflush.count(snap) &&
3306 head_in->client_need_snapflush[snap].count(client))) {
3307 dout(7) << " flushsnap snap " << snap
3308 << " client." << client << " on " << *in << dendl;
3309
3310 // this cap now follows a later snap (i.e. the one initiating this flush, or later)
3311 if (in == head_in)
3312 cap->client_follows = snap < CEPH_NOSNAP ? snap : realm->get_newest_seq();
3313
11fdf7f2 3314 _do_snap_update(in, snap, dirty, follows, client, m, ack);
7c673cae
FG
3315
3316 if (in != head_in)
3317 head_in->remove_need_snapflush(in, snap, client);
7c673cae
FG
3318 } else {
3319 dout(7) << " not expecting flushsnap " << snap << " from client." << client << " on " << *in << dendl;
f67539c2
TL
3320 if (ack) {
3321 if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_flushsnap_ack);
7c673cae 3322 mds->send_message_client_counted(ack, m->get_connection());
f67539c2 3323 }
7c673cae
FG
3324 }
3325 goto out;
3326 }
3327
3328 if (cap->get_cap_id() != m->get_cap_id()) {
3329 dout(7) << " ignoring client capid " << m->get_cap_id() << " != my " << cap->get_cap_id() << dendl;
3330 } else {
b32b8144
FG
3331 CInode *in = head_in;
3332 if (follows > 0) {
3333 in = mdcache->pick_inode_snap(head_in, follows);
3334 // intermediate snap inodes
3335 while (in != head_in) {
11fdf7f2
TL
3336 ceph_assert(in->last != CEPH_NOSNAP);
3337 if (in->is_auth() && dirty) {
b32b8144 3338 dout(10) << " updating intermediate snapped inode " << *in << dendl;
9f95a23c 3339 _do_cap_update(in, NULL, dirty, follows, m, ref_t<MClientCaps>());
b32b8144
FG
3340 }
3341 in = mdcache->pick_inode_snap(head_in, in->last);
7c673cae 3342 }
7c673cae
FG
3343 }
3344
3345 // head inode, and cap
9f95a23c 3346 ref_t<MClientCaps> ack;
7c673cae
FG
3347
3348 int caps = m->get_caps();
3349 if (caps & ~cap->issued()) {
3350 dout(10) << " confirming not issued caps " << ccap_string(caps & ~cap->issued()) << dendl;
3351 caps &= cap->issued();
3352 }
3353
9f95a23c 3354 int revoked = cap->confirm_receipt(m->get_seq(), caps);
7c673cae
FG
3355 dout(10) << " follows " << follows
3356 << " retains " << ccap_string(m->get_caps())
11fdf7f2 3357 << " dirty " << ccap_string(dirty)
7c673cae
FG
3358 << " on " << *in << dendl;
3359
9f95a23c
TL
3360 if (revoked & CEPH_CAP_ANY_DIR_OPS)
3361 eval_lock_caches(cap);
7c673cae
FG
3362
3363 // missing/skipped snapflush?
3364 // The client MAY send a snapflush if it is issued WR/EXCL caps, but
3365 // presently only does so when it has actual dirty metadata. But, we
3366 // set up the need_snapflush stuff based on the issued caps.
3367 // We can infer that the client WONT send a FLUSHSNAP once they have
3368 // released all WR/EXCL caps (the FLUSHSNAP always comes before the cap
3369 // update/release).
3370 if (!head_in->client_need_snapflush.empty()) {
11fdf7f2
TL
3371 if (!(cap->issued() & CEPH_CAP_ANY_FILE_WR) &&
3372 !(m->flags & MClientCaps::FLAG_PENDING_CAPSNAP)) {
3373 head_in->auth_pin(this); // prevent the subtree from being frozen
3374 need_unpin = true;
7c673cae
FG
3375 _do_null_snapflush(head_in, client);
3376 } else {
3377 dout(10) << " revocation in progress, not making any conclusions about null snapflushes" << dendl;
3378 }
3379 }
eafe8130
TL
3380 if (cap->need_snapflush() && !(m->flags & MClientCaps::FLAG_PENDING_CAPSNAP))
3381 cap->clear_needsnapflush();
11fdf7f2 3382
11fdf7f2
TL
3383 if (dirty && in->is_auth()) {
3384 dout(7) << " flush client." << client << " dirty " << ccap_string(dirty)
7c673cae 3385 << " seq " << m->get_seq() << " on " << *in << dendl;
9f95a23c 3386 ack = make_message<MClientCaps>(CEPH_CAP_OP_FLUSH_ACK, in->ino(), 0, cap->get_cap_id(), m->get_seq(),
11fdf7f2 3387 m->get_caps(), 0, dirty, 0, mds->get_osd_epoch_barrier());
7c673cae
FG
3388 ack->set_client_tid(m->get_client_tid());
3389 ack->set_oldest_flush_tid(m->get_oldest_flush_tid());
3390 }
3391
3392 // filter wanted based on what we could ever give out (given auth/replica status)
11fdf7f2 3393 bool need_flush = m->flags & MClientCaps::FLAG_SYNC;
f64942e4 3394 int new_wanted = m->get_wanted();
7c673cae 3395 if (new_wanted != cap->wanted()) {
f64942e4 3396 if (!need_flush && in->is_auth() && (new_wanted & ~cap->pending())) {
7c673cae
FG
3397 // expanding caps. make sure we aren't waiting for a log flush
3398 need_flush = _need_flush_mdlog(head_in, new_wanted & ~cap->pending());
3399 }
3400
3401 adjust_cap_wanted(cap, new_wanted, m->get_issue_seq());
3402 }
11fdf7f2 3403
eafe8130
TL
3404 if (in->is_auth() &&
3405 _do_cap_update(in, cap, dirty, follows, m, ack, &need_flush)) {
3406 // updated
7c673cae
FG
3407 eval(in, CEPH_CAP_LOCKS);
3408
3409 if (!need_flush && (cap->wanted() & ~cap->pending()))
3410 need_flush = _need_flush_mdlog(in, cap->wanted() & ~cap->pending());
3411 } else {
3412 // no update, ack now.
f67539c2
TL
3413 if (ack) {
3414 if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_flush_ack);
7c673cae 3415 mds->send_message_client_counted(ack, m->get_connection());
f67539c2 3416 }
7c673cae
FG
3417
3418 bool did_issue = eval(in, CEPH_CAP_LOCKS);
3419 if (!did_issue && (cap->wanted() & ~cap->pending()))
3420 issue_caps(in, cap);
3421
3422 if (cap->get_last_seq() == 0 &&
3423 (cap->pending() & (CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER))) {
7c673cae
FG
3424 share_inode_max_size(in, cap);
3425 }
3426 }
3427
3428 if (need_flush)
3429 mds->mdlog->flush();
3430 }
3431
3432 out:
11fdf7f2
TL
3433 if (need_unpin)
3434 head_in->auth_unpin(this);
7c673cae
FG
3435}
3436
3437
3438class C_Locker_RetryRequestCapRelease : public LockerContext {
3439 client_t client;
3440 ceph_mds_request_release item;
3441public:
3442 C_Locker_RetryRequestCapRelease(Locker *l, client_t c, const ceph_mds_request_release& it) :
3443 LockerContext(l), client(c), item(it) { }
3444 void finish(int r) override {
3445 string dname;
3446 MDRequestRef null_ref;
3447 locker->process_request_cap_release(null_ref, client, item, dname);
3448 }
3449};
3450
3451void Locker::process_request_cap_release(MDRequestRef& mdr, client_t client, const ceph_mds_request_release& item,
11fdf7f2 3452 std::string_view dname)
7c673cae
FG
3453{
3454 inodeno_t ino = (uint64_t)item.ino;
3455 uint64_t cap_id = item.cap_id;
3456 int caps = item.caps;
3457 int wanted = item.wanted;
3458 int seq = item.seq;
3459 int issue_seq = item.issue_seq;
3460 int mseq = item.mseq;
3461
3462 CInode *in = mdcache->get_inode(ino);
3463 if (!in)
3464 return;
3465
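  // if a dentry name accompanies the release, drop the client's lease on that dentry (if any) before handling the cap itself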
3466 if (dname.length()) {
3467 frag_t fg = in->pick_dirfrag(dname);
3468 CDir *dir = in->get_dirfrag(fg);
3469 if (dir) {
3470 CDentry *dn = dir->lookup(dname);
3471 if (dn) {
3472 ClientLease *l = dn->get_client_lease(client);
3473 if (l) {
92f5a8d4 3474 dout(10) << __func__ << " removing lease on " << *dn << dendl;
7c673cae
FG
3475 dn->remove_client_lease(l, this);
3476 } else {
92f5a8d4 3477 dout(7) << __func__ << " client." << client
7c673cae
FG
3478 << " doesn't have lease on " << *dn << dendl;
3479 }
3480 } else {
92f5a8d4 3481 dout(7) << __func__ << " client." << client << " released lease on dn "
7c673cae
FG
3482 << dir->dirfrag() << "/" << dname << " which dne" << dendl;
3483 }
3484 }
3485 }
3486
3487 Capability *cap = in->get_client_cap(client);
3488 if (!cap)
3489 return;
3490
92f5a8d4 3491 dout(10) << __func__ << " client." << client << " " << ccap_string(caps) << " on " << *in
7c673cae
FG
3492 << (mdr ? "" : " (DEFERRED, no mdr)")
3493 << dendl;
3494
3495 if (ceph_seq_cmp(mseq, cap->get_mseq()) < 0) {
3496 dout(7) << " mseq " << mseq << " < " << cap->get_mseq() << ", dropping" << dendl;
3497 return;
3498 }
3499
3500 if (cap->get_cap_id() != cap_id) {
3501 dout(7) << " cap_id " << cap_id << " != " << cap->get_cap_id() << ", dropping" << dendl;
3502 return;
3503 }
3504
3505 if (should_defer_client_cap_frozen(in)) {
3506 dout(7) << " frozen, deferring" << dendl;
3507 in->add_waiter(CInode::WAIT_UNFREEZE, new C_Locker_RetryRequestCapRelease(this, client, item));
3508 return;
3509 }
3510
f67539c2
TL
3511 if (mds->logger) mds->logger->inc(l_mdss_process_request_cap_release);
3512
7c673cae
FG
3513 if (caps & ~cap->issued()) {
3514 dout(10) << " confirming not issued caps " << ccap_string(caps & ~cap->issued()) << dendl;
3515 caps &= cap->issued();
3516 }
9f95a23c
TL
3517 int revoked = cap->confirm_receipt(seq, caps);
3518 if (revoked & CEPH_CAP_ANY_DIR_OPS)
3519 eval_lock_caches(cap);
7c673cae
FG
3520
3521 if (!in->client_need_snapflush.empty() &&
3522 (cap->issued() & CEPH_CAP_ANY_FILE_WR) == 0) {
3523 _do_null_snapflush(in, client);
3524 }
3525
3526 adjust_cap_wanted(cap, wanted, issue_seq);
3527
3528 if (mdr)
3529 cap->inc_suppress();
3530 eval(in, CEPH_CAP_LOCKS);
3531 if (mdr)
3532 cap->dec_suppress();
3533
3534 // take note; we may need to reissue on this cap later
3535 if (mdr)
3536 mdr->cap_releases[in->vino()] = cap->get_last_seq();
3537}
3538
3539class C_Locker_RetryKickIssueCaps : public LockerContext {
3540 CInode *in;
3541 client_t client;
3542 ceph_seq_t seq;
3543public:
3544 C_Locker_RetryKickIssueCaps(Locker *l, CInode *i, client_t c, ceph_seq_t s) :
3545 LockerContext(l), in(i), client(c), seq(s) {
3546 in->get(CInode::PIN_PTRWAITER);
3547 }
3548 void finish(int r) override {
3549 locker->kick_issue_caps(in, client, seq);
3550 in->put(CInode::PIN_PTRWAITER);
3551 }
3552};
3553
3554void Locker::kick_issue_caps(CInode *in, client_t client, ceph_seq_t seq)
3555{
3556 Capability *cap = in->get_client_cap(client);
a8e16298 3557 if (!cap || cap->get_last_seq() != seq)
7c673cae
FG
3558 return;
3559 if (in->is_frozen()) {
3560 dout(10) << "kick_issue_caps waiting for unfreeze on " << *in << dendl;
3561 in->add_waiter(CInode::WAIT_UNFREEZE,
3562 new C_Locker_RetryKickIssueCaps(this, in, client, seq));
3563 return;
3564 }
3565 dout(10) << "kick_issue_caps released at current seq " << seq
3566 << ", reissuing" << dendl;
3567 issue_caps(in, cap);
3568}
3569
3570void Locker::kick_cap_releases(MDRequestRef& mdr)
3571{
3572 client_t client = mdr->get_client();
3573 for (map<vinodeno_t,ceph_seq_t>::iterator p = mdr->cap_releases.begin();
3574 p != mdr->cap_releases.end();
3575 ++p) {
3576 CInode *in = mdcache->get_inode(p->first);
3577 if (!in)
3578 continue;
3579 kick_issue_caps(in, client, p->second);
3580 }
3581}
3582
f38dd50b
TL
3583__u32 Locker::get_xattr_total_length(CInode::mempool_xattr_map &xattr)
3584{
3585 __u32 total = 0;
3586
3587 for (const auto &p : xattr)
3588 total += (p.first.length() + p.second.length());
3589 return total;
3590}
3591
3592void Locker::decode_new_xattrs(CInode::mempool_inode *inode,
3593 CInode::mempool_xattr_map *px,
3594 const cref_t<MClientCaps> &m)
3595{
3596 CInode::mempool_xattr_map tmp;
3597
3598 auto p = m->xattrbl.cbegin();
3599 decode_noshare(tmp, p);
3600 __u32 total = get_xattr_total_length(tmp);
3601 inode->xattr_version = m->head.xattr_version;
3602 if (total > mds->mdsmap->get_max_xattr_size()) {
3603 dout(1) << "Maximum xattr size exceeded: " << total
3604 << " max size: " << mds->mdsmap->get_max_xattr_size() << dendl;
3605 // Ignore new xattr (!!!) but increase xattr version
3606 // XXX how to force the client to drop cached xattrs?
3607 inode->xattr_version++;
3608 } else {
3609 *px = std::move(tmp);
3610 }
3611}
3612
7c673cae
FG
3613/**
3614 * m and ack might be NULL, so don't dereference them unless dirty != 0
3615 */
9f95a23c 3616void Locker::_do_snap_update(CInode *in, snapid_t snap, int dirty, snapid_t follows, client_t client, const cref_t<MClientCaps> &m, const ref_t<MClientCaps> &ack)
7c673cae
FG
3617{
3618 dout(10) << "_do_snap_update dirty " << ccap_string(dirty)
3619 << " follows " << follows << " snap " << snap
3620 << " on " << *in << dendl;
3621
3622 if (snap == CEPH_NOSNAP) {
3623 // hmm, i guess snap was already deleted? just ack!
3624 dout(10) << " wow, the snap following " << follows
3625 << " was already deleted. nothing to record, just ack." << dendl;
f67539c2
TL
3626 if (ack) {
3627 if (ack->get_op() == CEPH_CAP_OP_FLUSHSNAP_ACK) {
3628 if (mds->logger) mds->logger->inc(l_mdss_ceph_cap_op_flushsnap_ack);
3629 }
7c673cae 3630 mds->send_message_client_counted(ack, m->get_connection());
f67539c2 3631 }
7c673cae
FG
3632 return;
3633 }
3634
3635 EUpdate *le = new EUpdate(mds->mdlog, "snap flush");
3636 mds->mdlog->start_entry(le);
3637 MutationRef mut = new MutationImpl();
3638 mut->ls = mds->mdlog->get_current_segment();
3639
3640 // normal metadata updates that we can apply to the head as well.
3641
3642 // update xattrs?
94b18763
FG
3643 CInode::mempool_xattr_map *px = nullptr;
3644 bool xattrs = (dirty & CEPH_CAP_XATTR_EXCL) &&
3645 m->xattrbl.length() &&
3646 m->head.xattr_version > in->get_projected_inode()->xattr_version;
3647
f67539c2
TL
3648 CInode::mempool_old_inode *oi = nullptr;
3649 CInode::old_inode_map_ptr _old_inodes;
3650 if (in->is_any_old_inodes()) {
3651 auto last = in->pick_old_inode(snap);
3652 if (last) {
3653 _old_inodes = CInode::allocate_old_inode_map(*in->get_old_inodes());
3654 oi = &_old_inodes->at(last);
3655 if (snap > oi->first) {
3656 (*_old_inodes)[snap - 1] = *oi;
3657 oi->first = snap;
3658 }
3659 }
7c673cae
FG
3660 }
3661
94b18763 3662 CInode::mempool_inode *i;
7c673cae
FG
3663 if (oi) {
3664 dout(10) << " writing into old inode" << dendl;
f67539c2
TL
3665 auto pi = in->project_inode(mut);
3666 pi.inode->version = in->pre_dirty();
94b18763 3667 i = &oi->inode;
7c673cae
FG
3668 if (xattrs)
3669 px = &oi->xattrs;
3670 } else {
f67539c2
TL
3671 auto pi = in->project_inode(mut, xattrs);
3672 pi.inode->version = in->pre_dirty();
3673 i = pi.inode.get();
7c673cae 3674 if (xattrs)
94b18763 3675 px = pi.xattrs.get();
7c673cae
FG
3676 }
3677
94b18763 3678 _update_cap_fields(in, dirty, m, i);
7c673cae
FG
3679
3680 // xattr
94b18763
FG
3681 if (xattrs) {
3682 dout(7) << " xattrs v" << i->xattr_version << " -> " << m->head.xattr_version
f38dd50b
TL
3683 << " len " << m->xattrbl.length() << dendl;
3684 decode_new_xattrs(i, px, m);
7c673cae
FG
3685 }
3686
94b18763
FG
3687 {
3688 auto it = i->client_ranges.find(client);
3689 if (it != i->client_ranges.end()) {
3690 if (in->last == snap) {
3691 dout(10) << " removing client_range entirely" << dendl;
3692 i->client_ranges.erase(it);
3693 } else {
3694 dout(10) << " client_range now follows " << snap << dendl;
3695 it->second.follows = snap;
3696 }
7c673cae
FG
3697 }
3698 }
3699
f67539c2
TL
3700 if (_old_inodes)
3701 in->reset_old_inodes(std::move(_old_inodes));
3702
7c673cae
FG
3703 mut->auth_pin(in);
3704 mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY, 0, follows);
3705 mdcache->journal_dirty_inode(mut.get(), &le->metablob, in, follows);
3706
3707 // "oldest flush tid" > 0 means client uses unique TID for each flush
3708 if (ack && ack->get_oldest_flush_tid() > 0)
3709 le->metablob.add_client_flush(metareqid_t(m->get_source(), ack->get_client_tid()),
3710 ack->get_oldest_flush_tid());
3711
11fdf7f2
TL
3712 mds->mdlog->submit_entry(le, new C_Locker_FileUpdate_finish(this, in, mut, UPDATE_SNAPFLUSH,
3713 ack, client));
7c673cae
FG
3714}
3715
9f95a23c 3716void Locker::_update_cap_fields(CInode *in, int dirty, const cref_t<MClientCaps> &m, CInode::mempool_inode *pi)
7c673cae
FG
3717{
3718 if (dirty == 0)
3719 return;
3720
3721 /* m must be valid if there are dirty caps */
11fdf7f2 3722 ceph_assert(m);
7c673cae
FG
3723 uint64_t features = m->get_connection()->get_features();
3724
3725 if (m->get_ctime() > pi->ctime) {
3726 dout(7) << " ctime " << pi->ctime << " -> " << m->get_ctime()
3727 << " for " << *in << dendl;
91327a77
AA
3728 pi->ctime = m->get_ctime();
3729 if (m->get_ctime() > pi->rstat.rctime)
3730 pi->rstat.rctime = m->get_ctime();
7c673cae
FG
3731 }
3732
3733 if ((features & CEPH_FEATURE_FS_CHANGE_ATTR) &&
3734 m->get_change_attr() > pi->change_attr) {
3735 dout(7) << " change_attr " << pi->change_attr << " -> " << m->get_change_attr()
3736 << " for " << *in << dendl;
3737 pi->change_attr = m->get_change_attr();
3738 }
3739
3740 // file
3741 if (dirty & (CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) {
3742 utime_t atime = m->get_atime();
3743 utime_t mtime = m->get_mtime();
3744 uint64_t size = m->get_size();
3745 version_t inline_version = m->inline_version;
3746
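    // EXCL lets the client set mtime to an arbitrary value; with only WR dirty it may only move mtime forward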
3747 if (((dirty & CEPH_CAP_FILE_WR) && mtime > pi->mtime) ||
3748 ((dirty & CEPH_CAP_FILE_EXCL) && mtime != pi->mtime)) {
3749 dout(7) << " mtime " << pi->mtime << " -> " << mtime
3750 << " for " << *in << dendl;
3751 pi->mtime = mtime;
91327a77
AA
3752 if (mtime > pi->rstat.rctime)
3753 pi->rstat.rctime = mtime;
7c673cae 3754 }
f67539c2 3755 if (in->is_file() && // ONLY if regular file
7c673cae
FG
3756 size > pi->size) {
3757 dout(7) << " size " << pi->size << " -> " << size
3758 << " for " << *in << dendl;
3759 pi->size = size;
3760 pi->rstat.rbytes = size;
3761 }
f67539c2 3762 if (in->is_file() &&
7c673cae
FG
3763 (dirty & CEPH_CAP_FILE_WR) &&
3764 inline_version > pi->inline_data.version) {
3765 pi->inline_data.version = inline_version;
3766 if (inline_version != CEPH_INLINE_NONE && m->inline_data.length() > 0)
f67539c2 3767 pi->inline_data.set_data(m->inline_data);
7c673cae
FG
3768 else
3769 pi->inline_data.free_data();
3770 }
3771 if ((dirty & CEPH_CAP_FILE_EXCL) && atime != pi->atime) {
3772 dout(7) << " atime " << pi->atime << " -> " << atime
3773 << " for " << *in << dendl;
3774 pi->atime = atime;
3775 }
3776 if ((dirty & CEPH_CAP_FILE_EXCL) &&
3777 ceph_seq_cmp(pi->time_warp_seq, m->get_time_warp_seq()) < 0) {
3778 dout(7) << " time_warp_seq " << pi->time_warp_seq << " -> " << m->get_time_warp_seq()
3779 << " for " << *in << dendl;
3780 pi->time_warp_seq = m->get_time_warp_seq();
3781 }
1e59de90
TL
3782 if (m->fscrypt_file.size())
3783 pi->fscrypt_file = m->fscrypt_file;
7c673cae
FG
3784 }
3785 // auth
3786 if (dirty & CEPH_CAP_AUTH_EXCL) {
3787 if (m->head.uid != pi->uid) {
3788 dout(7) << " uid " << pi->uid
3789 << " -> " << m->head.uid
3790 << " for " << *in << dendl;
3791 pi->uid = m->head.uid;
3792 }
3793 if (m->head.gid != pi->gid) {
3794 dout(7) << " gid " << pi->gid
3795 << " -> " << m->head.gid
3796 << " for " << *in << dendl;
3797 pi->gid = m->head.gid;
3798 }
3799 if (m->head.mode != pi->mode) {
3800 dout(7) << " mode " << oct << pi->mode
3801 << " -> " << m->head.mode << dec
3802 << " for " << *in << dendl;
3803 pi->mode = m->head.mode;
3804 }
3805 if ((features & CEPH_FEATURE_FS_BTIME) && m->get_btime() != pi->btime) {
3806 dout(7) << " btime " << oct << pi->btime
3807 << " -> " << m->get_btime() << dec
3808 << " for " << *in << dendl;
3809 pi->btime = m->get_btime();
3810 }
1e59de90
TL
3811 if (m->fscrypt_auth.size())
3812 pi->fscrypt_auth = m->fscrypt_auth;
7c673cae
FG
3813 }
3814}
3815
3816/*
3817 * update inode based on cap flush|flushsnap|wanted.
3818 * adjust max_size, if needed.
3819 * if we update, return true; otherwise, false (no update needed).
3820 */
3821bool Locker::_do_cap_update(CInode *in, Capability *cap,
3822 int dirty, snapid_t follows,
9f95a23c 3823 const cref_t<MClientCaps> &m, const ref_t<MClientCaps> &ack,
7c673cae
FG
3824 bool *need_flush)
3825{
3826 dout(10) << "_do_cap_update dirty " << ccap_string(dirty)
3827 << " issued " << ccap_string(cap ? cap->issued() : 0)
3828 << " wanted " << ccap_string(cap ? cap->wanted() : 0)
3829 << " on " << *in << dendl;
11fdf7f2 3830 ceph_assert(in->is_auth());
7c673cae 3831 client_t client = m->get_source().num();
f67539c2 3832 const auto& latest = in->get_projected_inode();
7c673cae
FG
3833
3834 // increase or zero max_size?
3835 uint64_t size = m->get_size();
3836 bool change_max = false;
f91f0fd5 3837 uint64_t old_max = latest->get_client_range(client);
7c673cae
FG
3838 uint64_t new_max = old_max;
3839
3840 if (in->is_file()) {
3841 bool forced_change_max = false;
3842 dout(20) << "inode is file" << dendl;
3843 if (cap && ((cap->issued() | cap->wanted()) & CEPH_CAP_ANY_FILE_WR)) {
3844 dout(20) << "client has write caps; m->get_max_size="
3845 << m->get_max_size() << "; old_max=" << old_max << dendl;
3846 if (m->get_max_size() > new_max) {
3847 dout(10) << "client requests file_max " << m->get_max_size()
3848 << " > max " << old_max << dendl;
3849 change_max = true;
3850 forced_change_max = true;
31f18b77 3851 new_max = calc_new_max_size(latest, m->get_max_size());
7c673cae 3852 } else {
31f18b77 3853 new_max = calc_new_max_size(latest, size);
7c673cae
FG
3854
3855 if (new_max > old_max)
3856 change_max = true;
3857 else
3858 new_max = old_max;
3859 }
3860 } else {
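      // client neither holds nor wants write caps; retire its client_range by zeroing max_size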
3861 if (old_max) {
3862 change_max = true;
3863 new_max = 0;
3864 }
3865 }
3866
3867 if (in->last == CEPH_NOSNAP &&
3868 change_max &&
3869 !in->filelock.can_wrlock(client) &&
3870 !in->filelock.can_force_wrlock(client)) {
3871 dout(10) << " i want to change file_max, but lock won't allow it (yet)" << dendl;
3872 if (in->filelock.is_stable()) {
3873 bool need_issue = false;
3874 if (cap)
3875 cap->inc_suppress();
11fdf7f2 3876 if (in->get_mds_caps_wanted().empty() &&
7c673cae
FG
3877 (in->get_loner() >= 0 || (in->get_wanted_loner() >= 0 && in->try_set_loner()))) {
3878 if (in->filelock.get_state() != LOCK_EXCL)
3879 file_excl(&in->filelock, &need_issue);
3880 } else
3881 simple_lock(&in->filelock, &need_issue);
3882 if (need_issue)
3883 issue_caps(in);
3884 if (cap)
3885 cap->dec_suppress();
3886 }
3887 if (!in->filelock.can_wrlock(client) &&
3888 !in->filelock.can_force_wrlock(client)) {
3889 C_MDL_CheckMaxSize *cms = new C_MDL_CheckMaxSize(this, in,
3890 forced_change_max ? new_max : 0,
3891 0, utime_t());
3892
3893 in->filelock.add_waiter(SimpleLock::WAIT_STABLE, cms);
3894 change_max = false;
3895 }
3896 }
3897 }
3898
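  // the flock bufferlist carries two encoded lock lists: fcntl (POSIX) locks first, then flock locks; replay them into the inode's lock state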
3899 if (m->flockbl.length()) {
3900 int32_t num_locks;
11fdf7f2
TL
3901 auto bli = m->flockbl.cbegin();
3902 decode(num_locks, bli);
7c673cae
FG
3903 for ( int i=0; i < num_locks; ++i) {
3904 ceph_filelock decoded_lock;
11fdf7f2 3905 decode(decoded_lock, bli);
7c673cae
FG
3906 in->get_fcntl_lock_state()->held_locks.
3907 insert(pair<uint64_t, ceph_filelock>(decoded_lock.start, decoded_lock));
3908 ++in->get_fcntl_lock_state()->client_held_lock_counts[(client_t)(decoded_lock.client)];
3909 }
11fdf7f2 3910 decode(num_locks, bli);
7c673cae
FG
3911 for ( int i=0; i < num_locks; ++i) {
3912 ceph_filelock decoded_lock;
11fdf7f2 3913 decode(decoded_lock, bli);
7c673cae
FG
3914 in->get_flock_lock_state()->held_locks.
3915 insert(pair<uint64_t, ceph_filelock>(decoded_lock.start, decoded_lock));
3916 ++in->get_flock_lock_state()->client_held_lock_counts[(client_t)(decoded_lock.client)];
3917 }
3918 }
3919
3920 if (!dirty && !change_max)
3921 return false;
3922
7c673cae
FG
3923 // do the update.
3924 EUpdate *le = new EUpdate(mds->mdlog, "cap update");
3925 mds->mdlog->start_entry(le);
3926
94b18763
FG
3927 bool xattr = (dirty & CEPH_CAP_XATTR_EXCL) &&
3928 m->xattrbl.length() &&
3929 m->head.xattr_version > in->get_projected_inode()->xattr_version;
7c673cae 3930
7c673cae
FG
3931 MutationRef mut(new MutationImpl());
3932 mut->ls = mds->mdlog->get_current_segment();
3933
f67539c2
TL
3934 auto pi = in->project_inode(mut, xattr);
3935 pi.inode->version = in->pre_dirty();
3936
3937 _update_cap_fields(in, dirty, m, pi.inode.get());
7c673cae
FG
3938
3939 if (change_max) {
3940 dout(7) << " max_size " << old_max << " -> " << new_max
3941 << " for " << *in << dendl;
3942 if (new_max) {
f67539c2 3943 auto &cr = pi.inode->client_ranges[client];
94b18763
FG
3944 cr.range.first = 0;
3945 cr.range.last = new_max;
3946 cr.follows = in->first - 1;
f91f0fd5 3947 in->mark_clientwriteable();
a8e16298
TL
3948 if (cap)
3949 cap->mark_clientwriteable();
3950 } else {
f67539c2
TL
3951 pi.inode->client_ranges.erase(client);
3952 if (pi.inode->client_ranges.empty())
f91f0fd5 3953 in->clear_clientwriteable();
a8e16298
TL
3954 if (cap)
3955 cap->clear_clientwriteable();
3956 }
7c673cae
FG
3957 }
3958
3959 if (change_max || (dirty & (CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)))
3960 wrlock_force(&in->filelock, mut); // wrlock for duration of journal
3961
3962 // auth
3963 if (dirty & CEPH_CAP_AUTH_EXCL)
3964 wrlock_force(&in->authlock, mut);
3965
94b18763
FG
3966 // xattrs update?
3967 if (xattr) {
f67539c2 3968 dout(7) << " xattrs v" << pi.inode->xattr_version << " -> " << m->head.xattr_version << dendl;
f38dd50b 3969 decode_new_xattrs(pi.inode.get(), pi.xattrs.get(), m);
7c673cae
FG
3970 wrlock_force(&in->xattrlock, mut);
3971 }
3972
3973 mut->auth_pin(in);
3974 mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY, 0, follows);
3975 mdcache->journal_dirty_inode(mut.get(), &le->metablob, in, follows);
3976
3977 // "oldest flush tid" > 0 means client uses unique TID for each flush
3978 if (ack && ack->get_oldest_flush_tid() > 0)
3979 le->metablob.add_client_flush(metareqid_t(m->get_source(), ack->get_client_tid()),
3980 ack->get_oldest_flush_tid());
3981
11fdf7f2
TL
3982 unsigned update_flags = 0;
3983 if (change_max)
3984 update_flags |= UPDATE_SHAREMAX;
3985 if (cap)
3986 update_flags |= UPDATE_NEEDSISSUE;
3987 mds->mdlog->submit_entry(le, new C_Locker_FileUpdate_finish(this, in, mut, update_flags,
3988 ack, client));
7c673cae
FG
3989 if (need_flush && !*need_flush &&
3990 ((change_max && new_max) || // max INCREASE
3991 _need_flush_mdlog(in, dirty)))
3992 *need_flush = true;
3993
3994 return true;
3995}
3996
9f95a23c 3997void Locker::handle_client_cap_release(const cref_t<MClientCapRelease> &m)
7c673cae
FG
3998{
3999 client_t client = m->get_source().num();
4000 dout(10) << "handle_client_cap_release " << *m << dendl;
4001
4002 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
4003 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
4004 return;
4005 }
4006
f67539c2
TL
4007 if (mds->logger) mds->logger->inc(l_mdss_handle_client_cap_release);
4008
7c673cae
FG
4009 if (m->osd_epoch_barrier && !mds->objecter->have_map(m->osd_epoch_barrier)) {
4010 // Pause RADOS operations until we see the required epoch
4011 mds->objecter->set_epoch_barrier(m->osd_epoch_barrier);
4012 }
4013
4014 if (mds->get_osd_epoch_barrier() < m->osd_epoch_barrier) {
4015 // Record the barrier so that we will retransmit it to clients
4016 mds->set_osd_epoch_barrier(m->osd_epoch_barrier);
4017 }
4018
94b18763 4019 Session *session = mds->get_session(m);
7c673cae 4020
11fdf7f2
TL
4021 for (const auto &cap : m->caps) {
4022 _do_cap_release(client, inodeno_t((uint64_t)cap.ino) , cap.cap_id, cap.migrate_seq, cap.seq);
7c673cae
FG
4023 }
4024
4025 if (session) {
4026 session->notify_cap_release(m->caps.size());
4027 }
7c673cae
FG
4028}
4029
4030class C_Locker_RetryCapRelease : public LockerContext {
4031 client_t client;
4032 inodeno_t ino;
4033 uint64_t cap_id;
4034 ceph_seq_t migrate_seq;
4035 ceph_seq_t issue_seq;
4036public:
4037 C_Locker_RetryCapRelease(Locker *l, client_t c, inodeno_t i, uint64_t id,
4038 ceph_seq_t mseq, ceph_seq_t seq) :
4039 LockerContext(l), client(c), ino(i), cap_id(id), migrate_seq(mseq), issue_seq(seq) {}
4040 void finish(int r) override {
4041 locker->_do_cap_release(client, ino, cap_id, migrate_seq, issue_seq);
4042 }
4043};
4044
4045void Locker::_do_cap_release(client_t client, inodeno_t ino, uint64_t cap_id,
4046 ceph_seq_t mseq, ceph_seq_t seq)
4047{
4048 CInode *in = mdcache->get_inode(ino);
4049 if (!in) {
4050 dout(7) << "_do_cap_release missing ino " << ino << dendl;
4051 return;
4052 }
4053 Capability *cap = in->get_client_cap(client);
4054 if (!cap) {
4055 dout(7) << "_do_cap_release no cap for client" << client << " on "<< *in << dendl;
4056 return;
4057 }
4058
4059 dout(7) << "_do_cap_release for client." << client << " on "<< *in << dendl;
4060 if (cap->get_cap_id() != cap_id) {
4061 dout(7) << " capid " << cap_id << " != " << cap->get_cap_id() << ", ignore" << dendl;
4062 return;
4063 }
4064 if (ceph_seq_cmp(mseq, cap->get_mseq()) < 0) {
4065 dout(7) << " mseq " << mseq << " < " << cap->get_mseq() << ", ignore" << dendl;
4066 return;
4067 }
4068 if (should_defer_client_cap_frozen(in)) {
4069 dout(7) << " freezing|frozen, deferring" << dendl;
4070 in->add_waiter(CInode::WAIT_UNFREEZE,
4071 new C_Locker_RetryCapRelease(this, client, ino, cap_id, mseq, seq));
4072 return;
4073 }
4074 if (seq != cap->get_last_issue()) {
4075 dout(7) << " issue_seq " << seq << " != " << cap->get_last_issue() << dendl;
4076 // clean out any old revoke history
4077 cap->clean_revoke_from(seq);
4078 eval_cap_gather(in);
4079 return;
4080 }
a8e16298 4081 remove_client_cap(in, cap);
7c673cae
FG
4082}
4083
494da23a 4084void Locker::remove_client_cap(CInode *in, Capability *cap, bool kill)
7c673cae 4085{
a8e16298 4086 client_t client = cap->get_client();
7c673cae
FG
4087 // clean out any pending snapflush state
4088 if (!in->client_need_snapflush.empty())
4089 _do_null_snapflush(in, client);
4090
9f95a23c
TL
4091 while (!cap->lock_caches.empty()) {
4092 MDLockCache* lock_cache = cap->lock_caches.front();
4093 lock_cache->client_cap = nullptr;
4094 invalidate_lock_cache(lock_cache);
4095 }
4096
a8e16298 4097 bool notable = cap->is_notable();
7c673cae 4098 in->remove_client_cap(client);
a8e16298
TL
4099 if (!notable)
4100 return;
7c673cae
FG
4101
4102 if (in->is_auth()) {
4103 // make sure we clear out the client byte range
4104 if (in->get_projected_inode()->client_ranges.count(client) &&
f67539c2 4105 !(in->get_inode()->nlink == 0 && !in->is_any_caps())) { // unless it's unlink + stray
494da23a
TL
4106 if (kill)
4107 in->state_set(CInode::STATE_NEEDSRECOVER);
4108 else
4109 check_inode_max_size(in);
4110 }
7c673cae
FG
4111 } else {
4112 request_inode_file_caps(in);
4113 }
4114
4115 try_eval(in, CEPH_CAP_LOCKS);
4116}
4117
4118
4119/**
4120 * Return true if any currently revoking caps exceed the
f64942e4 4121 * session_timeout threshold.
7c673cae 4122 */
91327a77
AA
4123bool Locker::any_late_revoking_caps(xlist<Capability*> const &revoking,
4124 double timeout) const
7c673cae
FG
4125{
4126 xlist<Capability*>::const_iterator p = revoking.begin();
4127 if (p.end()) {
4128 // No revoking caps at the moment
4129 return false;
4130 } else {
4131 utime_t now = ceph_clock_now();
4132 utime_t age = now - (*p)->get_last_revoke_stamp();
91327a77 4133 if (age <= timeout) {
7c673cae
FG
4134 return false;
4135 } else {
4136 return true;
4137 }
4138 }
4139}
4140
9f95a23c 4141std::set<client_t> Locker::get_late_revoking_clients(double timeout) const
7c673cae 4142{
9f95a23c 4143 std::set<client_t> result;
7c673cae 4144
9f95a23c
TL
4145 if (any_late_revoking_caps(revoking_caps, timeout)) {
4146 // Slow path: execute in O(N_clients)
4147 for (auto &p : revoking_caps_by_client) {
4148 if (any_late_revoking_caps(p.second, timeout)) {
4149 result.insert(p.first);
4150 }
7c673cae 4151 }
9f95a23c
TL
4152 } else {
4153 // Fast path: no misbehaving clients, execute in O(1)
7c673cae 4154 }
9f95a23c 4155 return result;
7c673cae
FG
4156}
4157
4158// Hard-code instead of surfacing a config setting because this is
4159// really a hack that should go away at some point when we have better
4160// inspection tools for getting at detailed cap state (#7316)
4161#define MAX_WARN_CAPS 100
4162
4163void Locker::caps_tick()
4164{
4165 utime_t now = ceph_clock_now();
4166
11fdf7f2
TL
4167 if (!need_snapflush_inodes.empty()) {
4168 // snap inodes that need flush are auth pinned; they affect
4169 // subtree/dirfrag freeze.
4170 utime_t cutoff = now;
4171 cutoff -= g_conf()->mds_freeze_tree_timeout / 3;
4172
4173 CInode *last = need_snapflush_inodes.back();
4174 while (!need_snapflush_inodes.empty()) {
4175 CInode *in = need_snapflush_inodes.front();
4176 if (in->last_dirstat_prop >= cutoff)
4177 break;
4178 in->item_caps.remove_myself();
4179 snapflush_nudge(in);
4180 if (in == last)
4181 break;
4182 }
4183 }
4184
7c673cae
FG
4185 dout(20) << __func__ << " " << revoking_caps.size() << " revoking caps" << dendl;
4186
11fdf7f2
TL
4187 now = ceph_clock_now();
4188 int n = 0;
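  // the early break below relies on revoking_caps being ordered oldest-first by last revoke stamp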
7c673cae
FG
4189 for (xlist<Capability*>::iterator p = revoking_caps.begin(); !p.end(); ++p) {
4190 Capability *cap = *p;
4191
4192 utime_t age = now - cap->get_last_revoke_stamp();
11fdf7f2 4193 dout(20) << __func__ << " age = " << age << " client." << cap->get_client() << "." << cap->get_inode()->ino() << dendl;
f64942e4
AA
4194 if (age <= mds->mdsmap->get_session_timeout()) {
4195 dout(20) << __func__ << " age below timeout " << mds->mdsmap->get_session_timeout() << dendl;
7c673cae
FG
4196 break;
4197 } else {
11fdf7f2
TL
4198 ++n;
4199 if (n > MAX_WARN_CAPS) {
7c673cae
FG
4200 dout(1) << __func__ << " more than " << MAX_WARN_CAPS << " caps are late"
4201 << "revoking, ignoring subsequent caps" << dendl;
4202 break;
4203 }
4204 }
4205 // exponential backoff of warning intervals
f64942e4 4206 if (age > mds->mdsmap->get_session_timeout() * (1 << cap->get_num_revoke_warnings())) {
7c673cae 4207 cap->inc_num_revoke_warnings();
f67539c2
TL
4208 CachedStackStringStream css;
4209 *css << "client." << cap->get_client() << " isn't responding to mclientcaps(revoke), ino "
4210 << cap->get_inode()->ino() << " pending " << ccap_string(cap->pending())
4211 << " issued " << ccap_string(cap->issued()) << ", sent " << age << " seconds ago";
4212 mds->clog->warn() << css->strv();
4213 dout(20) << __func__ << " " << css->strv() << dendl;
7c673cae 4214 } else {
11fdf7f2 4215 dout(20) << __func__ << " silencing log message (backoff) for " << "client." << cap->get_client() << "." << cap->get_inode()->ino() << dendl;
7c673cae
FG
4216 }
4217 }
4218}
4219
4220
9f95a23c 4221void Locker::handle_client_lease(const cref_t<MClientLease> &m)
7c673cae
FG
4222{
4223 dout(10) << "handle_client_lease " << *m << dendl;
4224
11fdf7f2 4225 ceph_assert(m->get_source().is_client());
7c673cae
FG
4226 client_t client = m->get_source().num();
4227
4228 CInode *in = mdcache->get_inode(m->get_ino(), m->get_last());
4229 if (!in) {
4230 dout(7) << "handle_client_lease don't have ino " << m->get_ino() << "." << m->get_last() << dendl;
7c673cae
FG
4231 return;
4232 }
4233 CDentry *dn = 0;
4234
4235 frag_t fg = in->pick_dirfrag(m->dname);
4236 CDir *dir = in->get_dirfrag(fg);
4237 if (dir)
4238 dn = dir->lookup(m->dname);
4239 if (!dn) {
4240 dout(7) << "handle_client_lease don't have dn " << m->get_ino() << " " << m->dname << dendl;
7c673cae
FG
4241 return;
4242 }
4243 dout(10) << " on " << *dn << dendl;
4244
4245 // replica and lock
4246 ClientLease *l = dn->get_client_lease(client);
4247 if (!l) {
4248 dout(7) << "handle_client_lease didn't have lease for client." << client << " of " << *dn << dendl;
7c673cae
FG
4249 return;
4250 }
4251
4252 switch (m->get_action()) {
4253 case CEPH_MDS_LEASE_REVOKE_ACK:
4254 case CEPH_MDS_LEASE_RELEASE:
4255 if (l->seq != m->get_seq()) {
4256 dout(7) << "handle_client_lease release - seq " << l->seq << " != provided " << m->get_seq() << dendl;
4257 } else {
4258 dout(7) << "handle_client_lease client." << client
4259 << " on " << *dn << dendl;
4260 dn->remove_client_lease(l, this);
4261 }
7c673cae
FG
4262 break;
4263
4264 case CEPH_MDS_LEASE_RENEW:
4265 {
4266 dout(7) << "handle_client_lease client." << client << " renew on " << *dn
4267 << (!dn->lock.can_lease(client)?", revoking lease":"") << dendl;
4268 if (dn->lock.can_lease(client)) {
9f95a23c 4269 auto reply = make_message<MClientLease>(*m);
7c673cae 4270 int pool = 1; // fixme.. do something smart!
11fdf7f2
TL
4271 reply->h.duration_ms = (int)(1000 * mdcache->client_lease_durations[pool]);
4272 reply->h.seq = ++l->seq;
4273 reply->clear_payload();
7c673cae
FG
4274
4275 utime_t now = ceph_clock_now();
4276 now += mdcache->client_lease_durations[pool];
4277 mdcache->touch_client_lease(l, pool, now);
4278
11fdf7f2 4279 mds->send_message_client_counted(reply, m->get_connection());
7c673cae
FG
4280 }
4281 }
4282 break;
4283
4284 default:
4285 ceph_abort(); // implement me
4286 break;
4287 }
4288}
4289
4290
2a845540
TL
4291void Locker::issue_client_lease(CDentry *dn, CInode *in, MDRequestRef &mdr, utime_t now,
4292 bufferlist &bl)
7c673cae 4293{
9f95a23c
TL
4294 client_t client = mdr->get_client();
4295 Session *session = mdr->session;
4296
7c673cae 4297 CInode *diri = dn->get_dir()->get_inode();
9f95a23c
TL
4298 if (mdr->snapid == CEPH_NOSNAP &&
4299 dn->lock.can_lease(client) &&
4300 !diri->is_stray() && // do not issue dn leases in stray dir!
4301 !diri->filelock.can_lease(client) &&
4302 !(diri->get_client_cap_pending(client) & (CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL))) {
2a845540
TL
4303 int mask = 0;
4304 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
4305 if (dnl->is_primary()) {
4306 ceph_assert(dnl->get_inode() == in);
4307 mask = CEPH_LEASE_PRIMARY_LINK;
4308 } else {
4309 if (dnl->is_remote())
4310 ceph_assert(dnl->get_remote_ino() == in->ino());
4311 else
4312 ceph_assert(!in);
4313 }
7c673cae
FG
4314 // issue a dentry lease
4315 ClientLease *l = dn->add_client_lease(client, session);
4316 session->touch_lease(l);
4317
9f95a23c 4318 int pool = 1; // fixme.. do something smart!
7c673cae
FG
4319 now += mdcache->client_lease_durations[pool];
4320 mdcache->touch_client_lease(l, pool, now);
4321
11fdf7f2 4322 LeaseStat lstat;
9f95a23c 4323 lstat.mask = CEPH_LEASE_VALID | mask;
11fdf7f2
TL
4324 lstat.duration_ms = (uint32_t)(1000 * mdcache->client_lease_durations[pool]);
4325 lstat.seq = ++l->seq;
f67539c2 4326 lstat.alternate_name = std::string(dn->alternate_name);
11fdf7f2
TL
4327 encode_lease(bl, session->info, lstat);
4328 dout(20) << "issue_client_lease seq " << lstat.seq << " dur " << lstat.duration_ms << "ms "
7c673cae
FG
4329 << " on " << *dn << dendl;
4330 } else {
4331 // null lease
11fdf7f2 4332 LeaseStat lstat;
2a845540 4333 lstat.mask = 0;
f67539c2 4334 lstat.alternate_name = std::string(dn->alternate_name);
11fdf7f2 4335 encode_lease(bl, session->info, lstat);
7c673cae
FG
4336 dout(20) << "issue_client_lease no/null lease on " << *dn << dendl;
4337 }
4338}
4339
4340
4341void Locker::revoke_client_leases(SimpleLock *lock)
4342{
4343 int n = 0;
4344 CDentry *dn = static_cast<CDentry*>(lock->get_parent());
4345 for (map<client_t, ClientLease*>::iterator p = dn->client_lease_map.begin();
4346 p != dn->client_lease_map.end();
4347 ++p) {
4348 ClientLease *l = p->second;
4349
4350 n++;
11fdf7f2 4351 ceph_assert(lock->get_type() == CEPH_LOCK_DN);
7c673cae
FG
4352
4353 CDentry *dn = static_cast<CDentry*>(lock->get_parent());
4354 int mask = 1 | CEPH_LOCK_DN; // old and new bits
4355
4356 // i should also revoke the dir ICONTENT lease, if they have it!
4357 CInode *diri = dn->get_dir()->get_inode();
9f95a23c 4358 auto lease = make_message<MClientLease>(CEPH_MDS_LEASE_REVOKE, l->seq, mask, diri->ino(), diri->first, CEPH_NOSNAP, dn->get_name());
11fdf7f2 4359 mds->send_message_client_counted(lease, l->client);
7c673cae 4360 }
7c673cae
FG
4361}
4362
11fdf7f2
TL
4363void Locker::encode_lease(bufferlist& bl, const session_info_t& info,
4364 const LeaseStat& ls)
4365{
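  // sessions advertising CEPHFS_FEATURE_REPLY_ENCODING get the versioned encoding (v2 adds alternate_name); older clients get the bare mask/duration/seq fields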
4366 if (info.has_feature(CEPHFS_FEATURE_REPLY_ENCODING)) {
f67539c2 4367 ENCODE_START(2, 1, bl);
11fdf7f2
TL
4368 encode(ls.mask, bl);
4369 encode(ls.duration_ms, bl);
4370 encode(ls.seq, bl);
f67539c2 4371 encode(ls.alternate_name, bl);
11fdf7f2
TL
4372 ENCODE_FINISH(bl);
4373 }
4374 else {
4375 encode(ls.mask, bl);
4376 encode(ls.duration_ms, bl);
4377 encode(ls.seq, bl);
4378 }
4379}
7c673cae
FG
4380
4381// locks ----------------------------------------------------------------
4382
11fdf7f2 4383SimpleLock *Locker::get_lock(int lock_type, const MDSCacheObjectInfo &info)
7c673cae
FG
4384{
4385 switch (lock_type) {
4386 case CEPH_LOCK_DN:
4387 {
4388 // be careful; info.dirfrag may have incorrect frag; recalculate based on dname.
4389 CInode *diri = mdcache->get_inode(info.dirfrag.ino);
4390 frag_t fg;
4391 CDir *dir = 0;
4392 CDentry *dn = 0;
4393 if (diri) {
4394 fg = diri->pick_dirfrag(info.dname);
4395 dir = diri->get_dirfrag(fg);
4396 if (dir)
4397 dn = dir->lookup(info.dname, info.snapid);
4398 }
4399 if (!dn) {
4400 dout(7) << "get_lock don't have dn " << info.dirfrag.ino << " " << info.dname << dendl;
4401 return 0;
4402 }
4403 return &dn->lock;
4404 }
4405
4406 case CEPH_LOCK_IAUTH:
4407 case CEPH_LOCK_ILINK:
4408 case CEPH_LOCK_IDFT:
4409 case CEPH_LOCK_IFILE:
4410 case CEPH_LOCK_INEST:
4411 case CEPH_LOCK_IXATTR:
4412 case CEPH_LOCK_ISNAP:
4413 case CEPH_LOCK_IFLOCK:
4414 case CEPH_LOCK_IPOLICY:
4415 {
4416 CInode *in = mdcache->get_inode(info.ino, info.snapid);
4417 if (!in) {
4418 dout(7) << "get_lock don't have ino " << info.ino << dendl;
4419 return 0;
4420 }
4421 switch (lock_type) {
4422 case CEPH_LOCK_IAUTH: return &in->authlock;
4423 case CEPH_LOCK_ILINK: return &in->linklock;
4424 case CEPH_LOCK_IDFT: return &in->dirfragtreelock;
4425 case CEPH_LOCK_IFILE: return &in->filelock;
4426 case CEPH_LOCK_INEST: return &in->nestlock;
4427 case CEPH_LOCK_IXATTR: return &in->xattrlock;
4428 case CEPH_LOCK_ISNAP: return &in->snaplock;
4429 case CEPH_LOCK_IFLOCK: return &in->flocklock;
4430 case CEPH_LOCK_IPOLICY: return &in->policylock;
4431 }
4432 }
4433
4434 default:
4435 dout(7) << "get_lock don't know lock_type " << lock_type << dendl;
4436 ceph_abort();
4437 break;
4438 }
4439
4440 return 0;
4441}
4442
9f95a23c 4443void Locker::handle_lock(const cref_t<MLock> &m)
7c673cae
FG
4444{
4445 // nobody should be talking to us during recovery.
11fdf7f2 4446 ceph_assert(mds->is_rejoin() || mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
7c673cae
FG
4447
4448 SimpleLock *lock = get_lock(m->get_lock_type(), m->get_object_info());
4449 if (!lock) {
4450 dout(10) << "don't have object " << m->get_object_info() << ", must have trimmed, dropping" << dendl;
7c673cae
FG
4451 return;
4452 }
4453
4454 switch (lock->get_type()) {
4455 case CEPH_LOCK_DN:
4456 case CEPH_LOCK_IAUTH:
4457 case CEPH_LOCK_ILINK:
4458 case CEPH_LOCK_ISNAP:
4459 case CEPH_LOCK_IXATTR:
4460 case CEPH_LOCK_IFLOCK:
4461 case CEPH_LOCK_IPOLICY:
4462 handle_simple_lock(lock, m);
4463 break;
4464
4465 case CEPH_LOCK_IDFT:
4466 case CEPH_LOCK_INEST:
4467 //handle_scatter_lock((ScatterLock*)lock, m);
4468 //break;
4469
4470 case CEPH_LOCK_IFILE:
4471 handle_file_lock(static_cast<ScatterLock*>(lock), m);
4472 break;
4473
4474 default:
4475 dout(7) << "handle_lock got otype " << m->get_lock_type() << dendl;
4476 ceph_abort();
4477 break;
4478 }
4479}
4480
4481
4482
4483
4484
4485// ==========================================================================
4486// simple lock
4487
4488/** This function may take a reference to m if it needs one, but does
4489 * not put references. */
9f95a23c 4490void Locker::handle_reqrdlock(SimpleLock *lock, const cref_t<MLock> &m)
7c673cae
FG
4491{
4492 MDSCacheObject *parent = lock->get_parent();
4493 if (parent->is_auth() &&
4494 lock->get_state() != LOCK_SYNC &&
4495 !parent->is_frozen()) {
4496 dout(7) << "handle_reqrdlock got rdlock request on " << *lock
4497 << " on " << *parent << dendl;
11fdf7f2 4498 ceph_assert(parent->is_auth()); // replica auth pinned if they're doing this!
7c673cae
FG
4499 if (lock->is_stable()) {
4500 simple_sync(lock);
4501 } else {
4502 dout(7) << "handle_reqrdlock delaying request until lock is stable" << dendl;
4503 lock->add_waiter(SimpleLock::WAIT_STABLE | MDSCacheObject::WAIT_UNFREEZE,
11fdf7f2 4504 new C_MDS_RetryMessage(mds, m));
7c673cae
FG
4505 }
4506 } else {
4507 dout(7) << "handle_reqrdlock dropping rdlock request on " << *lock
4508 << " on " << *parent << dendl;
4509 // replica should retry
4510 }
4511}
4512
9f95a23c 4513void Locker::handle_simple_lock(SimpleLock *lock, const cref_t<MLock> &m)
7c673cae
FG
4514{
4515 int from = m->get_asker();
4516
4517 dout(10) << "handle_simple_lock " << *m
4518 << " on " << *lock << " " << *lock->get_parent() << dendl;
4519
4520 if (mds->is_rejoin()) {
4521 if (lock->get_parent()->is_rejoining()) {
4522 dout(7) << "handle_simple_lock still rejoining " << *lock->get_parent()
4523 << ", dropping " << *m << dendl;
7c673cae
FG
4524 return;
4525 }
4526 }
4527
4528 switch (m->get_action()) {
4529 // -- replica --
4530 case LOCK_AC_SYNC:
11fdf7f2 4531 ceph_assert(lock->get_state() == LOCK_LOCK);
7c673cae
FG
4532 lock->decode_locked_state(m->get_data());
4533 lock->set_state(LOCK_SYNC);
4534 lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE);
4535 break;
4536
4537 case LOCK_AC_LOCK:
11fdf7f2 4538 ceph_assert(lock->get_state() == LOCK_SYNC);
7c673cae
FG
4539 lock->set_state(LOCK_SYNC_LOCK);
4540 if (lock->is_leased())
4541 revoke_client_leases(lock);
4542 eval_gather(lock, true);
9f95a23c
TL
4543 if (lock->is_unstable_and_locked()) {
4544 if (lock->is_cached())
4545 invalidate_lock_caches(lock);
31f18b77 4546 mds->mdlog->flush();
9f95a23c 4547 }
7c673cae
FG
4548 break;
4549
4550
4551 // -- auth --
4552 case LOCK_AC_LOCKACK:
11fdf7f2 4553 ceph_assert(lock->get_state() == LOCK_SYNC_LOCK ||
7c673cae 4554 lock->get_state() == LOCK_SYNC_EXCL);
11fdf7f2 4555 ceph_assert(lock->is_gathering(from));
7c673cae
FG
4556 lock->remove_gather(from);
4557
4558 if (lock->is_gathering()) {
4559 dout(7) << "handle_simple_lock " << *lock << " on " << *lock->get_parent() << " from " << from
4560 << ", still gathering " << lock->get_gather_set() << dendl;
4561 } else {
4562 dout(7) << "handle_simple_lock " << *lock << " on " << *lock->get_parent() << " from " << from
4563 << ", last one" << dendl;
4564 eval_gather(lock);
4565 }
4566 break;
4567
4568 case LOCK_AC_REQRDLOCK:
4569 handle_reqrdlock(lock, m);
4570 break;
4571
4572 }
7c673cae
FG
4573}
4574
4575/* unused, currently.
4576
4577class C_Locker_SimpleEval : public Context {
4578 Locker *locker;
4579 SimpleLock *lock;
4580public:
4581 C_Locker_SimpleEval(Locker *l, SimpleLock *lk) : locker(l), lock(lk) {}
4582 void finish(int r) {
4583 locker->try_simple_eval(lock);
4584 }
4585};
4586
4587void Locker::try_simple_eval(SimpleLock *lock)
4588{
4589 // unstable and ambiguous auth?
4590 if (!lock->is_stable() &&
4591 lock->get_parent()->is_ambiguous_auth()) {
4592 dout(7) << "simple_eval not stable and ambiguous auth, waiting on " << *lock->get_parent() << dendl;
4593 //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH))
4594 lock->get_parent()->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, new C_Locker_SimpleEval(this, lock));
4595 return;
4596 }
4597
4598 if (!lock->get_parent()->is_auth()) {
4599 dout(7) << "try_simple_eval not auth for " << *lock->get_parent() << dendl;
4600 return;
4601 }
4602
4603 if (!lock->get_parent()->can_auth_pin()) {
4604 dout(7) << "try_simple_eval can't auth_pin, waiting on " << *lock->get_parent() << dendl;
4605 //if (!lock->get_parent()->is_waiter(MDSCacheObject::WAIT_SINGLEAUTH))
4606 lock->get_parent()->add_waiter(MDSCacheObject::WAIT_UNFREEZE, new C_Locker_SimpleEval(this, lock));
4607 return;
4608 }
4609
4610 if (lock->is_stable())
4611 simple_eval(lock);
4612}
4613*/
4614
4615
4616void Locker::simple_eval(SimpleLock *lock, bool *need_issue)
4617{
4618 dout(10) << "simple_eval " << *lock << " on " << *lock->get_parent() << dendl;
4619
11fdf7f2
TL
4620 ceph_assert(lock->get_parent()->is_auth());
4621 ceph_assert(lock->is_stable());
7c673cae
FG
4622
4623 if (lock->get_parent()->is_freezing_or_frozen()) {
11fdf7f2
TL
4624 // dentry/snap lock in unreadable state can block path traverse
4625 if ((lock->get_type() != CEPH_LOCK_DN &&
9f95a23c
TL
4626 lock->get_type() != CEPH_LOCK_ISNAP &&
4627 lock->get_type() != CEPH_LOCK_IPOLICY) ||
7c673cae 4628 lock->get_state() == LOCK_SYNC ||
11fdf7f2 4629 lock->get_parent()->is_frozen())
7c673cae
FG
4630 return;
4631 }
4632
4633 if (mdcache->is_readonly()) {
4634 if (lock->get_state() != LOCK_SYNC) {
4635 dout(10) << "simple_eval read-only FS, syncing " << *lock << " on " << *lock->get_parent() << dendl;
4636 simple_sync(lock, need_issue);
4637 }
4638 return;
4639 }
4640
4641 CInode *in = 0;
4642 int wanted = 0;
11fdf7f2 4643 if (lock->get_cap_shift()) {
7c673cae
FG
4644 in = static_cast<CInode*>(lock->get_parent());
4645 in->get_caps_wanted(&wanted, NULL, lock->get_cap_shift());
4646 }
4647
4648 // -> excl?
4649 if (lock->get_state() != LOCK_EXCL &&
4650 in && in->get_target_loner() >= 0 &&
4651 (wanted & CEPH_CAP_GEXCL)) {
4652 dout(7) << "simple_eval stable, going to excl " << *lock
4653 << " on " << *lock->get_parent() << dendl;
4654 simple_excl(lock, need_issue);
4655 }
4656
4657 // stable -> sync?
4658 else if (lock->get_state() != LOCK_SYNC &&
4659 !lock->is_wrlocked() &&
4660 ((!(wanted & CEPH_CAP_GEXCL) && !lock->is_waiter_for(SimpleLock::WAIT_WR)) ||
4661 (lock->get_state() == LOCK_EXCL && in && in->get_target_loner() < 0))) {
4662 dout(7) << "simple_eval stable, syncing " << *lock
4663 << " on " << *lock->get_parent() << dendl;
4664 simple_sync(lock, need_issue);
4665 }
4666}
4667
4668
4669// mid
4670
4671bool Locker::simple_sync(SimpleLock *lock, bool *need_issue)
4672{
4673 dout(7) << "simple_sync on " << *lock << " on " << *lock->get_parent() << dendl;
11fdf7f2
TL
4674 ceph_assert(lock->get_parent()->is_auth());
4675 ceph_assert(lock->is_stable());
7c673cae
FG
4676
4677 CInode *in = 0;
4678 if (lock->get_cap_shift())
4679 in = static_cast<CInode *>(lock->get_parent());
4680
4681 int old_state = lock->get_state();
4682
4683 if (old_state != LOCK_TSYN) {
4684
4685 switch (lock->get_state()) {
4686 case LOCK_MIX: lock->set_state(LOCK_MIX_SYNC); break;
4687 case LOCK_LOCK: lock->set_state(LOCK_LOCK_SYNC); break;
4688 case LOCK_XSYN: lock->set_state(LOCK_XSYN_SYNC); break;
4689 case LOCK_EXCL: lock->set_state(LOCK_EXCL_SYNC); break;
4690 default: ceph_abort();
4691 }
4692
4693 int gather = 0;
9f95a23c 4694 if (lock->is_wrlocked()) {
7c673cae 4695 gather++;
9f95a23c
TL
4696 if (lock->is_cached())
4697 invalidate_lock_caches(lock);
b3b6e05e
TL
4698
4699 // After a client request is early replied the mdlog won't be flushed
4700 // immediately, but before safe replied the request will hold the write
4701 // locks. So if the client sends another request to a different MDS
4702 // daemon, which then needs to request read lock from current MDS daemon,
4703 // then that daemon maybe stuck at most for 5 seconds. Which will lead
4704 // the client stuck at most 5 seconds.
4705 //
4706 // Let's try to flush the mdlog when the write lock is held, which will
4707 // release the write locks after mdlog is successfully flushed.
4708 mds->mdlog->flush();
9f95a23c 4709 }
7c673cae
FG
4710
4711 if (lock->get_parent()->is_replicated() && old_state == LOCK_MIX) {
4712 send_lock_message(lock, LOCK_AC_SYNC);
4713 lock->init_gather();
4714 gather++;
4715 }
4716
4717 if (in && in->is_head()) {
4718 if (in->issued_caps_need_gather(lock)) {
4719 if (need_issue)
4720 *need_issue = true;
4721 else
4722 issue_caps(in);
4723 gather++;
4724 }
4725 }
4726
4727 bool need_recover = false;
4728 if (lock->get_type() == CEPH_LOCK_IFILE) {
11fdf7f2 4729 ceph_assert(in);
7c673cae
FG
4730 if (in->state_test(CInode::STATE_NEEDSRECOVER)) {
4731 mds->mdcache->queue_file_recover(in);
4732 need_recover = true;
4733 gather++;
4734 }
4735 }
4736
4737 if (!gather && lock->is_dirty()) {
4738 lock->get_parent()->auth_pin(lock);
4739 scatter_writebehind(static_cast<ScatterLock*>(lock));
7c673cae
FG
4740 return false;
4741 }
4742
4743 if (gather) {
4744 lock->get_parent()->auth_pin(lock);
4745 if (need_recover)
4746 mds->mdcache->do_file_recover();
4747 return false;
4748 }
4749 }
4750
4751 if (lock->get_parent()->is_replicated()) { // FIXME
4752 bufferlist data;
4753 lock->encode_locked_state(data);
4754 send_lock_message(lock, LOCK_AC_SYNC, data);
4755 }
4756 lock->set_state(LOCK_SYNC);
4757 lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE);
4758 if (in && in->is_head()) {
4759 if (need_issue)
4760 *need_issue = true;
4761 else
4762 issue_caps(in);
4763 }
4764 return true;
4765}
4766
4767void Locker::simple_excl(SimpleLock *lock, bool *need_issue)
4768{
4769 dout(7) << "simple_excl on " << *lock << " on " << *lock->get_parent() << dendl;
11fdf7f2
TL
4770 ceph_assert(lock->get_parent()->is_auth());
4771 ceph_assert(lock->is_stable());
7c673cae
FG
4772
4773 CInode *in = 0;
4774 if (lock->get_cap_shift())
4775 in = static_cast<CInode *>(lock->get_parent());
4776
4777 switch (lock->get_state()) {
4778 case LOCK_LOCK: lock->set_state(LOCK_LOCK_EXCL); break;
4779 case LOCK_SYNC: lock->set_state(LOCK_SYNC_EXCL); break;
4780 case LOCK_XSYN: lock->set_state(LOCK_XSYN_EXCL); break;
4781 default: ceph_abort();
4782 }
4783
4784 int gather = 0;
4785 if (lock->is_rdlocked())
4786 gather++;
4787 if (lock->is_wrlocked())
4788 gather++;
9f95a23c
TL
4789 if (gather && lock->is_cached())
4790 invalidate_lock_caches(lock);
7c673cae
FG
4791
4792 if (lock->get_parent()->is_replicated() &&
4793 lock->get_state() != LOCK_LOCK_EXCL &&
4794 lock->get_state() != LOCK_XSYN_EXCL) {
4795 send_lock_message(lock, LOCK_AC_LOCK);
4796 lock->init_gather();
4797 gather++;
4798 }
4799
4800 if (in && in->is_head()) {
4801 if (in->issued_caps_need_gather(lock)) {
4802 if (need_issue)
4803 *need_issue = true;
4804 else
4805 issue_caps(in);
4806 gather++;
4807 }
4808 }
4809
4810 if (gather) {
4811 lock->get_parent()->auth_pin(lock);
4812 } else {
4813 lock->set_state(LOCK_EXCL);
4814 lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE);
4815 if (in) {
4816 if (need_issue)
4817 *need_issue = true;
4818 else
4819 issue_caps(in);
4820 }
4821 }
4822}
4823
4824void Locker::simple_lock(SimpleLock *lock, bool *need_issue)
4825{
4826 dout(7) << "simple_lock on " << *lock << " on " << *lock->get_parent() << dendl;
11fdf7f2
TL
4827 ceph_assert(lock->get_parent()->is_auth());
4828 ceph_assert(lock->is_stable());
4829 ceph_assert(lock->get_state() != LOCK_LOCK);
7c673cae
FG
4830
4831 CInode *in = 0;
4832 if (lock->get_cap_shift())
4833 in = static_cast<CInode *>(lock->get_parent());
4834
4835 int old_state = lock->get_state();
4836
4837 switch (lock->get_state()) {
4838 case LOCK_SYNC: lock->set_state(LOCK_SYNC_LOCK); break;
b32b8144 4839 case LOCK_XSYN: lock->set_state(LOCK_XSYN_LOCK); break;
7c673cae
FG
4840 case LOCK_EXCL: lock->set_state(LOCK_EXCL_LOCK); break;
4841 case LOCK_MIX: lock->set_state(LOCK_MIX_LOCK);
4842 (static_cast<ScatterLock *>(lock))->clear_unscatter_wanted();
4843 break;
4844 case LOCK_TSYN: lock->set_state(LOCK_TSYN_LOCK); break;
4845 default: ceph_abort();
4846 }
4847
4848 int gather = 0;
4849 if (lock->is_leased()) {
4850 gather++;
4851 revoke_client_leases(lock);
4852 }
9f95a23c
TL
4853 if (lock->is_rdlocked()) {
4854 if (lock->is_cached())
4855 invalidate_lock_caches(lock);
7c673cae 4856 gather++;
9f95a23c 4857 }
7c673cae
FG
4858 if (in && in->is_head()) {
4859 if (in->issued_caps_need_gather(lock)) {
4860 if (need_issue)
4861 *need_issue = true;
4862 else
4863 issue_caps(in);
4864 gather++;
4865 }
4866 }
4867
4868 bool need_recover = false;
4869 if (lock->get_type() == CEPH_LOCK_IFILE) {
11fdf7f2 4870 ceph_assert(in);
7c673cae
FG
4871 if(in->state_test(CInode::STATE_NEEDSRECOVER)) {
4872 mds->mdcache->queue_file_recover(in);
4873 need_recover = true;
4874 gather++;
4875 }
4876 }
4877
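  // Two-stage gather for a replicated MIX->LOCK transition: if anything local
  // still has to drain (gather != 0 above) we stay in LOCK_MIX_LOCK for now
  // and defer contacting the replicas; otherwise we advance to LOCK_MIX_LOCK2
  // and send LOCK_AC_LOCK, and the replicas' LOCKACKs (see handle_file_lock())
  // return their scattered state to the auth.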
4878 if (lock->get_parent()->is_replicated() &&
4879 lock->get_state() == LOCK_MIX_LOCK &&
4880 gather) {
4881 dout(10) << " doing local stage of mix->lock gather before gathering from replicas" << dendl;
4882 } else {
4883 // move to second stage of gather now, so we don't send the lock action later.
4884 if (lock->get_state() == LOCK_MIX_LOCK)
4885 lock->set_state(LOCK_MIX_LOCK2);
4886
4887 if (lock->get_parent()->is_replicated() &&
4888 lock->get_sm()->states[old_state].replica_state != LOCK_LOCK) { // replica may already be LOCK
4889 gather++;
4890 send_lock_message(lock, LOCK_AC_LOCK);
4891 lock->init_gather();
4892 }
4893 }
4894
4895 if (!gather && lock->is_dirty()) {
4896 lock->get_parent()->auth_pin(lock);
4897 scatter_writebehind(static_cast<ScatterLock*>(lock));
7c673cae
FG
4898 return;
4899 }
4900
4901 if (gather) {
4902 lock->get_parent()->auth_pin(lock);
4903 if (need_recover)
4904 mds->mdcache->do_file_recover();
4905 } else {
4906 lock->set_state(LOCK_LOCK);
4907 lock->finish_waiters(ScatterLock::WAIT_XLOCK|ScatterLock::WAIT_WR|ScatterLock::WAIT_STABLE);
4908 }
4909}
4910
4911
4912void Locker::simple_xlock(SimpleLock *lock)
4913{
4914 dout(7) << "simple_xlock on " << *lock << " on " << *lock->get_parent() << dendl;
11fdf7f2 4915 ceph_assert(lock->get_parent()->is_auth());
7c673cae 4916 //assert(lock->is_stable());
11fdf7f2 4917 ceph_assert(lock->get_state() != LOCK_XLOCK);
7c673cae
FG
4918
4919 CInode *in = 0;
4920 if (lock->get_cap_shift())
4921 in = static_cast<CInode *>(lock->get_parent());
4922
4923 if (lock->is_stable())
4924 lock->get_parent()->auth_pin(lock);
4925
4926 switch (lock->get_state()) {
4927 case LOCK_LOCK:
4928 case LOCK_XLOCKDONE: lock->set_state(LOCK_LOCK_XLOCK); break;
4929 default: ceph_abort();
4930 }
4931
4932 int gather = 0;
4933 if (lock->is_rdlocked())
4934 gather++;
4935 if (lock->is_wrlocked())
4936 gather++;
9f95a23c
TL
4937 if (gather && lock->is_cached())
4938 invalidate_lock_caches(lock);
7c673cae
FG
4939
4940 if (in && in->is_head()) {
4941 if (in->issued_caps_need_gather(lock)) {
4942 issue_caps(in);
4943 gather++;
4944 }
4945 }
4946
4947 if (!gather) {
4948 lock->set_state(LOCK_PREXLOCK);
4949 //assert("shouldn't be called if we are already xlockable" == 0);
4950 }
4951}
4952
4953
4954
4955
4956
4957// ==========================================================================
4958// scatter lock
4959
4960/*
4961
4962Some notes on scatterlocks.
4963
4964 - The scatter/gather is driven by the inode lock. The scatter always
4965 brings in the latest metadata from the fragments.
4966
4967 - When in a scattered/MIX state, fragments are only allowed to
4968 update/be written to if the accounted stat matches the inode's
4969 current version.
4970
4971 - That means, on gather, we _only_ assimilate diffs for frag metadata
4972 that match the current version, because those are the only ones
4973 written during this scatter/gather cycle. (Others didn't permit
4974 it.) We increment the version and journal this to disk.
4975
4976 - When possible, we also simultaneously update our local frag
4977 accounted stats to match.
4978
4979 - On scatter, the new inode info is broadcast to frags, both local
4980 and remote. If possible (auth and !frozen), the dirfrag auth
4981 should update the accounted state (if it isn't already up to date).
4982 Note that this may occur on both the local inode auth node and
4983 inode replicas, so there are two potential paths. If it is NOT
4984 possible, they need to mark_stale to prevent any possible writes.
4985
4986 - A scatter can be to MIX (potentially writeable) or to SYNC (read
4987 only). Both are opportunities to update the frag accounted stats,
4988 even though only the MIX case is affected by a stale dirfrag.
4989
4990 - Because many scatter/gather cycles can potentially go by without a
4991 frag being able to update its accounted stats (due to being frozen
4992 by exports/refragments in progress), the frag may have (even very)
4993 old stat versions. That's fine. If/when we do want to update it,
4994 we can update accounted_* and the version first.
4995
4996*/
4997
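/*
  Illustrative walk-through of one gather for dirstat (this only restates the
  mechanics above and the code below; it is not a separate code path):

   1. the auth drives MIX -> LOCK on the inode's filelock (simple_lock());
      replicas get LOCK_AC_LOCK and ack with their scattered fragstat data.
   2. only dirfrags whose accounted stats match the inode's current version
      contribute a diff; stale frags are skipped.
   3. with the lock gathered and dirty, scatter_writebehind() merges the
      diffs into the projected inode, bumps the version, and journals an
      EUpdate ("scatter_writebehind").
   4. a later scatter (back to MIX, or to SYNC) broadcasts the new inode
      stats to the frags, which update accounted_* when they are able to.
*/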
4998class C_Locker_ScatterWB : public LockerLogContext {
4999 ScatterLock *lock;
5000 MutationRef mut;
5001public:
5002 C_Locker_ScatterWB(Locker *l, ScatterLock *sl, MutationRef& m) :
5003 LockerLogContext(l), lock(sl), mut(m) {}
5004 void finish(int r) override {
5005 locker->scatter_writebehind_finish(lock, mut);
5006 }
5007};
5008
5009void Locker::scatter_writebehind(ScatterLock *lock)
5010{
5011 CInode *in = static_cast<CInode*>(lock->get_parent());
f67539c2 5012 dout(10) << "scatter_writebehind " << in->get_inode()->mtime << " on " << *lock << " on " << *in << dendl;
7c673cae
FG
5013
5014 // journal
5015 MutationRef mut(new MutationImpl());
5016 mut->ls = mds->mdlog->get_current_segment();
5017
5018 // forcefully take a wrlock
5019 lock->get_wrlock(true);
9f95a23c 5020 mut->emplace_lock(lock, MutationImpl::LockOp::WRLOCK);
7c673cae
FG
5021
5022 in->pre_cow_old_inode(); // avoid cow mayhem
5023
f67539c2
TL
5024 auto pi = in->project_inode(mut);
5025 pi.inode->version = in->pre_dirty();
7c673cae 5026
f67539c2 5027 in->finish_scatter_gather_update(lock->get_type(), mut);
7c673cae
FG
5028 lock->start_flush();
5029
5030 EUpdate *le = new EUpdate(mds->mdlog, "scatter_writebehind");
5031 mds->mdlog->start_entry(le);
5032
5033 mdcache->predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);
5034 mdcache->journal_dirty_inode(mut.get(), &le->metablob, in);
5035
f67539c2 5036 in->finish_scatter_gather_update_accounted(lock->get_type(), &le->metablob);
7c673cae
FG
5037
5038 mds->mdlog->submit_entry(le, new C_Locker_ScatterWB(this, lock, mut));
2a845540 5039 mds->mdlog->flush();
7c673cae
FG
5040}
5041
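// Runs via C_Locker_ScatterWB once the "scatter_writebehind" EUpdate above is
// safely journalled: apply the projected inode, finish the flush, tell
// replicas that flushed in a MIX_LOCK/MIX_LOCK2/MIX_EXCL/MIX_TSYN state to
// finish_flush() too (LOCK_AC_LOCKFLUSHED), then drop the forced wrlock taken
// in scatter_writebehind().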
5042void Locker::scatter_writebehind_finish(ScatterLock *lock, MutationRef& mut)
5043{
5044 CInode *in = static_cast<CInode*>(lock->get_parent());
5045 dout(10) << "scatter_writebehind_finish on " << *lock << " on " << *in << dendl;
f67539c2
TL
5046
5047 mut->apply();
7c673cae
FG
5048
5049 lock->finish_flush();
5050
5051 // if replicas may have flushed in a mix->lock state, send another
5052 // message so they can finish_flush().
5053 if (in->is_replicated()) {
5054 switch (lock->get_state()) {
5055 case LOCK_MIX_LOCK:
5056 case LOCK_MIX_LOCK2:
5057 case LOCK_MIX_EXCL:
5058 case LOCK_MIX_TSYN:
5059 send_lock_message(lock, LOCK_AC_LOCKFLUSHED);
5060 }
5061 }
5062
7c673cae
FG
5063 drop_locks(mut.get());
5064 mut->cleanup();
5065
5066 if (lock->is_stable())
5067 lock->finish_waiters(ScatterLock::WAIT_STABLE);
5068
5069 //scatter_eval_gather(lock);
5070}
5071
5072void Locker::scatter_eval(ScatterLock *lock, bool *need_issue)
5073{
5074 dout(10) << "scatter_eval " << *lock << " on " << *lock->get_parent() << dendl;
5075
11fdf7f2
TL
5076 ceph_assert(lock->get_parent()->is_auth());
5077 ceph_assert(lock->is_stable());
7c673cae
FG
5078
5079 if (lock->get_parent()->is_freezing_or_frozen()) {
5080 dout(20) << " freezing|frozen" << dendl;
5081 return;
5082 }
5083
5084 if (mdcache->is_readonly()) {
5085 if (lock->get_state() != LOCK_SYNC) {
5086 dout(10) << "scatter_eval read-only FS, syncing " << *lock << " on " << *lock->get_parent() << dendl;
5087 simple_sync(lock, need_issue);
5088 }
5089 return;
5090 }
5091
5092 if (!lock->is_rdlocked() &&
5093 lock->get_state() != LOCK_MIX &&
5094 lock->get_scatter_wanted()) {
5095 dout(10) << "scatter_eval scatter_wanted, bump to mix " << *lock
5096 << " on " << *lock->get_parent() << dendl;
5097 scatter_mix(lock, need_issue);
5098 return;
5099 }
5100
5101 if (lock->get_type() == CEPH_LOCK_INEST) {
5102 // in general, we want to keep INEST writable at all times.
5103 if (!lock->is_rdlocked()) {
5104 if (lock->get_parent()->is_replicated()) {
5105 if (lock->get_state() != LOCK_MIX)
5106 scatter_mix(lock, need_issue);
5107 } else {
5108 if (lock->get_state() != LOCK_LOCK)
5109 simple_lock(lock, need_issue);
5110 }
5111 }
5112 return;
5113 }
5114
5115 CInode *in = static_cast<CInode*>(lock->get_parent());
5116 if (!in->has_subtree_or_exporting_dirfrag() || in->is_base()) {
5117 // i _should_ be sync.
5118 if (!lock->is_wrlocked() &&
5119 lock->get_state() != LOCK_SYNC) {
5120 dout(10) << "scatter_eval no wrlocks|xlocks, not subtree root inode, syncing" << dendl;
5121 simple_sync(lock, need_issue);
5122 }
5123 }
5124}
5125
5126
5127/*
5128 * mark a scatterlock to indicate that the dir fnode has some dirty data
5129 */
5130void Locker::mark_updated_scatterlock(ScatterLock *lock)
5131{
5132 lock->mark_dirty();
5133 if (lock->get_updated_item()->is_on_list()) {
5134 dout(10) << "mark_updated_scatterlock " << *lock
5135 << " - already on list since " << lock->get_update_stamp() << dendl;
5136 } else {
5137 updated_scatterlocks.push_back(lock->get_updated_item());
5138 utime_t now = ceph_clock_now();
5139 lock->set_update_stamp(now);
5140 dout(10) << "mark_updated_scatterlock " << *lock
5141 << " - added at " << now << dendl;
5142 }
5143}
5144
5145/*
5146 * this is called by scatter_tick and LogSegment::try_to_trim() when
5147 * trying to flush dirty scattered data (i.e. updated fnode) back to
5148 * the inode.
5149 *
5150 * we need to lock|scatter in order to push fnode changes into the
5151 * inode.dirstat.
5152 */
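/*
 * Callers that need to be re-driven pass an MDSContext; it is queued on
 * WAIT_UNFREEZE / WAIT_SINGLEAUTH / WAIT_STABLE as appropriate and fires once
 * the lock can make progress. With a null context (as scatter_tick() passes),
 * the frozen / ambiguous-auth / replica paths simply requeue the still-dirty
 * lock on updated_scatterlocks instead.
 */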
11fdf7f2 5153void Locker::scatter_nudge(ScatterLock *lock, MDSContext *c, bool forcelockchange)
7c673cae
FG
5154{
5155 CInode *p = static_cast<CInode *>(lock->get_parent());
5156
5157 if (p->is_frozen() || p->is_freezing()) {
5158 dout(10) << "scatter_nudge waiting for unfreeze on " << *p << dendl;
5159 if (c)
5160 p->add_waiter(MDSCacheObject::WAIT_UNFREEZE, c);
b32b8144 5161 else if (lock->is_dirty())
7c673cae
FG
5162 // just requeue. not ideal.. starvation prone..
5163 updated_scatterlocks.push_back(lock->get_updated_item());
5164 return;
5165 }
5166
5167 if (p->is_ambiguous_auth()) {
5168 dout(10) << "scatter_nudge waiting for single auth on " << *p << dendl;
5169 if (c)
5170 p->add_waiter(MDSCacheObject::WAIT_SINGLEAUTH, c);
b32b8144 5171 else if (lock->is_dirty())
7c673cae
FG
5172 // just requeue. not ideal.. starvation prone..
5173 updated_scatterlocks.push_back(lock->get_updated_item());
5174 return;
5175 }
5176
5177 if (p->is_auth()) {
5178 int count = 0;
5179 while (true) {
5180 if (lock->is_stable()) {
5181 // can we do it now?
5182 // (only if we're not replicated.. if we are, we really do need
5183 // to nudge the lock state!)
5184 /*
5185 actually, even if we're not replicated, we can't stay in MIX, because another mds
5186 could discover and replicate us at any time. if that happens while we're flushing,
5187 they end up in MIX but their inode has the old scatterstat version.
5188
5189 if (!forcelockchange && !lock->get_parent()->is_replicated() && lock->can_wrlock(-1)) {
5190 dout(10) << "scatter_nudge auth, propagating " << *lock << " on " << *p << dendl;
5191 scatter_writebehind(lock);
5192 if (c)
5193 lock->add_waiter(SimpleLock::WAIT_STABLE, c);
5194 return;
5195 }
5196 */
5197
5198 if (mdcache->is_readonly()) {
5199 if (lock->get_state() != LOCK_SYNC) {
5200 dout(10) << "scatter_nudge auth, read-only FS, syncing " << *lock << " on " << *p << dendl;
5201 simple_sync(static_cast<ScatterLock*>(lock));
5202 }
5203 break;
5204 }
5205
5206 // adjust lock state
5207 dout(10) << "scatter_nudge auth, scatter/unscattering " << *lock << " on " << *p << dendl;
5208 switch (lock->get_type()) {
5209 case CEPH_LOCK_IFILE:
5210 if (p->is_replicated() && lock->get_state() != LOCK_MIX)
5211 scatter_mix(static_cast<ScatterLock*>(lock));
5212 else if (lock->get_state() != LOCK_LOCK)
5213 simple_lock(static_cast<ScatterLock*>(lock));
5214 else
5215 simple_sync(static_cast<ScatterLock*>(lock));
5216 break;
5217
5218 case CEPH_LOCK_IDFT:
5219 case CEPH_LOCK_INEST:
5220 if (p->is_replicated() && lock->get_state() != LOCK_MIX)
5221 scatter_mix(lock);
5222 else if (lock->get_state() != LOCK_LOCK)
5223 simple_lock(lock);
5224 else
5225 simple_sync(lock);
5226 break;
5227 default:
5228 ceph_abort();
5229 }
5230 ++count;
5231 if (lock->is_stable() && count == 2) {
5232 dout(10) << "scatter_nudge oh, stable after two cycles." << dendl;
5233 // this should only really happen when called via
5234 // handle_file_lock due to AC_NUDGE, because the rest of the
5235 // time we are replicated or have dirty data and won't get
5236 // called. bailing here avoids an infinite loop.
11fdf7f2 5237 ceph_assert(!c);
7c673cae
FG
5238 break;
5239 }
5240 } else {
5241 dout(10) << "scatter_nudge auth, waiting for stable " << *lock << " on " << *p << dendl;
5242 if (c)
5243 lock->add_waiter(SimpleLock::WAIT_STABLE, c);
5244 return;
5245 }
5246 }
5247 } else {
5248 dout(10) << "scatter_nudge replica, requesting scatter/unscatter of "
5249 << *lock << " on " << *p << dendl;
5250 // request unscatter?
5251 mds_rank_t auth = lock->get_parent()->authority().first;
11fdf7f2 5252 if (!mds->is_cluster_degraded() || mds->mdsmap->is_clientreplay_or_active_or_stopping(auth)) {
9f95a23c 5253 mds->send_message_mds(make_message<MLock>(lock, LOCK_AC_NUDGE, mds->get_nodeid()), auth);
11fdf7f2 5254 }
7c673cae
FG
5255
5256 // wait...
5257 if (c)
5258 lock->add_waiter(SimpleLock::WAIT_STABLE, c);
5259
5260 // also, requeue, in case we had wrong auth or something
b32b8144
FG
5261 if (lock->is_dirty())
5262 updated_scatterlocks.push_back(lock->get_updated_item());
7c673cae
FG
5263 }
5264}
5265
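// scatter_tick() below drains updated_scatterlocks at most once per entry
// (bounded by the list size on entry, since scatter_nudge() may requeue), and
// stops at the first lock younger than mds_scatter_nudge_interval; entries are
// appended in update-stamp order, so everything behind that one is younger
// still.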
5266void Locker::scatter_tick()
5267{
5268 dout(10) << "scatter_tick" << dendl;
5269
5270 // updated
5271 utime_t now = ceph_clock_now();
5272 int n = updated_scatterlocks.size();
5273 while (!updated_scatterlocks.empty()) {
5274 ScatterLock *lock = updated_scatterlocks.front();
5275
5276 if (n-- == 0) break; // scatter_nudge() may requeue; avoid looping
5277
5278 if (!lock->is_dirty()) {
5279 updated_scatterlocks.pop_front();
5280 dout(10) << " removing from updated_scatterlocks "
5281 << *lock << " " << *lock->get_parent() << dendl;
5282 continue;
5283 }
11fdf7f2 5284 if (now - lock->get_update_stamp() < g_conf()->mds_scatter_nudge_interval)
7c673cae
FG
5285 break;
5286 updated_scatterlocks.pop_front();
5287 scatter_nudge(lock, 0);
5288 }
5289 mds->mdlog->flush();
5290}
5291
5292
5293void Locker::scatter_tempsync(ScatterLock *lock, bool *need_issue)
5294{
5295 dout(10) << "scatter_tempsync " << *lock
5296 << " on " << *lock->get_parent() << dendl;
11fdf7f2
TL
5297 ceph_assert(lock->get_parent()->is_auth());
5298 ceph_assert(lock->is_stable());
7c673cae 5299
11fdf7f2 5300 ceph_abort_msg("not fully implemented, at least not for filelock");
7c673cae
FG
5301
5302 CInode *in = static_cast<CInode *>(lock->get_parent());
5303
5304 switch (lock->get_state()) {
5305 case LOCK_SYNC: ceph_abort(); // this shouldn't happen
5306 case LOCK_LOCK: lock->set_state(LOCK_LOCK_TSYN); break;
5307 case LOCK_MIX: lock->set_state(LOCK_MIX_TSYN); break;
5308 default: ceph_abort();
5309 }
5310
5311 int gather = 0;
9f95a23c
TL
5312 if (lock->is_wrlocked()) {
5313 if (lock->is_cached())
5314 invalidate_lock_caches(lock);
7c673cae 5315 gather++;
9f95a23c 5316 }
7c673cae
FG
5317
5318 if (lock->get_cap_shift() &&
5319 in->is_head() &&
5320 in->issued_caps_need_gather(lock)) {
5321 if (need_issue)
5322 *need_issue = true;
5323 else
5324 issue_caps(in);
5325 gather++;
5326 }
5327
5328 if (lock->get_state() == LOCK_MIX_TSYN &&
5329 in->is_replicated()) {
5330 lock->init_gather();
5331 send_lock_message(lock, LOCK_AC_LOCK);
5332 gather++;
5333 }
5334
5335 if (gather) {
5336 in->auth_pin(lock);
5337 } else {
5338 // do tempsync
5339 lock->set_state(LOCK_TSYN);
5340 lock->finish_waiters(ScatterLock::WAIT_RD|ScatterLock::WAIT_STABLE);
5341 if (lock->get_cap_shift()) {
5342 if (need_issue)
5343 *need_issue = true;
5344 else
5345 issue_caps(in);
5346 }
5347 }
5348}
5349
5350
5351
5352// ==========================================================================
5353// local lock
5354
f67539c2 5355void Locker::local_wrlock_grab(LocalLockC *lock, MutationRef& mut)
7c673cae
FG
5356{
5357 dout(7) << "local_wrlock_grab on " << *lock
5358 << " on " << *lock->get_parent() << dendl;
5359
11fdf7f2
TL
5360 ceph_assert(lock->get_parent()->is_auth());
5361 ceph_assert(lock->can_wrlock());
7c673cae 5362 lock->get_wrlock(mut->get_client());
11fdf7f2 5363
9f95a23c
TL
5364 auto it = mut->emplace_lock(lock, MutationImpl::LockOp::WRLOCK);
5365 ceph_assert(it->is_wrlock());
7c673cae
FG
5366}
5367
f67539c2 5368bool Locker::local_wrlock_start(LocalLockC *lock, MDRequestRef& mut)
7c673cae
FG
5369{
5370 dout(7) << "local_wrlock_start on " << *lock
5371 << " on " << *lock->get_parent() << dendl;
5372
11fdf7f2 5373 ceph_assert(lock->get_parent()->is_auth());
7c673cae 5374 if (lock->can_wrlock()) {
7c673cae 5375 lock->get_wrlock(mut->get_client());
9f95a23c 5376 auto it = mut->emplace_lock(lock, MutationImpl::LockOp::WRLOCK);
11fdf7f2 5377 ceph_assert(it->is_wrlock());
7c673cae
FG
5378 return true;
5379 } else {
5380 lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mut));
5381 return false;
5382 }
5383}
5384
11fdf7f2 5385void Locker::local_wrlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut)
7c673cae 5386{
11fdf7f2 5387 ceph_assert(it->is_wrlock());
f67539c2 5388 LocalLockC *lock = static_cast<LocalLockC*>(it->lock);
7c673cae
FG
5389 dout(7) << "local_wrlock_finish on " << *lock
5390 << " on " << *lock->get_parent() << dendl;
5391 lock->put_wrlock();
11fdf7f2 5392 mut->locks.erase(it);
7c673cae
FG
5393 if (lock->get_num_wrlocks() == 0) {
5394 lock->finish_waiters(SimpleLock::WAIT_STABLE |
5395 SimpleLock::WAIT_WR |
5396 SimpleLock::WAIT_RD);
5397 }
5398}
5399
f67539c2 5400bool Locker::local_xlock_start(LocalLockC *lock, MDRequestRef& mut)
7c673cae
FG
5401{
5402 dout(7) << "local_xlock_start on " << *lock
5403 << " on " << *lock->get_parent() << dendl;
5404
11fdf7f2 5405 ceph_assert(lock->get_parent()->is_auth());
7c673cae
FG
5406 if (!lock->can_xlock_local()) {
5407 lock->add_waiter(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE, new C_MDS_RetryRequest(mdcache, mut));
5408 return false;
5409 }
5410
5411 lock->get_xlock(mut, mut->get_client());
9f95a23c 5412 mut->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
7c673cae
FG
5413 return true;
5414}
5415
11fdf7f2 5416void Locker::local_xlock_finish(const MutationImpl::lock_iterator& it, MutationImpl *mut)
7c673cae 5417{
11fdf7f2 5418 ceph_assert(it->is_xlock());
f67539c2 5419 LocalLockC *lock = static_cast<LocalLockC*>(it->lock);
7c673cae
FG
5420 dout(7) << "local_xlock_finish on " << *lock
5421 << " on " << *lock->get_parent() << dendl;
5422 lock->put_xlock();
11fdf7f2 5423 mut->locks.erase(it);
7c673cae
FG
5424
5425 lock->finish_waiters(SimpleLock::WAIT_STABLE |
5426 SimpleLock::WAIT_WR |
5427 SimpleLock::WAIT_RD);
5428}
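/*
 * Minimal usage sketch for the local (version) locks above -- a hypothetical
 * caller, not code from this file: a request that must bump an inode's
 * version under its versionlock would do roughly
 *
 *   if (!local_wrlock_start(&in->versionlock, mdr))
 *     return;                        // waiter queued; the request is retried
 *   ...project and journal the change...
 *   // the wrlock is later released via drop_locks()/local_wrlock_finish()
 *
 * local_wrlock_grab() is the non-blocking form for callers that already know
 * can_wrlock() holds.
 */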
5429
5430
5431
5432// ==========================================================================
5433// file lock
5434
5435
5436void Locker::file_eval(ScatterLock *lock, bool *need_issue)
5437{
5438 CInode *in = static_cast<CInode*>(lock->get_parent());
5439 int loner_wanted, other_wanted;
5440 int wanted = in->get_caps_wanted(&loner_wanted, &other_wanted, CEPH_CAP_SFILE);
5441 dout(7) << "file_eval wanted=" << gcap_string(wanted)
5442 << " loner_wanted=" << gcap_string(loner_wanted)
5443 << " other_wanted=" << gcap_string(other_wanted)
5444 << " filelock=" << *lock << " on " << *lock->get_parent()
5445 << dendl;
5446
11fdf7f2
TL
5447 ceph_assert(lock->get_parent()->is_auth());
5448 ceph_assert(lock->is_stable());
7c673cae
FG
5449
5450 if (lock->get_parent()->is_freezing_or_frozen())
5451 return;
5452
5453 if (mdcache->is_readonly()) {
5454 if (lock->get_state() != LOCK_SYNC) {
5455 dout(10) << "file_eval read-only FS, syncing " << *lock << " on " << *lock->get_parent() << dendl;
5456 simple_sync(lock, need_issue);
5457 }
5458 return;
5459 }
5460
5461 // excl -> *?
5462 if (lock->get_state() == LOCK_EXCL) {
5463 dout(20) << " is excl" << dendl;
5464 int loner_issued, other_issued, xlocker_issued;
5465 in->get_caps_issued(&loner_issued, &other_issued, &xlocker_issued, CEPH_CAP_SFILE);
5466 dout(7) << "file_eval loner_issued=" << gcap_string(loner_issued)
5467 << " other_issued=" << gcap_string(other_issued)
5468 << " xlocker_issued=" << gcap_string(xlocker_issued)
5469 << dendl;
5470 if (!((loner_wanted|loner_issued) & (CEPH_CAP_GEXCL|CEPH_CAP_GWR|CEPH_CAP_GBUFFER)) ||
9f95a23c
TL
5471 (other_wanted & (CEPH_CAP_GEXCL|CEPH_CAP_GWR|CEPH_CAP_GRD)) ||
5472 (in->is_dir() && in->multiple_nonstale_caps())) { // FIXME.. :/
7c673cae
FG
5473 dout(20) << " should lose it" << dendl;
5474 // we should lose it.
5475 // loner other want
5476 // R R SYNC
5477 // R R|W MIX
5478 // R W MIX
5479 // R|W R MIX
5480 // R|W R|W MIX
5481 // R|W W MIX
5482 // W R MIX
5483 // W R|W MIX
5484 // W W MIX
5485 // -> any writer means MIX; RD doesn't matter.
5486 if (((other_wanted|loner_wanted) & CEPH_CAP_GWR) ||
5487 lock->is_waiter_for(SimpleLock::WAIT_WR))
5488 scatter_mix(lock, need_issue);
5489 else if (!lock->is_wrlocked()) // let excl wrlocks drain first
5490 simple_sync(lock, need_issue);
5491 else
5492 dout(10) << " waiting for wrlock to drain" << dendl;
5493 }
5494 }
5495
5496 // * -> excl?
5497 else if (lock->get_state() != LOCK_EXCL &&
5498 !lock->is_rdlocked() &&
5499 //!lock->is_waiter_for(SimpleLock::WAIT_WR) &&
9f95a23c
TL
5500 in->get_target_loner() >= 0 &&
5501 (in->is_dir() ?
5502 !in->has_subtree_or_exporting_dirfrag() :
5503 (wanted & (CEPH_CAP_GEXCL|CEPH_CAP_GWR|CEPH_CAP_GBUFFER)))) {
7c673cae
FG
5504 dout(7) << "file_eval stable, bump to loner " << *lock
5505 << " on " << *lock->get_parent() << dendl;
5506 file_excl(lock, need_issue);
5507 }
5508
5509 // * -> mixed?
5510 else if (lock->get_state() != LOCK_MIX &&
5511 !lock->is_rdlocked() &&
5512 //!lock->is_waiter_for(SimpleLock::WAIT_WR) &&
5513 (lock->get_scatter_wanted() ||
b32b8144 5514 (in->get_target_loner() < 0 && (wanted & CEPH_CAP_GWR)))) {
7c673cae
FG
5515 dout(7) << "file_eval stable, bump to mixed " << *lock
5516 << " on " << *lock->get_parent() << dendl;
5517 scatter_mix(lock, need_issue);
5518 }
5519
5520 // * -> sync?
5521 else if (lock->get_state() != LOCK_SYNC &&
5522 !lock->is_wrlocked() && // drain wrlocks first!
5523 !lock->is_waiter_for(SimpleLock::WAIT_WR) &&
f64942e4 5524 !(wanted & CEPH_CAP_GWR) &&
7c673cae
FG
5525 !((lock->get_state() == LOCK_MIX) &&
5526 in->is_dir() && in->has_subtree_or_exporting_dirfrag()) // if we are a delegation point, stay where we are
5527 //((wanted & CEPH_CAP_RD) ||
5528 //in->is_replicated() ||
b32b8144 5529 //lock->is_leased() ||
7c673cae
FG
5530 //(!loner && lock->get_state() == LOCK_EXCL)) &&
5531 ) {
5532 dout(7) << "file_eval stable, bump to sync " << *lock
5533 << " on " << *lock->get_parent() << dendl;
5534 simple_sync(lock, need_issue);
5535 }
f91f0fd5
TL
5536 else if (in->state_test(CInode::STATE_NEEDSRECOVER)) {
5537 mds->mdcache->queue_file_recover(in);
b3b6e05e 5538 mds->mdcache->do_file_recover();
f91f0fd5 5539 }
7c673cae
FG
5540}
5541
5542
5543
5544void Locker::scatter_mix(ScatterLock *lock, bool *need_issue)
5545{
5546 dout(7) << "scatter_mix " << *lock << " on " << *lock->get_parent() << dendl;
5547
5548 CInode *in = static_cast<CInode*>(lock->get_parent());
11fdf7f2
TL
5549 ceph_assert(in->is_auth());
5550 ceph_assert(lock->is_stable());
7c673cae
FG
5551
5552 if (lock->get_state() == LOCK_LOCK) {
5553 in->start_scatter(lock);
5554 if (in->is_replicated()) {
5555 // data
5556 bufferlist softdata;
5557 lock->encode_locked_state(softdata);
5558
5559 // bcast to replicas
5560 send_lock_message(lock, LOCK_AC_MIX, softdata);
5561 }
5562
5563 // change lock
5564 lock->set_state(LOCK_MIX);
5565 lock->clear_scatter_wanted();
5566 if (lock->get_cap_shift()) {
5567 if (need_issue)
5568 *need_issue = true;
5569 else
5570 issue_caps(in);
5571 }
5572 } else {
5573 // gather?
5574 switch (lock->get_state()) {
5575 case LOCK_SYNC: lock->set_state(LOCK_SYNC_MIX); break;
7c673cae 5576 case LOCK_EXCL: lock->set_state(LOCK_EXCL_MIX); break;
b32b8144 5577 case LOCK_XSYN: lock->set_state(LOCK_XSYN_MIX); break;
7c673cae
FG
5578 case LOCK_TSYN: lock->set_state(LOCK_TSYN_MIX); break;
5579 default: ceph_abort();
5580 }
5581
5582 int gather = 0;
9f95a23c
TL
5583 if (lock->is_rdlocked()) {
5584 if (lock->is_cached())
5585 invalidate_lock_caches(lock);
7c673cae 5586 gather++;
9f95a23c 5587 }
7c673cae 5588 if (in->is_replicated()) {
b32b8144 5589 if (lock->get_state() == LOCK_SYNC_MIX) { // for the rest states, replicas are already LOCK
7c673cae
FG
5590 send_lock_message(lock, LOCK_AC_MIX);
5591 lock->init_gather();
5592 gather++;
5593 }
5594 }
5595 if (lock->is_leased()) {
5596 revoke_client_leases(lock);
5597 gather++;
5598 }
5599 if (lock->get_cap_shift() &&
5600 in->is_head() &&
5601 in->issued_caps_need_gather(lock)) {
5602 if (need_issue)
5603 *need_issue = true;
5604 else
5605 issue_caps(in);
5606 gather++;
5607 }
5608 bool need_recover = false;
5609 if (in->state_test(CInode::STATE_NEEDSRECOVER)) {
5610 mds->mdcache->queue_file_recover(in);
5611 need_recover = true;
5612 gather++;
5613 }
5614
5615 if (gather) {
5616 lock->get_parent()->auth_pin(lock);
5617 if (need_recover)
5618 mds->mdcache->do_file_recover();
5619 } else {
5620 in->start_scatter(lock);
5621 lock->set_state(LOCK_MIX);
5622 lock->clear_scatter_wanted();
5623 if (in->is_replicated()) {
5624 bufferlist softdata;
5625 lock->encode_locked_state(softdata);
5626 send_lock_message(lock, LOCK_AC_MIX, softdata);
5627 }
5628 if (lock->get_cap_shift()) {
5629 if (need_issue)
5630 *need_issue = true;
5631 else
5632 issue_caps(in);
5633 }
5634 }
5635 }
5636}
5637
5638
5639void Locker::file_excl(ScatterLock *lock, bool *need_issue)
5640{
5641 CInode *in = static_cast<CInode*>(lock->get_parent());
5642 dout(7) << "file_excl " << *lock << " on " << *lock->get_parent() << dendl;
5643
11fdf7f2
TL
5644 ceph_assert(in->is_auth());
5645 ceph_assert(lock->is_stable());
7c673cae 5646
11fdf7f2 5647 ceph_assert((in->get_loner() >= 0 && in->get_mds_caps_wanted().empty()) ||
7c673cae
FG
5648 (lock->get_state() == LOCK_XSYN)); // must do xsyn -> excl -> <anything else>
5649
5650 switch (lock->get_state()) {
5651 case LOCK_SYNC: lock->set_state(LOCK_SYNC_EXCL); break;
5652 case LOCK_MIX: lock->set_state(LOCK_MIX_EXCL); break;
5653 case LOCK_LOCK: lock->set_state(LOCK_LOCK_EXCL); break;
5654 case LOCK_XSYN: lock->set_state(LOCK_XSYN_EXCL); break;
5655 default: ceph_abort();
5656 }
5657 int gather = 0;
5658
5659 if (lock->is_rdlocked())
5660 gather++;
5661 if (lock->is_wrlocked())
5662 gather++;
9f95a23c
TL
5663 if (gather && lock->is_cached())
5664 invalidate_lock_caches(lock);
7c673cae
FG
5665
5666 if (in->is_replicated() &&
5667 lock->get_state() != LOCK_LOCK_EXCL &&
5668 lock->get_state() != LOCK_XSYN_EXCL) { // if we were lock, replicas are already lock.
5669 send_lock_message(lock, LOCK_AC_LOCK);
5670 lock->init_gather();
5671 gather++;
5672 }
5673 if (lock->is_leased()) {
5674 revoke_client_leases(lock);
5675 gather++;
5676 }
5677 if (in->is_head() &&
5678 in->issued_caps_need_gather(lock)) {
5679 if (need_issue)
5680 *need_issue = true;
5681 else
5682 issue_caps(in);
5683 gather++;
5684 }
5685 bool need_recover = false;
5686 if (in->state_test(CInode::STATE_NEEDSRECOVER)) {
5687 mds->mdcache->queue_file_recover(in);
5688 need_recover = true;
5689 gather++;
5690 }
5691
5692 if (gather) {
5693 lock->get_parent()->auth_pin(lock);
5694 if (need_recover)
5695 mds->mdcache->do_file_recover();
5696 } else {
5697 lock->set_state(LOCK_EXCL);
5698 if (need_issue)
5699 *need_issue = true;
5700 else
5701 issue_caps(in);
5702 }
5703}
5704
5705void Locker::file_xsyn(SimpleLock *lock, bool *need_issue)
5706{
5707 dout(7) << "file_xsyn on " << *lock << " on " << *lock->get_parent() << dendl;
5708 CInode *in = static_cast<CInode *>(lock->get_parent());
11fdf7f2
TL
5709 ceph_assert(in->is_auth());
5710 ceph_assert(in->get_loner() >= 0 && in->get_mds_caps_wanted().empty());
7c673cae
FG
5711
5712 switch (lock->get_state()) {
5713 case LOCK_EXCL: lock->set_state(LOCK_EXCL_XSYN); break;
5714 default: ceph_abort();
5715 }
5716
5717 int gather = 0;
9f95a23c
TL
5718 if (lock->is_wrlocked()) {
5719 if (lock->is_cached())
5720 invalidate_lock_caches(lock);
7c673cae 5721 gather++;
9f95a23c 5722 }
7c673cae
FG
5723
5724 if (in->is_head() &&
5725 in->issued_caps_need_gather(lock)) {
5726 if (need_issue)
5727 *need_issue = true;
5728 else
5729 issue_caps(in);
5730 gather++;
5731 }
5732
5733 if (gather) {
5734 lock->get_parent()->auth_pin(lock);
5735 } else {
5736 lock->set_state(LOCK_XSYN);
5737 lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE);
5738 if (need_issue)
5739 *need_issue = true;
5740 else
5741 issue_caps(in);
5742 }
5743}
5744
5745void Locker::file_recover(ScatterLock *lock)
5746{
5747 CInode *in = static_cast<CInode *>(lock->get_parent());
5748 dout(7) << "file_recover " << *lock << " on " << *in << dendl;
5749
11fdf7f2 5750 ceph_assert(in->is_auth());
7c673cae 5751 //assert(lock->is_stable());
11fdf7f2 5752 ceph_assert(lock->get_state() == LOCK_PRE_SCAN); // only called from MDCache::start_files_to_recover()
7c673cae
FG
5753
5754 int gather = 0;
5755
5756 /*
5757 if (in->is_replicated()
5758 lock->get_sm()->states[oldstate].replica_state != LOCK_LOCK) {
5759 send_lock_message(lock, LOCK_AC_LOCK);
5760 lock->init_gather();
5761 gather++;
5762 }
5763 */
5764 if (in->is_head() &&
5765 in->issued_caps_need_gather(lock)) {
5766 issue_caps(in);
5767 gather++;
5768 }
5769
5770 lock->set_state(LOCK_SCAN);
5771 if (gather)
5772 in->state_set(CInode::STATE_NEEDSRECOVER);
5773 else
5774 mds->mdcache->queue_file_recover(in);
5775}
5776
5777
5778// messenger
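// handle_file_lock() below is the message half of the scatterlock protocol.
// Roughly: the auth broadcasts LOCK_AC_SYNC / LOCK_AC_LOCK / LOCK_AC_MIX and
// collects the LOCK_AC_*ACK replies into the gather set, while replicas may
// ask the auth for a state change with LOCK_AC_REQSCATTER, LOCK_AC_REQUNSCATTER
// or LOCK_AC_REQRDLOCK, or poke it with LOCK_AC_NUDGE; LOCK_AC_LOCKFLUSHED
// lets replicas finish an in-flight scatter flush.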
9f95a23c 5779void Locker::handle_file_lock(ScatterLock *lock, const cref_t<MLock> &m)
7c673cae
FG
5780{
5781 CInode *in = static_cast<CInode*>(lock->get_parent());
5782 int from = m->get_asker();
5783
5784 if (mds->is_rejoin()) {
5785 if (in->is_rejoining()) {
5786 dout(7) << "handle_file_lock still rejoining " << *in
5787 << ", dropping " << *m << dendl;
7c673cae
FG
5788 return;
5789 }
5790 }
5791
11fdf7f2 5792 dout(7) << "handle_file_lock a=" << lock->get_lock_action_name(m->get_action())
7c673cae
FG
5793 << " on " << *lock
5794 << " from mds." << from << " "
5795 << *in << dendl;
5796
5797 bool caps = lock->get_cap_shift();
5798
5799 switch (m->get_action()) {
5800 // -- replica --
5801 case LOCK_AC_SYNC:
11fdf7f2 5802 ceph_assert(lock->get_state() == LOCK_LOCK ||
7c673cae 5803 lock->get_state() == LOCK_MIX ||
f38dd50b 5804 lock->get_state() == LOCK_MIX_SYNC ||
7c673cae
FG
5805 lock->get_state() == LOCK_MIX_SYNC2);
5806
5807 if (lock->get_state() == LOCK_MIX) {
5808 lock->set_state(LOCK_MIX_SYNC);
5809 eval_gather(lock, true);
9f95a23c
TL
5810 if (lock->is_unstable_and_locked()) {
5811 if (lock->is_cached())
5812 invalidate_lock_caches(lock);
31f18b77 5813 mds->mdlog->flush();
9f95a23c 5814 }
7c673cae
FG
5815 break;
5816 }
5817
5818 (static_cast<ScatterLock *>(lock))->finish_flush();
5819 (static_cast<ScatterLock *>(lock))->clear_flushed();
5820
5821 // ok
5822 lock->decode_locked_state(m->get_data());
5823 lock->set_state(LOCK_SYNC);
5824
5825 lock->get_rdlock();
5826 if (caps)
5827 issue_caps(in);
5828 lock->finish_waiters(SimpleLock::WAIT_RD|SimpleLock::WAIT_STABLE);
5829 lock->put_rdlock();
5830 break;
5831
5832 case LOCK_AC_LOCK:
5833 switch (lock->get_state()) {
5834 case LOCK_SYNC: lock->set_state(LOCK_SYNC_LOCK); break;
5835 case LOCK_MIX: lock->set_state(LOCK_MIX_LOCK); break;
5836 default: ceph_abort();
5837 }
5838
5839 eval_gather(lock, true);
9f95a23c
TL
5840 if (lock->is_unstable_and_locked()) {
5841 if (lock->is_cached())
5842 invalidate_lock_caches(lock);
31f18b77 5843 mds->mdlog->flush();
9f95a23c 5844 }
31f18b77 5845
7c673cae
FG
5846 break;
5847
5848 case LOCK_AC_LOCKFLUSHED:
5849 (static_cast<ScatterLock *>(lock))->finish_flush();
5850 (static_cast<ScatterLock *>(lock))->clear_flushed();
5851 // wake up scatter_nudge waiters
5852 if (lock->is_stable())
5853 lock->finish_waiters(SimpleLock::WAIT_STABLE);
5854 break;
5855
5856 case LOCK_AC_MIX:
11fdf7f2 5857 ceph_assert(lock->get_state() == LOCK_SYNC ||
7c673cae
FG
5858 lock->get_state() == LOCK_LOCK ||
5859 lock->get_state() == LOCK_SYNC_MIX2);
5860
5861 if (lock->get_state() == LOCK_SYNC) {
5862 // MIXED
5863 lock->set_state(LOCK_SYNC_MIX);
5864 eval_gather(lock, true);
9f95a23c
TL
5865 if (lock->is_unstable_and_locked()) {
5866 if (lock->is_cached())
5867 invalidate_lock_caches(lock);
31f18b77 5868 mds->mdlog->flush();
9f95a23c 5869 }
7c673cae
FG
5870 break;
5871 }
c07f9fc5 5872
7c673cae 5873 // ok
7c673cae 5874 lock->set_state(LOCK_MIX);
c07f9fc5 5875 lock->decode_locked_state(m->get_data());
7c673cae
FG
5876
5877 if (caps)
5878 issue_caps(in);
5879
5880 lock->finish_waiters(SimpleLock::WAIT_WR|SimpleLock::WAIT_STABLE);
5881 break;
5882
5883
5884 // -- auth --
5885 case LOCK_AC_LOCKACK:
11fdf7f2 5886 ceph_assert(lock->get_state() == LOCK_SYNC_LOCK ||
7c673cae
FG
5887 lock->get_state() == LOCK_MIX_LOCK ||
5888 lock->get_state() == LOCK_MIX_LOCK2 ||
5889 lock->get_state() == LOCK_MIX_EXCL ||
5890 lock->get_state() == LOCK_SYNC_EXCL ||
5891 lock->get_state() == LOCK_SYNC_MIX ||
5892 lock->get_state() == LOCK_MIX_TSYN);
11fdf7f2 5893 ceph_assert(lock->is_gathering(from));
7c673cae
FG
5894 lock->remove_gather(from);
5895
5896 if (lock->get_state() == LOCK_MIX_LOCK ||
5897 lock->get_state() == LOCK_MIX_LOCK2 ||
5898 lock->get_state() == LOCK_MIX_EXCL ||
5899 lock->get_state() == LOCK_MIX_TSYN) {
5900 lock->decode_locked_state(m->get_data());
5901 // replica is waiting for AC_LOCKFLUSHED, eval_gather() should not
5902 // delay calling scatter_writebehind().
5903 lock->clear_flushed();
5904 }
5905
5906 if (lock->is_gathering()) {
5907 dout(7) << "handle_file_lock " << *in << " from " << from
5908 << ", still gathering " << lock->get_gather_set() << dendl;
5909 } else {
5910 dout(7) << "handle_file_lock " << *in << " from " << from
5911 << ", last one" << dendl;
5912 eval_gather(lock);
5913 }
5914 break;
5915
5916 case LOCK_AC_SYNCACK:
11fdf7f2
TL
5917 ceph_assert(lock->get_state() == LOCK_MIX_SYNC);
5918 ceph_assert(lock->is_gathering(from));
7c673cae
FG
5919 lock->remove_gather(from);
5920
5921 lock->decode_locked_state(m->get_data());
5922
5923 if (lock->is_gathering()) {
5924 dout(7) << "handle_file_lock " << *in << " from " << from
5925 << ", still gathering " << lock->get_gather_set() << dendl;
5926 } else {
5927 dout(7) << "handle_file_lock " << *in << " from " << from
5928 << ", last one" << dendl;
5929 eval_gather(lock);
5930 }
5931 break;
5932
5933 case LOCK_AC_MIXACK:
11fdf7f2
TL
5934 ceph_assert(lock->get_state() == LOCK_SYNC_MIX);
5935 ceph_assert(lock->is_gathering(from));
7c673cae
FG
5936 lock->remove_gather(from);
5937
5938 if (lock->is_gathering()) {
5939 dout(7) << "handle_file_lock " << *in << " from " << from
5940 << ", still gathering " << lock->get_gather_set() << dendl;
5941 } else {
5942 dout(7) << "handle_file_lock " << *in << " from " << from
5943 << ", last one" << dendl;
5944 eval_gather(lock);
5945 }
5946 break;
5947
5948
5949 // requests....
5950 case LOCK_AC_REQSCATTER:
5951 if (lock->is_stable()) {
5952 /* NOTE: we can do this _even_ if !can_auth_pin (i.e. freezing)
5953 * because the replica should be holding an auth_pin if they're
5954 * doing this (and thus, we are freezing, not frozen, and indefinite
5955 * starvation isn't an issue).
5956 */
5957 dout(7) << "handle_file_lock got scatter request on " << *lock
5958 << " on " << *lock->get_parent() << dendl;
5959 if (lock->get_state() != LOCK_MIX) // i.e., the reqscatter didn't race with an actual mix/scatter
5960 scatter_mix(lock);
5961 } else {
5962 dout(7) << "handle_file_lock got scatter request, !stable, marking scatter_wanted on " << *lock
5963 << " on " << *lock->get_parent() << dendl;
5964 lock->set_scatter_wanted();
5965 }
5966 break;
5967
5968 case LOCK_AC_REQUNSCATTER:
5969 if (lock->is_stable()) {
5970 /* NOTE: we can do this _even_ if !can_auth_pin (i.e. freezing)
5971 * because the replica should be holding an auth_pin if they're
5972 * doing this (and thus, we are freezing, not frozen, and indefinite
5973 * starvation isn't an issue).
5974 */
5975 dout(7) << "handle_file_lock got unscatter request on " << *lock
5976 << " on " << *lock->get_parent() << dendl;
5977 if (lock->get_state() == LOCK_MIX) // i.e., the requnscatter didn't race with an actual unscatter
5978 simple_lock(lock); // FIXME tempsync?
5979 } else {
5980 dout(7) << "handle_file_lock ignoring unscatter request on " << *lock
5981 << " on " << *lock->get_parent() << dendl;
5982 lock->set_unscatter_wanted();
5983 }
5984 break;
5985
5986 case LOCK_AC_REQRDLOCK:
5987 handle_reqrdlock(lock, m);
5988 break;
5989
5990 case LOCK_AC_NUDGE:
5991 if (!lock->get_parent()->is_auth()) {
5992 dout(7) << "handle_file_lock IGNORING nudge on non-auth " << *lock
5993 << " on " << *lock->get_parent() << dendl;
5994 } else if (!lock->get_parent()->is_replicated()) {
5995 dout(7) << "handle_file_lock IGNORING nudge on non-replicated " << *lock
5996 << " on " << *lock->get_parent() << dendl;
5997 } else {
5998 dout(7) << "handle_file_lock trying nudge on " << *lock
5999 << " on " << *lock->get_parent() << dendl;
6000 scatter_nudge(lock, 0, true);
6001 mds->mdlog->flush();
6002 }
6003 break;
6004
6005 default:
6006 ceph_abort();
6007 }
7c673cae 6008}