]> git.proxmox.com Git - ceph.git/blame - ceph/src/client/Client.cc
import 14.2.4 nautilus point release
[ceph.git] / ceph / src / client / Client.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16// unix-ey fs stuff
17#include <unistd.h>
18#include <sys/types.h>
19#include <time.h>
20#include <utime.h>
11fdf7f2 21#include <string.h>
7c673cae
FG
22#include <sys/stat.h>
23#include <sys/param.h>
24#include <fcntl.h>
25#include <sys/file.h>
26#include <sys/utsname.h>
27#include <sys/uio.h>
28
29#include <boost/lexical_cast.hpp>
30#include <boost/fusion/include/std_pair.hpp>
31
32#if defined(__FreeBSD__)
33#define XATTR_CREATE 0x1
34#define XATTR_REPLACE 0x2
35#else
36#include <sys/xattr.h>
37#endif
38
39#if defined(__linux__)
40#include <linux/falloc.h>
41#endif
42
43#include <sys/statvfs.h>
44
45#include "common/config.h"
46#include "common/version.h"
47
11fdf7f2
TL
48#include "mon/MonClient.h"
49
50#include "messages/MClientCaps.h"
51#include "messages/MClientLease.h"
52#include "messages/MClientQuota.h"
53#include "messages/MClientReclaim.h"
54#include "messages/MClientReclaimReply.h"
7c673cae 55#include "messages/MClientReconnect.h"
11fdf7f2 56#include "messages/MClientReply.h"
7c673cae
FG
57#include "messages/MClientRequest.h"
58#include "messages/MClientRequestForward.h"
11fdf7f2 59#include "messages/MClientSession.h"
7c673cae
FG
60#include "messages/MClientSnap.h"
61#include "messages/MCommandReply.h"
7c673cae
FG
62#include "messages/MFSMap.h"
63#include "messages/MFSMapUser.h"
11fdf7f2
TL
64#include "messages/MMDSMap.h"
65#include "messages/MOSDMap.h"
7c673cae
FG
66
67#include "mds/flock.h"
11fdf7f2 68#include "mds/cephfs_features.h"
7c673cae
FG
69#include "osd/OSDMap.h"
70#include "osdc/Filer.h"
71
72#include "common/Cond.h"
73#include "common/Mutex.h"
74#include "common/perf_counters.h"
75#include "common/admin_socket.h"
76#include "common/errno.h"
77#include "include/str_list.h"
78
79#define dout_subsys ceph_subsys_client
80
81#include "include/lru.h"
82#include "include/compat.h"
83#include "include/stringify.h"
84
85#include "Client.h"
86#include "Inode.h"
87#include "Dentry.h"
b32b8144 88#include "Delegation.h"
7c673cae
FG
89#include "Dir.h"
90#include "ClientSnapRealm.h"
91#include "Fh.h"
92#include "MetaSession.h"
93#include "MetaRequest.h"
94#include "ObjecterWriteback.h"
95#include "posix_acl.h"
96
11fdf7f2 97#include "include/ceph_assert.h"
7c673cae
FG
98#include "include/stat.h"
99
100#include "include/cephfs/ceph_statx.h"
101
102#if HAVE_GETGROUPLIST
103#include <grp.h>
104#include <pwd.h>
105#include <unistd.h>
106#endif
107
// Debug-log prefix: tag every line with this client's global id.
#undef dout_prefix
#define dout_prefix *_dout << "client." << whoami << " "

// Trace hook: only emit to the trace stream when client_trace is configured.
#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout

// FreeBSD fails to define this
#ifndef O_DSYNC
#define O_DSYNC 0x0
#endif
// Darwin fails to define this
#ifndef O_RSYNC
#define O_RSYNC 0x0
#endif

// Platforms without O_DIRECT: make it a no-op flag.
#ifndef O_DIRECT
#define O_DIRECT 0x0
#endif

// Cap mask used by getattr debugging paths.
#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
127
128void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
129{
130 Client *client = static_cast<Client*>(p);
131 client->flush_set_callback(oset);
132}
133
134
135// -------------
136
137Client::CommandHook::CommandHook(Client *client) :
138 m_client(client)
139{
140}
141
11fdf7f2
TL
142bool Client::CommandHook::call(std::string_view command,
143 const cmdmap_t& cmdmap,
144 std::string_view format, bufferlist& out)
7c673cae 145{
11fdf7f2 146 std::unique_ptr<Formatter> f(Formatter::create(format));
7c673cae
FG
147 f->open_object_section("result");
148 m_client->client_lock.Lock();
149 if (command == "mds_requests")
11fdf7f2 150 m_client->dump_mds_requests(f.get());
7c673cae 151 else if (command == "mds_sessions")
11fdf7f2 152 m_client->dump_mds_sessions(f.get());
7c673cae 153 else if (command == "dump_cache")
11fdf7f2 154 m_client->dump_cache(f.get());
7c673cae
FG
155 else if (command == "kick_stale_sessions")
156 m_client->_kick_stale_sessions();
157 else if (command == "status")
11fdf7f2 158 m_client->dump_status(f.get());
7c673cae 159 else
11fdf7f2 160 ceph_abort_msg("bad command registered");
7c673cae
FG
161 m_client->client_lock.Unlock();
162 f->close_section();
163 f->flush(out);
7c673cae
FG
164 return true;
165}
166
167
168// -------------
169
// Cursor state for an open directory.  offset starts at 0; next_offset
// starts at 2 because the first two logical offsets are reserved for the
// "." and ".." entries.  perms are captured so later readdir continuations
// run with the opener's credentials.
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
  { }
175
176void Client::_reset_faked_inos()
177{
178 ino_t start = 1024;
179 free_faked_inos.clear();
180 free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
181 last_used_faked_ino = 0;
11fdf7f2 182 last_used_faked_root = 0;
7c673cae
FG
183 _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
184}
185
// Allocate the next free faked inode number for 'in' and record the
// faked->real mapping.  Allocation scans forward from the last number
// handed out and wraps around when the tail of the pool is exhausted.
void Client::_assign_faked_ino(Inode *in)
{
  // First allocation: skip past 1024..2047, which _assign_faked_root
  // reserves for mount-root inodes.
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // Wrapped: restart the scan just above the reserved root range.
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  // The pool must never be fully exhausted.
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // Gap before the next free extent: jump to its first element.
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // Cursor is inside the free extent: take the next number in it.
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
207
11fdf7f2
TL
208/*
209 * In the faked mode, if you export multiple subdirectories,
210 * you will see that the inode numbers of the exported subdirectories
211 * are the same. so we distinguish the mount point by reserving
212 * the "fake ids" between "1024~2048" and combining the last
213 * 10bits(0x3ff) of the "root inodes".
214*/
215void Client::_assign_faked_root(Inode *in)
216{
217 interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
218 if (it == free_faked_inos.end() && last_used_faked_root > 0) {
219 last_used_faked_root = 0;
220 it = free_faked_inos.lower_bound(last_used_faked_root + 1);
221 }
222 assert(it != free_faked_inos.end());
223 vinodeno_t inode_info = in->vino();
224 uint64_t inode_num = (uint64_t)inode_info.ino;
225 ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl;
226 last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
227 assert(it.get_start() + it.get_len() > last_used_faked_root);
228
229 in->faked_ino = last_used_faked_root;
230 free_faked_inos.erase(in->faked_ino);
231 faked_ino_map[in->faked_ino] = in->vino();
232}
233
7c673cae
FG
234void Client::_release_faked_ino(Inode *in)
235{
236 free_faked_inos.insert(in->faked_ino);
237 faked_ino_map.erase(in->faked_ino);
238}
239
240vinodeno_t Client::_map_faked_ino(ino_t ino)
241{
242 vinodeno_t vino;
243 if (ino == 1)
244 vino = root->vino();
245 else if (faked_ino_map.count(ino))
246 vino = faked_ino_map[ino];
247 else
248 vino = vinodeno_t(0, CEPH_NOSNAP);
11fdf7f2 249 ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
7c673cae
FG
250 return vino;
251}
252
// Public, locking wrapper around _map_faked_ino().
vinodeno_t Client::map_faked_ino(ino_t ino)
{
  std::lock_guard lock(client_lock);
  return _map_faked_ino(ino);
}
258
259// cons/des
260
// Construct a Client bound to an existing Messenger/MonClient/Objecter.
// Wires up the faked-ino allocator, perf knobs, the MDS map, and the osd
// write path (ObjecterWriteback -> ObjectCacher -> Filer).  Note the
// objecter_finisher is started here; its stop() is in shutdown().
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct),
    timer(m->cct, client_lock),
    client_lock("Client::client_lock"),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  // Identity used for permission checks when callers don't supply one.
  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles: fds 0-9 are never handed out (reserved for std streams).
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
					       &client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				     client_flush_set_callback, // all commit callback
				     (void*)this,
				     cct->_conf->client_oc_size,
				     cct->_conf->client_oc_max_objects,
				     cct->_conf->client_oc_max_dirty,
				     cct->_conf->client_oc_target_dirty,
				     cct->_conf->client_oc_max_dirty_age,
				     true));
  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));
  // Get notified when this client is blacklisted by the OSDs.
  objecter->enable_blacklist_events();
}
308
309
// Destructor: caller must have already unmounted/shut down; the lock must
// not be held by us when we get here.
Client::~Client()
{
  ceph_assert(!client_lock.is_locked());

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that it's lock (which is
  // client_lock) is held.
  client_lock.Lock();
  tear_down_cache();
  client_lock.Unlock();
}
321
// Drop every cached object: force-close open fds and dirs, trim the
// dentry LRU to empty, then release the root inode.  Caller holds
// client_lock (see ~Client).  After this, inode_map must be empty.
void Client::tear_down_cache()
{
  // fd's
  for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
       it != fd_map.end();
       ++it) {
    Fh *fh = it->second;
    ldout(cct, 1) << __func__ << " forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  // _closedir removes the dir from opened_dirs, so loop until empty
  // rather than iterating.
  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru
  trim_cache();
  ceph_assert(lru.lru_get_size() == 0);

  // close root ino
  // Only the root (plus any root_parents from multi-level mounts) may
  // still be in the inode map at this point.
  ceph_assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }

  ceph_assert(inode_map.empty());
}
361
362inodeno_t Client::get_root_ino()
363{
11fdf7f2 364 std::lock_guard l(client_lock);
7c673cae
FG
365 if (use_faked_inos())
366 return root->faked_ino;
367 else
368 return root->ino;
369}
370
// Return the root inode with an ll reference taken on the caller's
// behalf; the caller is responsible for putting it.
Inode *Client::get_root()
{
  std::lock_guard l(client_lock);
  root->ll_get();
  return root;
}
377
378
379// debug crapola
380
381void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
382{
383 filepath path;
384 in->make_long_path(path);
385 ldout(cct, 1) << "dump_inode: "
386 << (disconnected ? "DISCONNECTED ":"")
387 << "inode " << in->ino
388 << " " << path
389 << " ref " << in->get_num_ref()
390 << *in << dendl;
391
392 if (f) {
393 f->open_object_section("inode");
394 f->dump_stream("path") << path;
395 if (disconnected)
396 f->dump_int("disconnected", 1);
397 in->dump(f);
398 f->close_section();
399 }
400
401 did.insert(in);
402 if (in->dir) {
403 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
404 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
405 it != in->dir->dentries.end();
406 ++it) {
407 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
408 if (f) {
409 f->open_object_section("dentry");
410 it->second->dump(f);
411 f->close_section();
412 }
413 if (it->second->inode)
414 dump_inode(f, it->second->inode.get(), did, false);
415 }
416 }
417}
418
419void Client::dump_cache(Formatter *f)
420{
421 set<Inode*> did;
422
11fdf7f2 423 ldout(cct, 1) << __func__ << dendl;
7c673cae
FG
424
425 if (f)
426 f->open_array_section("cache");
427
428 if (root)
429 dump_inode(f, root, did, true);
430
431 // make a second pass to catch anything disconnected
432 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
433 it != inode_map.end();
434 ++it) {
435 if (did.count(it->second))
436 continue;
437 dump_inode(f, it->second, did, true);
438 }
439
440 if (f)
441 f->close_section();
442}
443
// Emit an overall status summary (identity, cache counters, map epochs,
// blacklist state) to the formatter.  Caller must hold client_lock.
void Client::dump_status(Formatter *f)
{
  ceph_assert(client_lock.is_locked_by_me());

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blacklisted", blacklisted);
  }
}
474
475int Client::init()
476{
477 timer.init();
478 objectcacher->start();
479
480 client_lock.Lock();
11fdf7f2 481 ceph_assert(!initialized);
7c673cae
FG
482
483 messenger->add_dispatcher_tail(this);
484 client_lock.Unlock();
485
486 _finish_init();
487 return 0;
488}
489
490void Client::_finish_init()
491{
492 client_lock.Lock();
493 // logger
494 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
495 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
496 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
497 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
11fdf7f2
TL
498 plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
499 plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
7c673cae
FG
500 logger.reset(plb.create_perf_counters());
501 cct->get_perfcounters_collection()->add(logger.get());
502
503 client_lock.Unlock();
504
11fdf7f2 505 cct->_conf.add_observer(this);
7c673cae
FG
506
507 AdminSocket* admin_socket = cct->get_admin_socket();
508 int ret = admin_socket->register_command("mds_requests",
509 "mds_requests",
510 &m_command_hook,
511 "show in-progress mds requests");
512 if (ret < 0) {
513 lderr(cct) << "error registering admin socket command: "
514 << cpp_strerror(-ret) << dendl;
515 }
516 ret = admin_socket->register_command("mds_sessions",
517 "mds_sessions",
518 &m_command_hook,
519 "show mds session state");
520 if (ret < 0) {
521 lderr(cct) << "error registering admin socket command: "
522 << cpp_strerror(-ret) << dendl;
523 }
524 ret = admin_socket->register_command("dump_cache",
525 "dump_cache",
526 &m_command_hook,
527 "show in-memory metadata cache contents");
528 if (ret < 0) {
529 lderr(cct) << "error registering admin socket command: "
530 << cpp_strerror(-ret) << dendl;
531 }
532 ret = admin_socket->register_command("kick_stale_sessions",
533 "kick_stale_sessions",
534 &m_command_hook,
535 "kick sessions that were remote reset");
536 if (ret < 0) {
537 lderr(cct) << "error registering admin socket command: "
538 << cpp_strerror(-ret) << dendl;
539 }
540 ret = admin_socket->register_command("status",
541 "status",
542 &m_command_hook,
543 "show overall client status");
544 if (ret < 0) {
545 lderr(cct) << "error registering admin socket command: "
546 << cpp_strerror(-ret) << dendl;
547 }
548
549 client_lock.Lock();
550 initialized = true;
551 client_lock.Unlock();
552}
553
// Tear the client down in dependency order: close MDS sessions, drop the
// config observer and admin-socket commands, stop the callback finishers
// (each drained before stop), stop the object cacher (joins its thread,
// so done outside client_lock), shut down the timer, and finally remove
// the perf counters.
void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  client_lock.Lock();
  _close_sessions();
  client_lock.Unlock();

  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.

  client_lock.Lock();
  ceph_assert(initialized);
  initialized = false;
  timer.shutdown();
  client_lock.Unlock();

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
608
609
610// ===================
611// metadata cache stuff
612
// Shrink the dentry LRU down to client_cache_size (everything when
// unmounting).  Loops until a pass trims nothing, since trimming one
// dentry can make others expirable.  Optionally asks the kernel to drop
// its dcache too, and frees the root inode once nothing references it.
void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    // When unmounting, ignore the size target and trim everything.
    if (!unmounting && lru.lru_get_size() <= max)  break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    delete root;
    root = 0;
    root_ancestor = 0;
    while (!root_parents.empty())
      root_parents.erase(root_parents.begin());
    inode_map.clear();
    _reset_faked_inos();
  }
}
646
647void Client::trim_cache_for_reconnect(MetaSession *s)
648{
649 mds_rank_t mds = s->mds_num;
11fdf7f2 650 ldout(cct, 20) << __func__ << " mds." << mds << dendl;
7c673cae
FG
651
652 int trimmed = 0;
653 list<Dentry*> skipped;
654 while (lru.lru_get_size() > 0) {
655 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
656 if (!dn)
657 break;
658
659 if ((dn->inode && dn->inode->caps.count(mds)) ||
660 dn->dir->parent_inode->caps.count(mds)) {
661 trim_dentry(dn);
662 trimmed++;
663 } else
664 skipped.push_back(dn);
665 }
666
667 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
668 lru.lru_insert_mid(*p);
669
11fdf7f2 670 ldout(cct, 20) << __func__ << " mds." << mds
7c673cae
FG
671 << " trimmed " << trimmed << " dentries" << dendl;
672
673 if (s->caps.size() > 0)
674 _invalidate_kernel_dcache();
675}
676
// Unlink one dentry from the cache.  If it pointed at an inode, the
// parent directory can no longer be considered complete, so bump its
// release count and clear I_COMPLETE/I_DIR_ORDERED.
void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
		 << " in dir "
		 << std::hex << dn->dir->parent_inode->ino << std::dec
		 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    diri->dir_release_count++;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false);  // drop dir, drop dentry
}
690
691
1adf2230
AA
// Apply MDS-reported size/truncation state to an inode.  truncate_seq is
// the ordering authority: a newer seq (or same seq with a larger size)
// wins; on a seq bump, cached and inline file data past the truncation
// point is discarded.
void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
				    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
	in->inline_data.splice(size, len - size);
    }
  }
  // truncate_size can change without a seq bump (e.g. partial-object
  // truncation progress), but only files track it.
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
733
// Apply MDS-reported ctime/mtime/atime to an inode.  time_warp_seq orders
// competing updates: if we hold caps that let us mutate times locally,
// only adopt MDS values with a newer (or merge with an equal) warp seq;
// without such caps, take the MDS values wholesale unless they're stale.
void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
				    utime_t ctime, utime_t mtime, utime_t atime)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
		 << " ctime " << ctime << " mtime " << mtime << dendl;

  if (time_warp_seq > in->time_warp_seq)
    ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
		   << " is higher than local time_warp_seq "
		   << in->time_warp_seq << dendl;

  int warn = false;
  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
		CEPH_CAP_FILE_WR|
		CEPH_CAP_FILE_BUFFER|
		CEPH_CAP_AUTH_EXCL|
		CEPH_CAP_XATTR_EXCL)) {
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      //the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      //take max times
      if (mtime > in->mtime)
	in->mtime = mtime;
      if (atime > in->atime)
	in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      //ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    // MDS sent times with an older warp seq than ours without us holding
    // EXCL — unexpected; log loudly but keep our values.
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
		  << time_warp_seq << " is lower than local time_warp_seq "
		  << in->time_warp_seq
		  << dendl;
  }
}
785
786void Client::_fragmap_remove_non_leaves(Inode *in)
787{
788 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
789 if (!in->dirfragtree.is_leaf(p->first))
790 in->fragmap.erase(p++);
791 else
792 ++p;
793}
794
795void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
796{
797 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
798 if (p->second == mds)
799 in->fragmap.erase(p++);
800 else
801 ++p;
802}
803
// Create or refresh a cached Inode from an MDS-supplied InodeStat, then
// install the accompanying cap (or snap caps for snapped inodes).
// Each group of fields is only overwritten when the stat is authoritative
// for it: either the stat carries a strictly newer version from the auth
// MDS (new_version) or it grants a SHARED cap we didn't already hold
// (new_issued) and we don't hold the matching EXCL cap locally.
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // First inode ever seen becomes the root.
      root = in;
      if (use_faked_inos())
	_assign_faked_root(root);
      root_ancestor = in;
      cwd = root;
    } else if (!mounted) {
      // Pre-mount path walk: track ancestors above the mount root.
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
			   st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in;   // as with readdir returning inodes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
		   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
		   st->cap.flags, request_perms);
    // max_size/rstat are only authoritative from the auth cap's session.
    if (in->auth_cap && in->auth_cap->session == session) {
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
	(st->cap.caps & CEPH_CAP_FILE_SHARED) &&
	(issued & CEPH_CAP_FILE_EXCL) == 0 &&
	in->dirstat.nfiles == 0 &&
	in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
	ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
	in->dir->readdir_cache.clear();
	for (const auto& p : in->dir->dentries) {
	  unlink(p.second, true, true);  // keep dir, keep dentry
	}
	if (in->dir->dentries.empty())
	  close_dir(in->dir);
      }
    }
  } else {
    // Snapped inodes have no real caps; just remember what was granted.
    in->snap_caps |= st->cap.caps;
  }

  return in;
}
959
960
961/*
962 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
963 */
// Link dname -> in inside 'dir', reusing an existing dentry when it
// already points at the right inode, relinking it when it points at the
// wrong one, and migrating 'old_dentry' (a rename source) when given.
// Finishes by applying the dentry lease from the MDS reply.
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
		     << " with correct vino " << dn->inode->vino()
		     << dendl;
    } else {
      // Same name, different inode: detach the stale inode but keep the
      // dentry shell so it can be relinked below.
      ldout(cct, 12) << " had dentry " << dname
		     << " with WRONG vino " << dn->inode->vino()
		     << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // Hold a temporary ref so unlinking old_dentry can't drop 'in'.
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	// Cross-directory move: the source dir loses ordering.
	Inode *old_diri = old_dentry->dir->parent_inode;
	old_diri->dir_ordered_count++;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if its the same dir
    }
    Inode *diri = dir->parent_inode;
    diri->dir_ordered_count++;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
1009
1010void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
1011{
1012 utime_t dttl = from;
1013 dttl += (float)dlease->duration_ms / 1000.0;
1014
11fdf7f2 1015 ceph_assert(dn);
7c673cae
FG
1016
1017 if (dlease->mask & CEPH_LOCK_DN) {
1018 if (dttl > dn->lease_ttl) {
1019 ldout(cct, 10) << "got dentry lease on " << dn->name
1020 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
1021 dn->lease_ttl = dttl;
1022 dn->lease_mds = session->mds_num;
1023 dn->lease_seq = dlease->seq;
1024 dn->lease_gen = session->cap_gen;
1025 }
1026 }
1027 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
1028}
1029
1030
1031/*
1032 * update MDS location cache for a single inode
1033 */
1034void Client::update_dir_dist(Inode *in, DirStat *dst)
1035{
1036 // auth
1037 ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
1038 if (dst->auth >= 0) {
1039 in->fragmap[dst->frag] = dst->auth;
1040 } else {
1041 in->fragmap.erase(dst->frag);
1042 }
1043 if (!in->dirfragtree.is_leaf(dst->frag)) {
1044 in->dirfragtree.force_to_leaf(cct, dst->frag);
1045 _fragmap_remove_non_leaves(in);
1046 }
1047
1048 // replicated
1049 in->dir_replicated = !dst->dist.empty(); // FIXME that's just one frag!
1050
1051 // dist
1052 /*
1053 if (!st->dirfrag_dist.empty()) { // FIXME
1054 set<int> dist = st->dirfrag_dist.begin()->second;
1055 if (dist.empty() && !in->dir_contacts.empty())
1056 ldout(cct, 9) << "lost dist spec for " << in->ino
1057 << " " << dist << dendl;
1058 if (!dist.empty() && in->dir_contacts.empty())
1059 ldout(cct, 9) << "got dist spec for " << in->ino
1060 << " " << dist << dendl;
1061 in->dir_contacts = dist;
1062 }
1063 */
1064}
1065
1066void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1067{
1068 if (diri->flags & I_COMPLETE) {
1069 if (complete) {
1070 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1071 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1072 } else {
1073 if (diri->flags & I_DIR_ORDERED) {
1074 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1075 diri->flags &= ~I_DIR_ORDERED;
1076 }
1077 }
1078 if (diri->dir)
1079 diri->dir->readdir_cache.clear();
1080 }
1081}
1082
1083/*
1084 * insert results from readdir or lssnap into the metadata cache.
1085 */
1086void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {
1087
11fdf7f2 1088 auto& reply = request->reply;
7c673cae 1089 ConnectionRef con = request->reply->get_connection();
11fdf7f2
TL
1090 uint64_t features;
1091 if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
1092 features = (uint64_t)-1;
1093 }
1094 else {
1095 features = con->get_features();
1096 }
7c673cae
FG
1097
1098 dir_result_t *dirp = request->dirp;
11fdf7f2 1099 ceph_assert(dirp);
7c673cae
FG
1100
1101 // the extra buffer list is only set for readdir and lssnap replies
11fdf7f2 1102 auto p = reply->get_extra_bl().cbegin();
7c673cae
FG
1103 if (!p.end()) {
1104 // snapdir?
1105 if (request->head.op == CEPH_MDS_OP_LSSNAP) {
11fdf7f2 1106 ceph_assert(diri);
7c673cae
FG
1107 diri = open_snapdir(diri);
1108 }
1109
1110 // only open dir if we're actually adding stuff to it!
1111 Dir *dir = diri->open_dir();
11fdf7f2 1112 ceph_assert(dir);
7c673cae
FG
1113
1114 // dirstat
11fdf7f2 1115 DirStat dst(p, features);
7c673cae
FG
1116 __u32 numdn;
1117 __u16 flags;
11fdf7f2
TL
1118 decode(numdn, p);
1119 decode(flags, p);
7c673cae
FG
1120
1121 bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
1122 bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);
1123
1124 frag_t fg = (unsigned)request->head.args.readdir.frag;
1125 unsigned readdir_offset = dirp->next_offset;
1126 string readdir_start = dirp->last_name;
11fdf7f2 1127 ceph_assert(!readdir_start.empty() || readdir_offset == 2);
7c673cae
FG
1128
1129 unsigned last_hash = 0;
1130 if (hash_order) {
1131 if (!readdir_start.empty()) {
1132 last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
1133 } else if (flags & CEPH_READDIR_OFFSET_HASH) {
1134 /* mds understands offset_hash */
1135 last_hash = (unsigned)request->head.args.readdir.offset_hash;
1136 }
1137 }
1138
1139 if (fg != dst.frag) {
1140 ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
1141 fg = dst.frag;
1142 if (!hash_order) {
1143 readdir_offset = 2;
1144 readdir_start.clear();
1145 dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
1146 }
1147 }
1148
1149 ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
1150 << ", hash_order=" << hash_order
1151 << ", readdir_start " << readdir_start
1152 << ", last_hash " << last_hash
1153 << ", next_offset " << readdir_offset << dendl;
1154
1155 if (diri->snapid != CEPH_SNAPDIR &&
1156 fg.is_leftmost() && readdir_offset == 2 &&
1157 !(hash_order && last_hash)) {
1158 dirp->release_count = diri->dir_release_count;
1159 dirp->ordered_count = diri->dir_ordered_count;
1160 dirp->start_shared_gen = diri->shared_gen;
1161 dirp->cache_index = 0;
1162 }
1163
1164 dirp->buffer_frag = fg;
1165
1166 _readdir_drop_dirp_buffer(dirp);
1167 dirp->buffer.reserve(numdn);
1168
1169 string dname;
1170 LeaseStat dlease;
1171 for (unsigned i=0; i<numdn; i++) {
11fdf7f2
TL
1172 decode(dname, p);
1173 dlease.decode(p, features);
7c673cae
FG
1174 InodeStat ist(p, features);
1175
1176 ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;
1177
1178 Inode *in = add_update_inode(&ist, request->sent_stamp, session,
1179 request->perms);
1180 Dentry *dn;
1181 if (diri->dir->dentries.count(dname)) {
1182 Dentry *olddn = diri->dir->dentries[dname];
1183 if (olddn->inode != in) {
1184 // replace incorrect dentry
1185 unlink(olddn, true, true); // keep dir, dentry
1186 dn = link(dir, dname, in, olddn);
11fdf7f2 1187 ceph_assert(dn == olddn);
7c673cae
FG
1188 } else {
1189 // keep existing dn
1190 dn = olddn;
1191 touch_dn(dn);
1192 }
1193 } else {
1194 // new dn
1195 dn = link(dir, dname, in, NULL);
1196 }
1197
1198 update_dentry_lease(dn, &dlease, request->sent_stamp, session);
1199 if (hash_order) {
1200 unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
1201 if (hash != last_hash)
1202 readdir_offset = 2;
1203 last_hash = hash;
1204 dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
1205 } else {
1206 dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
1207 }
1208 // add to readdir cache
1209 if (dirp->release_count == diri->dir_release_count &&
1210 dirp->ordered_count == diri->dir_ordered_count &&
1211 dirp->start_shared_gen == diri->shared_gen) {
1212 if (dirp->cache_index == dir->readdir_cache.size()) {
1213 if (i == 0) {
11fdf7f2 1214 ceph_assert(!dirp->inode->is_complete_and_ordered());
7c673cae
FG
1215 dir->readdir_cache.reserve(dirp->cache_index + numdn);
1216 }
1217 dir->readdir_cache.push_back(dn);
1218 } else if (dirp->cache_index < dir->readdir_cache.size()) {
1219 if (dirp->inode->is_complete_and_ordered())
11fdf7f2 1220 ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
7c673cae
FG
1221 else
1222 dir->readdir_cache[dirp->cache_index] = dn;
1223 } else {
11fdf7f2 1224 ceph_abort_msg("unexpected readdir buffer idx");
7c673cae
FG
1225 }
1226 dirp->cache_index++;
1227 }
1228 // add to cached result list
1229 dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
1230 ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
1231 }
1232
1233 if (numdn > 0)
1234 dirp->last_name = dname;
1235 if (end)
1236 dirp->next_offset = 2;
1237 else
1238 dirp->next_offset = readdir_offset;
1239
1240 if (dir->is_empty())
1241 close_dir(dir);
1242 }
1243}
1244
1245/** insert_trace
1246 *
1247 * insert a trace from a MDS reply into the cache.
1248 */
1249Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
1250{
11fdf7f2 1251 auto& reply = request->reply;
7c673cae
FG
1252 int op = request->get_op();
1253
1254 ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
1255 << " is_target=" << (int)reply->head.is_target
1256 << " is_dentry=" << (int)reply->head.is_dentry
1257 << dendl;
1258
11fdf7f2 1259 auto p = reply->get_trace_bl().cbegin();
7c673cae
FG
1260 if (request->got_unsafe) {
1261 ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
11fdf7f2 1262 ceph_assert(p.end());
7c673cae
FG
1263 return NULL;
1264 }
1265
1266 if (p.end()) {
1267 ldout(cct, 10) << "insert_trace -- no trace" << dendl;
1268
1269 Dentry *d = request->dentry();
1270 if (d) {
1271 Inode *diri = d->dir->parent_inode;
1272 diri->dir_release_count++;
1273 clear_dir_complete_and_ordered(diri, true);
1274 }
1275
1276 if (d && reply->get_result() == 0) {
1277 if (op == CEPH_MDS_OP_RENAME) {
1278 // rename
1279 Dentry *od = request->old_dentry();
1280 ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
11fdf7f2 1281 ceph_assert(od);
7c673cae
FG
1282 unlink(od, true, true); // keep dir, dentry
1283 } else if (op == CEPH_MDS_OP_RMDIR ||
1284 op == CEPH_MDS_OP_UNLINK) {
1285 // unlink, rmdir
1286 ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
1287 unlink(d, true, true); // keep dir, dentry
1288 }
1289 }
1290 return NULL;
1291 }
1292
1293 ConnectionRef con = request->reply->get_connection();
11fdf7f2
TL
1294 uint64_t features;
1295 if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
1296 features = (uint64_t)-1;
1297 }
1298 else {
1299 features = con->get_features();
1300 }
7c673cae
FG
1301 ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;
1302
1303 // snap trace
1304 SnapRealm *realm = NULL;
1305 if (reply->snapbl.length())
1306 update_snap_trace(reply->snapbl, &realm);
1307
1308 ldout(cct, 10) << " hrm "
1309 << " is_target=" << (int)reply->head.is_target
1310 << " is_dentry=" << (int)reply->head.is_dentry
1311 << dendl;
1312
1313 InodeStat dirst;
1314 DirStat dst;
1315 string dname;
1316 LeaseStat dlease;
1317 InodeStat ist;
1318
1319 if (reply->head.is_dentry) {
1320 dirst.decode(p, features);
11fdf7f2
TL
1321 dst.decode(p, features);
1322 decode(dname, p);
1323 dlease.decode(p, features);
7c673cae
FG
1324 }
1325
1326 Inode *in = 0;
1327 if (reply->head.is_target) {
1328 ist.decode(p, features);
1329 if (cct->_conf->client_debug_getattr_caps) {
1330 unsigned wanted = 0;
1331 if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
1332 wanted = request->head.args.getattr.mask;
1333 else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
1334 wanted = request->head.args.open.mask;
1335
1336 if ((wanted & CEPH_CAP_XATTR_SHARED) &&
1337 !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
11fdf7f2 1338 ceph_abort_msg("MDS reply does not contain xattrs");
7c673cae
FG
1339 }
1340
1341 in = add_update_inode(&ist, request->sent_stamp, session,
1342 request->perms);
1343 }
1344
1345 Inode *diri = NULL;
1346 if (reply->head.is_dentry) {
1347 diri = add_update_inode(&dirst, request->sent_stamp, session,
1348 request->perms);
1349 update_dir_dist(diri, &dst); // dir stat info is attached to ..
1350
1351 if (in) {
1352 Dir *dir = diri->open_dir();
1353 insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
1354 (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
1355 } else {
1356 Dentry *dn = NULL;
1357 if (diri->dir && diri->dir->dentries.count(dname)) {
1358 dn = diri->dir->dentries[dname];
1359 if (dn->inode) {
1360 diri->dir_ordered_count++;
1361 clear_dir_complete_and_ordered(diri, false);
1362 unlink(dn, true, true); // keep dir, dentry
1363 }
1364 }
1365 if (dlease.duration_ms > 0) {
1366 if (!dn) {
1367 Dir *dir = diri->open_dir();
1368 dn = link(dir, dname, NULL, NULL);
1369 }
1370 update_dentry_lease(dn, &dlease, request->sent_stamp, session);
1371 }
1372 }
1373 } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
1374 op == CEPH_MDS_OP_MKSNAP) {
1375 ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
1376 // fake it for snap lookup
1377 vinodeno_t vino = ist.vino;
1378 vino.snapid = CEPH_SNAPDIR;
11fdf7f2 1379 ceph_assert(inode_map.count(vino));
7c673cae
FG
1380 diri = inode_map[vino];
1381
1382 string dname = request->path.last_dentry();
1383
1384 LeaseStat dlease;
1385 dlease.duration_ms = 0;
1386
1387 if (in) {
1388 Dir *dir = diri->open_dir();
1389 insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
1390 } else {
1391 if (diri->dir && diri->dir->dentries.count(dname)) {
1392 Dentry *dn = diri->dir->dentries[dname];
1393 if (dn->inode)
1394 unlink(dn, true, true); // keep dir, dentry
1395 }
1396 }
1397 }
1398
1399 if (in) {
1400 if (op == CEPH_MDS_OP_READDIR ||
1401 op == CEPH_MDS_OP_LSSNAP) {
1402 insert_readdir_results(request, session, in);
1403 } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
1404 // hack: return parent inode instead
1405 in = diri;
1406 }
1407
1408 if (request->dentry() == NULL && in != request->inode()) {
1409 // pin the target inode if its parent dentry is not pinned
1410 request->set_other_inode(in);
1411 }
1412 }
1413
1414 if (realm)
1415 put_snap_realm(realm);
1416
1417 request->target = in;
1418 return in;
1419}
1420
1421// -------
1422
1423mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
1424{
1425 mds_rank_t mds = MDS_RANK_NONE;
1426 __u32 hash = 0;
1427 bool is_hash = false;
1428
1429 Inode *in = NULL;
1430 Dentry *de = NULL;
7c673cae
FG
1431
1432 if (req->resend_mds >= 0) {
1433 mds = req->resend_mds;
1434 req->resend_mds = -1;
11fdf7f2 1435 ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
7c673cae
FG
1436 goto out;
1437 }
1438
1439 if (cct->_conf->client_use_random_mds)
1440 goto random_mds;
1441
1442 in = req->inode();
1443 de = req->dentry();
1444 if (in) {
11fdf7f2 1445 ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
7c673cae
FG
1446 if (req->path.depth()) {
1447 hash = in->hash_dentry_name(req->path[0]);
11fdf7f2 1448 ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
7c673cae
FG
1449 << " on " << req->path[0]
1450 << " => " << hash << dendl;
1451 is_hash = true;
1452 }
1453 } else if (de) {
1454 if (de->inode) {
1455 in = de->inode.get();
11fdf7f2 1456 ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
7c673cae
FG
1457 } else {
1458 in = de->dir->parent_inode;
1459 hash = in->hash_dentry_name(de->name);
11fdf7f2 1460 ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
7c673cae
FG
1461 << " on " << de->name
1462 << " => " << hash << dendl;
1463 is_hash = true;
1464 }
1465 }
1466 if (in) {
1467 if (in->snapid != CEPH_NOSNAP) {
11fdf7f2 1468 ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
7c673cae
FG
1469 while (in->snapid != CEPH_NOSNAP) {
1470 if (in->snapid == CEPH_SNAPDIR)
1471 in = in->snapdir_parent.get();
11fdf7f2 1472 else if (!in->dentries.empty())
7c673cae
FG
1473 /* In most cases there will only be one dentry, so getting it
1474 * will be the correct action. If there are multiple hard links,
1475 * I think the MDS should be able to redirect as needed*/
1476 in = in->get_first_parent()->dir->parent_inode;
1477 else {
1478 ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
1479 break;
1480 }
1481 }
1482 is_hash = false;
1483 }
1484
11fdf7f2 1485 ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
7c673cae
FG
1486 << " hash=" << hash << dendl;
1487
1488 if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
1489 frag_t fg = in->dirfragtree[hash];
1490 if (in->fragmap.count(fg)) {
1491 mds = in->fragmap[fg];
1492 if (phash_diri)
1493 *phash_diri = in;
91327a77
AA
1494 } else if (in->auth_cap) {
1495 mds = in->auth_cap->session->mds_num;
1496 }
1497 if (mds >= 0) {
11fdf7f2 1498 ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
7c673cae
FG
1499 goto out;
1500 }
1501 }
1502
11fdf7f2
TL
1503 if (in->auth_cap && req->auth_is_best()) {
1504 mds = in->auth_cap->session->mds_num;
1505 } else if (!in->caps.empty()) {
1506 mds = in->caps.begin()->second.session->mds_num;
1507 } else {
7c673cae 1508 goto random_mds;
11fdf7f2
TL
1509 }
1510 ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;
7c673cae
FG
1511
1512 goto out;
1513 }
1514
1515random_mds:
1516 if (mds < 0) {
1517 mds = _get_random_up_mds();
1518 ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
1519 }
1520
1521out:
1522 ldout(cct, 20) << "mds is " << mds << dendl;
1523 return mds;
1524}
1525
1526
1527void Client::connect_mds_targets(mds_rank_t mds)
1528{
11fdf7f2
TL
1529 ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
1530 ceph_assert(mds_sessions.count(mds));
7c673cae
FG
1531 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1532 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1533 q != info.export_targets.end();
1534 ++q) {
1535 if (mds_sessions.count(*q) == 0 &&
1536 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1537 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1538 << " export target mds." << *q << dendl;
1539 _open_mds_session(*q);
1540 }
1541 }
1542}
1543
1544void Client::dump_mds_sessions(Formatter *f)
1545{
1546 f->dump_int("id", get_nodeid().v);
11fdf7f2 1547 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1adf2230
AA
1548 f->dump_object("inst", inst);
1549 f->dump_stream("inst_str") << inst;
1550 f->dump_stream("addr_str") << inst.addr;
7c673cae 1551 f->open_array_section("sessions");
11fdf7f2 1552 for (const auto &p : mds_sessions) {
7c673cae 1553 f->open_object_section("session");
11fdf7f2 1554 p.second.dump(f);
7c673cae
FG
1555 f->close_section();
1556 }
1557 f->close_section();
1558 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1559}
1560void Client::dump_mds_requests(Formatter *f)
1561{
1562 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1563 p != mds_requests.end();
1564 ++p) {
1565 f->open_object_section("request");
1566 p->second->dump(f);
1567 f->close_section();
1568 }
1569}
1570
1571int Client::verify_reply_trace(int r,
11fdf7f2 1572 MetaRequest *request, const MConstRef<MClientReply>& reply,
7c673cae
FG
1573 InodeRef *ptarget, bool *pcreated,
1574 const UserPerm& perms)
1575{
1576 // check whether this request actually did the create, and set created flag
1577 bufferlist extra_bl;
1578 inodeno_t created_ino;
1579 bool got_created_ino = false;
1580 ceph::unordered_map<vinodeno_t, Inode*>::iterator p;
1581
11fdf7f2 1582 extra_bl = reply->get_extra_bl();
7c673cae
FG
1583 if (extra_bl.length() >= 8) {
1584 // if the extra bufferlist has a buffer, we assume its the created inode
1585 // and that this request to create succeeded in actually creating
1586 // the inode (won the race with other create requests)
11fdf7f2 1587 decode(created_ino, extra_bl);
7c673cae
FG
1588 got_created_ino = true;
1589 ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
1590 }
1591
1592 if (pcreated)
1593 *pcreated = got_created_ino;
1594
1595 if (request->target) {
1596 *ptarget = request->target;
1597 ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
1598 } else {
1599 if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
1600 (*ptarget) = p->second;
1601 ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
1602 } else {
1603 // we got a traceless reply, and need to look up what we just
1604 // created. for now, do this by name. someday, do this by the
1605 // ino... which we know! FIXME.
1606 InodeRef target;
1607 Dentry *d = request->dentry();
1608 if (d) {
1609 if (d->dir) {
1610 ldout(cct, 10) << "make_request got traceless reply, looking up #"
1611 << d->dir->parent_inode->ino << "/" << d->name
1612 << " got_ino " << got_created_ino
1613 << " ino " << created_ino
1614 << dendl;
1615 r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
1616 &target, perms);
1617 } else {
1618 // if the dentry is not linked, just do our best. see #5021.
11fdf7f2 1619 ceph_abort_msg("how did this happen? i want logs!");
7c673cae
FG
1620 }
1621 } else {
1622 Inode *in = request->inode();
1623 ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
1624 << in->ino << dendl;
1625 r = _getattr(in, request->regetattr_mask, perms, true);
1626 target = in;
1627 }
1628 if (r >= 0) {
1629 // verify ino returned in reply and trace_dist are the same
1630 if (got_created_ino &&
1631 created_ino.val != target->ino.val) {
1632 ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
1633 r = -EINTR;
1634 }
1635 if (ptarget)
1636 ptarget->swap(target);
1637 }
1638 }
1639 }
1640
1641 return r;
1642}
1643
1644
1645/**
1646 * make a request
1647 *
1648 * Blocking helper to make an MDS request.
1649 *
1650 * If the ptarget flag is set, behavior changes slightly: the caller
1651 * expects to get a pointer to the inode we are creating or operating
1652 * on. As a result, we will follow up any traceless mutation reply
1653 * with a getattr or lookup to transparently handle a traceless reply
1654 * from the MDS (as when the MDS restarts and the client has to replay
1655 * a request).
1656 *
1657 * @param request the MetaRequest to execute
1658 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1659 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1660 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1661 * @param use_mds [optional] prefer a specific mds (-1 for default)
1662 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1663 */
1664int Client::make_request(MetaRequest *request,
1665 const UserPerm& perms,
1666 InodeRef *ptarget, bool *pcreated,
1667 mds_rank_t use_mds,
1668 bufferlist *pdirbl)
1669{
1670 int r = 0;
1671
1672 // assign a unique tid
1673 ceph_tid_t tid = ++last_tid;
1674 request->set_tid(tid);
1675
1676 // and timestamp
1677 request->op_stamp = ceph_clock_now();
1678
1679 // make note
1680 mds_requests[tid] = request->get();
1681 if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
1682 oldest_tid = tid;
1683
1684 request->set_caller_perms(perms);
1685
1686 if (cct->_conf->client_inject_fixed_oldest_tid) {
1687 ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
1688 request->set_oldest_client_tid(1);
1689 } else {
1690 request->set_oldest_client_tid(oldest_tid);
1691 }
1692
1693 // hack target mds?
1694 if (use_mds >= 0)
1695 request->resend_mds = use_mds;
1696
1697 while (1) {
1698 if (request->aborted())
1699 break;
1700
31f18b77
FG
1701 if (blacklisted) {
1702 request->abort(-EBLACKLISTED);
1703 break;
1704 }
1705
7c673cae
FG
1706 // set up wait cond
1707 Cond caller_cond;
1708 request->caller_cond = &caller_cond;
1709
1710 // choose mds
1711 Inode *hash_diri = NULL;
1712 mds_rank_t mds = choose_target_mds(request, &hash_diri);
1713 int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
1714 if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
1715 if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
1716 if (hash_diri) {
1717 ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
1718 _fragmap_remove_stopped_mds(hash_diri, mds);
1719 } else {
1720 ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
1721 request->resend_mds = _get_random_up_mds();
1722 }
1723 } else {
1724 ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
1725 wait_on_list(waiting_for_mdsmap);
1726 }
1727 continue;
1728 }
1729
1730 // open a session?
1731 MetaSession *session = NULL;
1732 if (!have_open_session(mds)) {
1733 session = _get_or_open_mds_session(mds);
1734
1735 // wait
1736 if (session->state == MetaSession::STATE_OPENING) {
1737 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
1738 wait_on_context_list(session->waiting_for_open);
1739 // Abort requests on REJECT from MDS
1740 if (rejected_by_mds.count(mds)) {
1741 request->abort(-EPERM);
1742 break;
1743 }
1744 continue;
1745 }
1746
1747 if (!have_open_session(mds))
1748 continue;
1749 } else {
11fdf7f2 1750 session = &mds_sessions.at(mds);
7c673cae
FG
1751 }
1752
1753 // send request.
1754 send_request(request, session);
1755
1756 // wait for signal
1757 ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
1758 request->kick = false;
1759 while (!request->reply && // reply
1760 request->resend_mds < 0 && // forward
1761 !request->kick)
1762 caller_cond.Wait(client_lock);
1763 request->caller_cond = NULL;
1764
1765 // did we get a reply?
1766 if (request->reply)
1767 break;
1768 }
1769
1770 if (!request->reply) {
11fdf7f2
TL
1771 ceph_assert(request->aborted());
1772 ceph_assert(!request->got_unsafe);
7c673cae
FG
1773 r = request->get_abort_code();
1774 request->item.remove_myself();
1775 unregister_request(request);
11fdf7f2 1776 put_request(request);
7c673cae
FG
1777 return r;
1778 }
1779
1780 // got it!
11fdf7f2 1781 auto reply = std::move(request->reply);
7c673cae
FG
1782 r = reply->get_result();
1783 if (r >= 0)
1784 request->success = true;
1785
1786 // kick dispatcher (we've got it!)
11fdf7f2 1787 ceph_assert(request->dispatch_cond);
7c673cae
FG
1788 request->dispatch_cond->Signal();
1789 ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
1790 request->dispatch_cond = 0;
1791
1792 if (r >= 0 && ptarget)
1793 r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);
1794
1795 if (pdirbl)
11fdf7f2 1796 *pdirbl = reply->get_extra_bl();
7c673cae
FG
1797
1798 // -- log times --
1799 utime_t lat = ceph_clock_now();
1800 lat -= request->sent_stamp;
1801 ldout(cct, 20) << "lat " << lat << dendl;
1802 logger->tinc(l_c_lat, lat);
1803 logger->tinc(l_c_reply, lat);
1804
1805 put_request(request);
7c673cae
FG
1806 return r;
1807}
1808
1809void Client::unregister_request(MetaRequest *req)
1810{
1811 mds_requests.erase(req->tid);
1812 if (req->tid == oldest_tid) {
1813 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1814 while (true) {
1815 if (p == mds_requests.end()) {
1816 oldest_tid = 0;
1817 break;
1818 }
1819 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1820 oldest_tid = p->first;
1821 break;
1822 }
1823 ++p;
1824 }
1825 }
1826 put_request(req);
1827}
1828
1829void Client::put_request(MetaRequest *request)
1830{
1831 if (request->_put()) {
1832 int op = -1;
1833 if (request->success)
1834 op = request->get_op();
1835 InodeRef other_in;
1836 request->take_other_inode(&other_in);
1837 delete request;
1838
1839 if (other_in &&
1840 (op == CEPH_MDS_OP_RMDIR ||
1841 op == CEPH_MDS_OP_RENAME ||
1842 op == CEPH_MDS_OP_RMSNAP)) {
1843 _try_to_trim_inode(other_in.get(), false);
1844 }
1845 }
1846}
1847
1848int Client::encode_inode_release(Inode *in, MetaRequest *req,
1849 mds_rank_t mds, int drop,
1850 int unless, int force)
1851{
11fdf7f2 1852 ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
7c673cae
FG
1853 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
1854 << ", have:" << ", force:" << force << ")" << dendl;
1855 int released = 0;
11fdf7f2
TL
1856 auto it = in->caps.find(mds);
1857 if (it != in->caps.end()) {
1858 Cap &cap = it->second;
7c673cae 1859 drop &= ~(in->dirty_caps | get_caps_used(in));
11fdf7f2
TL
1860 if ((drop & cap.issued) &&
1861 !(unless & cap.issued)) {
1862 ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(cap.issued) << dendl;
1863 cap.issued &= ~drop;
1864 cap.implemented &= ~drop;
7c673cae 1865 released = 1;
11fdf7f2 1866 ldout(cct, 25) << "Now have: " << ccap_string(cap.issued) << dendl;
7c673cae
FG
1867 } else {
1868 released = force;
1869 }
1870 if (released) {
1871 ceph_mds_request_release rel;
1872 rel.ino = in->ino;
11fdf7f2
TL
1873 rel.cap_id = cap.cap_id;
1874 rel.seq = cap.seq;
1875 rel.issue_seq = cap.issue_seq;
1876 rel.mseq = cap.mseq;
1877 rel.caps = cap.implemented;
1878 rel.wanted = cap.wanted;
7c673cae
FG
1879 rel.dname_len = 0;
1880 rel.dname_seq = 0;
1881 req->cap_releases.push_back(MClientRequest::Release(rel,""));
1882 }
1883 }
11fdf7f2 1884 ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
7c673cae
FG
1885 << released << dendl;
1886 return released;
1887}
1888
1889void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
1890 mds_rank_t mds, int drop, int unless)
1891{
11fdf7f2 1892 ldout(cct, 20) << __func__ << " enter(dn:"
7c673cae
FG
1893 << dn << ")" << dendl;
1894 int released = 0;
1895 if (dn->dir)
1896 released = encode_inode_release(dn->dir->parent_inode, req,
1897 mds, drop, unless, 1);
1898 if (released && dn->lease_mds == mds) {
1899 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
11fdf7f2 1900 auto& rel = req->cap_releases.back();
7c673cae
FG
1901 rel.item.dname_len = dn->name.length();
1902 rel.item.dname_seq = dn->lease_seq;
1903 rel.dname = dn->name;
1904 }
11fdf7f2 1905 ldout(cct, 25) << __func__ << " exit(dn:"
7c673cae
FG
1906 << dn << ")" << dendl;
1907}
1908
1909
1910/*
1911 * This requires the MClientRequest *request member to be set.
1912 * It will error out horribly without one.
1913 * Additionally, if you set any *drop member, you'd better have
1914 * set the corresponding dentry!
1915 */
1916void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1917{
11fdf7f2 1918 ldout(cct, 20) << __func__ << " enter (req: "
7c673cae
FG
1919 << req << ", mds: " << mds << ")" << dendl;
1920 if (req->inode_drop && req->inode())
1921 encode_inode_release(req->inode(), req,
1922 mds, req->inode_drop,
1923 req->inode_unless);
1924
1925 if (req->old_inode_drop && req->old_inode())
1926 encode_inode_release(req->old_inode(), req,
1927 mds, req->old_inode_drop,
1928 req->old_inode_unless);
1929 if (req->other_inode_drop && req->other_inode())
1930 encode_inode_release(req->other_inode(), req,
1931 mds, req->other_inode_drop,
1932 req->other_inode_unless);
1933
1934 if (req->dentry_drop && req->dentry())
1935 encode_dentry_release(req->dentry(), req,
1936 mds, req->dentry_drop,
1937 req->dentry_unless);
1938
1939 if (req->old_dentry_drop && req->old_dentry())
1940 encode_dentry_release(req->old_dentry(), req,
1941 mds, req->old_dentry_drop,
1942 req->old_dentry_unless);
11fdf7f2 1943 ldout(cct, 25) << __func__ << " exit (req: "
7c673cae
FG
1944 << req << ", mds " << mds <<dendl;
1945}
1946
1947bool Client::have_open_session(mds_rank_t mds)
1948{
11fdf7f2
TL
1949 const auto &it = mds_sessions.find(mds);
1950 return it != mds_sessions.end() &&
1951 (it->second.state == MetaSession::STATE_OPEN ||
1952 it->second.state == MetaSession::STATE_STALE);
7c673cae
FG
1953}
1954
1955MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1956{
11fdf7f2
TL
1957 const auto &it = mds_sessions.find(mds);
1958 if (it == mds_sessions.end() || it->second.con != con) {
7c673cae 1959 return NULL;
11fdf7f2
TL
1960 } else {
1961 return &it->second;
1962 }
7c673cae
FG
1963}
1964
1965MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1966{
11fdf7f2
TL
1967 auto it = mds_sessions.find(mds);
1968 return it == mds_sessions.end() ? _open_mds_session(mds) : &it->second;
7c673cae
FG
1969}
1970
1971/**
1972 * Populate a map of strings with client-identifying metadata,
1973 * such as the hostname. Call this once at initialization.
1974 */
1975void Client::populate_metadata(const std::string &mount_root)
1976{
1977 // Hostname
1978 struct utsname u;
1979 int r = uname(&u);
1980 if (r >= 0) {
1981 metadata["hostname"] = u.nodename;
1982 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1983 } else {
1984 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1985 }
1986
1987 metadata["pid"] = stringify(getpid());
1988
1989 // Ceph entity id (the '0' in "client.0")
1990 metadata["entity_id"] = cct->_conf->name.get_id();
1991
1992 // Our mount position
1993 if (!mount_root.empty()) {
1994 metadata["root"] = mount_root;
1995 }
1996
1997 // Ceph version
1998 metadata["ceph_version"] = pretty_version_to_str();
1999 metadata["ceph_sha1"] = git_version_to_str();
2000
2001 // Apply any metadata from the user's configured overrides
2002 std::vector<std::string> tokens;
2003 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2004 for (const auto &i : tokens) {
2005 auto eqpos = i.find("=");
2006 // Throw out anything that isn't of the form "<str>=<str>"
2007 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
2008 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2009 continue;
2010 }
2011 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2012 }
2013}
2014
2015/**
2016 * Optionally add or override client metadata fields.
2017 */
2018void Client::update_metadata(std::string const &k, std::string const &v)
2019{
11fdf7f2
TL
2020 std::lock_guard l(client_lock);
2021 ceph_assert(initialized);
7c673cae 2022
11fdf7f2
TL
2023 auto it = metadata.find(k);
2024 if (it != metadata.end()) {
7c673cae 2025 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
11fdf7f2 2026 << "' from '" << it->second << "' to '" << v << "'" << dendl;
7c673cae
FG
2027 }
2028
2029 metadata[k] = v;
2030}
2031
2032MetaSession *Client::_open_mds_session(mds_rank_t mds)
2033{
11fdf7f2
TL
2034 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
2035 auto addrs = mdsmap->get_addrs(mds);
2036 auto em = mds_sessions.emplace(std::piecewise_construct,
2037 std::forward_as_tuple(mds),
2038 std::forward_as_tuple(mds, messenger->connect_to_mds(addrs), addrs));
2039 ceph_assert(em.second); /* not already present */
2040 MetaSession *session = &em.first->second;
7c673cae
FG
2041
2042 // Maybe skip sending a request to open if this MDS daemon
2043 // has previously sent us a REJECT.
2044 if (rejected_by_mds.count(mds)) {
11fdf7f2
TL
2045 if (rejected_by_mds[mds] == session->addrs) {
2046 ldout(cct, 4) << __func__ << " mds." << mds << " skipping "
7c673cae
FG
2047 "because we were rejected" << dendl;
2048 return session;
2049 } else {
11fdf7f2 2050 ldout(cct, 4) << __func__ << " mds." << mds << " old inst "
7c673cae
FG
2051 "rejected us, trying with new inst" << dendl;
2052 rejected_by_mds.erase(mds);
2053 }
2054 }
2055
11fdf7f2
TL
2056 auto m = MClientSession::create(CEPH_SESSION_REQUEST_OPEN);
2057 m->metadata = metadata;
2058 m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
2059 session->con->send_message2(std::move(m));
7c673cae
FG
2060 return session;
2061}
2062
2063void Client::_close_mds_session(MetaSession *s)
2064{
11fdf7f2 2065 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
7c673cae 2066 s->state = MetaSession::STATE_CLOSING;
11fdf7f2 2067 s->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE, s->seq));
7c673cae
FG
2068}
2069
// Finalize teardown of an MDS session: mark it closed, drop the network
// connection, wake anyone blocked on it, release its caps/requests and
// remove it from the session map.
void Client::_closed_mds_session(MetaSession *s)
{
  ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
  s->state = MetaSession::STATE_CLOSED;
  s->con->mark_down();
  // Wake contexts waiting for this session to open, and any thread
  // waiting for unmount progress.
  signal_context_list(s->waiting_for_open);
  mount_cond.Signal();
  // Drop caps issued through this session and kick its pending requests.
  remove_session_caps(s);
  kick_requests_closed(s);
  // Erasing destroys the MetaSession; `s` is dangling after this line.
  mds_sessions.erase(s->mds_num);
}
2081
// Dispatch a session-control message (OPEN/CLOSE/RENEWCAPS/STALE/...)
// from an MDS.  Messages from ranks we have no matching session for are
// discarded.
void Client::handle_client_session(const MConstRef<MClientSession>& m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;

  MetaSession *session = _get_mds_session(from, m->get_connection().get());
  if (!session) {
    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
    return;
  }

  switch (m->get_op()) {
  case CEPH_SESSION_OPEN:
    {
      // Refuse sessions with MDS daemons that lack features we require;
      // remember the rejection so we don't retry the same instance.
      feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
      missing_features -= m->supported_features;
      if (!missing_features.empty()) {
	lderr(cct) << "mds." << from << " lacks required features '"
		   << missing_features << "', closing session " << dendl;
	rejected_by_mds[session->mds_num] = session->addrs;
	_close_mds_session(session);
	_closed_mds_session(session);
	break;
      }
      session->mds_features = std::move(m->supported_features);

      renew_caps(session);
      session->state = MetaSession::STATE_OPEN;
      if (unmounting)
	mount_cond.Signal();
      else
	connect_mds_targets(from);
      signal_context_list(session->waiting_for_open);
      break;
    }

  case CEPH_SESSION_CLOSE:
    _closed_mds_session(session);
    break;

  case CEPH_SESSION_RENEWCAPS:
    // Only honor the reply that matches our most recent renew request.
    if (session->cap_renew_seq == m->get_seq()) {
      bool was_stale = ceph_clock_now() >= session->cap_ttl;
      session->cap_ttl =
	session->last_cap_renew_request + mdsmap->get_session_timeout();
      if (was_stale)
	wake_up_session_caps(session, false);
    }
    break;

  case CEPH_SESSION_STALE:
    // invalidate session caps/leases
    session->cap_gen++;
    session->cap_ttl = ceph_clock_now();
    session->cap_ttl -= 1;
    renew_caps(session);
    break;

  case CEPH_SESSION_RECALL_STATE:
    trim_caps(session, m->get_max_caps());
    break;

  case CEPH_SESSION_FLUSHMSG:
    /* flush cap release */
    // NOTE: the init-statement `m` here deliberately (or not — confirm)
    // shadows the incoming message for the scope of the if; the
    // m->get_seq() on the next statement refers to the outer message.
    if (auto& m = session->release; m) {
      session->con->send_message2(std::move(m));
    }
    session->con->send_message2(MClientSession::create(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
    break;

  case CEPH_SESSION_FORCE_RO:
    force_session_readonly(session);
    break;

  case CEPH_SESSION_REJECT:
    {
      // Remember which instance rejected us (with its reason, if given)
      // so _open_mds_session() won't retry it.
      std::string_view error_str;
      auto it = m->metadata.find("error_string");
      if (it != m->metadata.end())
	error_str = it->second;
      else
	error_str = "unknown error";
      lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;

      rejected_by_mds[session->mds_num] = session->addrs;
      _closed_mds_session(session);
    }
    break;

  default:
    ceph_abort();
  }
}
2175
2176bool Client::_any_stale_sessions() const
2177{
11fdf7f2 2178 ceph_assert(client_lock.is_locked_by_me());
7c673cae 2179
11fdf7f2
TL
2180 for (const auto &p : mds_sessions) {
2181 if (p.second.state == MetaSession::STATE_STALE) {
7c673cae
FG
2182 return true;
2183 }
2184 }
2185
2186 return false;
2187}
2188
// Close every session that has gone stale.
void Client::_kick_stale_sessions()
{
  ldout(cct, 1) << __func__ << dendl;

  for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
    MetaSession &s = it->second;
    // Advance before acting: _closed_mds_session() erases `s` from
    // mds_sessions, which would invalidate the current iterator.
    ++it;
    if (s.state == MetaSession::STATE_STALE)
      _closed_mds_session(&s);
  }
}
2200
// (Re)send a MetaRequest to the given session's MDS.  Builds a fresh
// wire message each time, attaches cap releases (unless we are replaying
// before cap reconnect, in which case they are dropped), and records
// which MDS/mseq the request was sent against.
void Client::send_request(MetaRequest *request, MetaSession *session,
			  bool drop_cap_releases)
{
  // make the request
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
		 << " for mds." << mds << dendl;
  auto r = build_client_request(request);
  if (request->dentry()) {
    r->set_dentry_wanted();
  }
  if (request->got_unsafe) {
    // Already applied by an MDS once: mark as a replayed op and pin the
    // target ino so the MDS can match it.
    r->set_replayed_op();
    if (request->target)
      r->head.ino = request->target->ino;
  } else {
    encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
      request->cap_releases.clear();
    else
      r->releases.swap(request->cap_releases);
  }
  r->set_mdsmap_epoch(mdsmap->get_epoch());
  if (r->head.op == CEPH_MDS_OP_SETXATTR) {
    // setxattr may change the file layout; the MDS needs our osdmap view.
    objecter->with_osdmap([r](const OSDMap& o) {
	r->set_osdmap_epoch(o.get_epoch());
      });
  }

  if (request->mds == -1) {
    // First send of this request: stamp it for latency accounting.
    request->sent_stamp = ceph_clock_now();
    ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
  }
  request->mds = mds;

  // Remember the cap mseq we sent against, used by the ESTALE retry
  // logic in handle_client_reply().
  Inode *in = request->inode();
  if (in) {
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      request->sent_on_mseq = it->second.mseq;
    }
  }

  session->requests.push_back(&request->item);

  ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
  session->con->send_message2(std::move(r));
}
2249
// Build the wire-format MClientRequest for a MetaRequest, deriving the
// filepath from the request's inode/dentry if the caller did not set one
// explicitly.  Side effect: bumps the request's retry_attempt counter.
MClientRequest::ref Client::build_client_request(MetaRequest *request)
{
  auto req = MClientRequest::create(request->get_op());
  req->set_tid(request->tid);
  req->set_stamp(request->op_stamp);
  memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));

  // if the filepath's haven't been set, set them!
  if (request->path.empty()) {
    Inode *in = request->inode();
    Dentry *de = request->dentry();
    if (in)
      in->make_nosnap_relative_path(request->path);
    else if (de) {
      if (de->inode)
	de->inode->make_nosnap_relative_path(request->path);
      else if (de->dir) {
	// Unlinked dentry: path is parent-dir path plus the dentry name.
	de->dir->parent_inode->make_nosnap_relative_path(request->path);
	request->path.push_dentry(de->name);
      }
      else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or appropriately-endowed dentry given!"
		   << dendl;
    } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
		   << " No path, inode, or dentry given!"
		   << dendl;
  }
  req->set_filepath(request->get_filepath());
  req->set_filepath2(request->get_filepath2());
  req->set_data(request->data);
  // Post-increment is intentional: the message carries the attempt count
  // of this send, and the counter is advanced for the next resend.
  req->set_retry_attempt(request->retry_attempt++);
  req->head.num_fwd = request->num_fwd;
  const gid_t *_gids;
  int gid_count = request->perms.get_gids(&_gids);
  req->set_gid_list(gid_count, _gids);
  return req;
}
2287
2288
2289
// An MDS told us it forwarded (or could not handle) one of our requests.
// Detach the request from its old session and wake the caller thread so
// it resends to the destination MDS.
void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
{
  mds_rank_t mds = mds_rank_t(fwd->get_source().num());
  MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
  if (!session) {
    return;
  }
  ceph_tid_t tid = fwd->get_tid();

  if (mds_requests.count(tid) == 0) {
    ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
    return;
  }

  MetaRequest *request = mds_requests[tid];
  ceph_assert(request);

  // reset retry counter
  request->retry_attempt = 0;

  // request not forwarded, or dest mds has no session.
  // resend.
  ldout(cct, 10) << __func__ << " tid " << tid
		 << " fwd " << fwd->get_num_fwd()
		 << " to mds." << fwd->get_dest_mds()
		 << ", resending to " << fwd->get_dest_mds()
		 << dendl;

  // mds == -1 makes the caller pick a target again; resend_mds steers it
  // to the MDS the request was forwarded to.
  request->mds = -1;
  request->item.remove_myself();
  request->num_fwd = fwd->get_num_fwd();
  request->resend_mds = fwd->get_dest_mds();
  request->caller_cond->Signal();
}
2324
2325bool Client::is_dir_operation(MetaRequest *req)
2326{
2327 int op = req->get_op();
2328 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2329 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2330 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2331 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2332 return true;
2333 return false;
2334}
2335
11fdf7f2 2336void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
7c673cae
FG
2337{
2338 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2339 MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2340 if (!session) {
7c673cae
FG
2341 return;
2342 }
2343
2344 ceph_tid_t tid = reply->get_tid();
2345 bool is_safe = reply->is_safe();
2346
2347 if (mds_requests.count(tid) == 0) {
11fdf7f2 2348 lderr(cct) << __func__ << " no pending request on tid " << tid
7c673cae 2349 << " safe is:" << is_safe << dendl;
7c673cae
FG
2350 return;
2351 }
2352 MetaRequest *request = mds_requests.at(tid);
2353
11fdf7f2 2354 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
7c673cae
FG
2355 << " tid " << tid << dendl;
2356
2357 if (request->got_unsafe && !is_safe) {
2358 //duplicate response
2359 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2360 << mds_num << " safe:" << is_safe << dendl;
7c673cae
FG
2361 return;
2362 }
2363
2364 if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
2365 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2366 << " from mds." << request->mds << dendl;
2367 request->send_to_auth = true;
2368 request->resend_mds = choose_target_mds(request);
2369 Inode *in = request->inode();
11fdf7f2 2370 std::map<mds_rank_t, Cap>::const_iterator it;
7c673cae
FG
2371 if (request->resend_mds >= 0 &&
2372 request->resend_mds == request->mds &&
2373 (in == NULL ||
11fdf7f2
TL
2374 (it = in->caps.find(request->resend_mds)) != in->caps.end() ||
2375 request->sent_on_mseq == it->second.mseq)) {
2376 ldout(cct, 20) << "have to return ESTALE" << dendl;
7c673cae
FG
2377 } else {
2378 request->caller_cond->Signal();
7c673cae
FG
2379 return;
2380 }
7c673cae
FG
2381 }
2382
11fdf7f2 2383 ceph_assert(!request->reply);
7c673cae
FG
2384 request->reply = reply;
2385 insert_trace(request, session);
2386
2387 // Handle unsafe reply
2388 if (!is_safe) {
2389 request->got_unsafe = true;
2390 session->unsafe_requests.push_back(&request->unsafe_item);
2391 if (is_dir_operation(request)) {
2392 Inode *dir = request->inode();
11fdf7f2 2393 ceph_assert(dir);
7c673cae
FG
2394 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2395 }
2396 if (request->target) {
2397 InodeRef &in = request->target;
2398 in->unsafe_ops.push_back(&request->unsafe_target_item);
2399 }
2400 }
2401
2402 // Only signal the caller once (on the first reply):
2403 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2404 if (!is_safe || !request->got_unsafe) {
2405 Cond cond;
2406 request->dispatch_cond = &cond;
2407
2408 // wake up waiter
11fdf7f2 2409 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
7c673cae
FG
2410 request->caller_cond->Signal();
2411
2412 // wake for kick back
2413 while (request->dispatch_cond) {
11fdf7f2 2414 ldout(cct, 20) << __func__ << " awaiting kickback on tid " << tid << " " << &cond << dendl;
7c673cae
FG
2415 cond.Wait(client_lock);
2416 }
2417 }
2418
2419 if (is_safe) {
2420 // the filesystem change is committed to disk
2421 // we're done, clean up
2422 if (request->got_unsafe) {
2423 request->unsafe_item.remove_myself();
2424 request->unsafe_dir_item.remove_myself();
2425 request->unsafe_target_item.remove_myself();
2426 signal_cond_list(request->waitfor_safe);
2427 }
2428 request->item.remove_myself();
2429 unregister_request(request);
2430 }
2431 if (unmounting)
2432 mount_cond.Signal();
2433}
2434
2435void Client::_handle_full_flag(int64_t pool)
2436{
2437 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2438 << "on " << pool << dendl;
2439 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2440 // to do this rather than blocking, because otherwise when we fill up we
2441 // potentially lock caps forever on files with dirty pages, and we need
2442 // to be able to release those caps to the MDS so that it can delete files
2443 // and free up space.
2444 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2445
2446 // For all inodes with layouts in this pool and a pending flush write op
2447 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2448 // from ObjectCacher so that it doesn't re-issue the write in response to
2449 // the ENOSPC error.
2450 // Fortunately since we're cancelling everything in a given pool, we don't
2451 // need to know which ops belong to which ObjectSet, we can just blow all
2452 // the un-flushed cached data away and mark any dirty inodes' async_err
2453 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2454 // affecting this pool, and all the objectsets we're purging were also
2455 // in this pool.
2456 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2457 i != inode_map.end(); ++i)
2458 {
2459 Inode *inode = i->second;
2460 if (inode->oset.dirty_or_tx
2461 && (pool == -1 || inode->layout.pool_id == pool)) {
2462 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2463 << " has dirty objects, purging and setting ENOSPC" << dendl;
2464 objectcacher->purge_set(&inode->oset);
2465 inode->set_async_err(-ENOSPC);
2466 }
2467 }
2468
2469 if (cancelled_epoch != (epoch_t)-1) {
2470 set_cap_epoch_barrier(cancelled_epoch);
2471 }
2472}
2473
// Process an incoming OSDMap: detect blacklisting (and un-blacklisting)
// of this client, and propagate cluster/pool FULL flags by cancelling
// outstanding writes with -ENOSPC.
void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
{
  std::set<entity_addr_t> new_blacklists;
  objecter->consume_blacklist_events(&new_blacklists);

  const auto myaddrs = messenger->get_myaddrs();
  bool new_blacklist = false;
  bool prenautilus = objecter->with_osdmap(
    [&](const OSDMap& o) {
      return o.require_osd_release < CEPH_RELEASE_NAUTILUS;
    });
  if (!blacklisted) {
    // Check whether any of our own addresses just got blacklisted,
    // normalizing the address type to match how entries are stored.
    for (auto a : myaddrs.v) {
      // blacklist entries are always TYPE_ANY for nautilus+
      a.set_type(entity_addr_t::TYPE_ANY);
      if (new_blacklists.count(a)) {
	new_blacklist = true;
	break;
      }
      if (prenautilus) {
	// ...except pre-nautilus, they were TYPE_LEGACY
	a.set_type(entity_addr_t::TYPE_LEGACY);
	if (new_blacklists.count(a)) {
	  new_blacklist = true;
	  break;
	}
      }
    }
  }
  if (new_blacklist) {
    auto epoch = objecter->with_osdmap([](const OSDMap &o){
	return o.get_epoch();
      });
    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
    blacklisted = true;

    // All MDS sessions are doomed once we are blacklisted.
    _abort_mds_sessions(-EBLACKLISTED);

    // Since we know all our OSD ops will fail, cancel them all preemtively,
    // so that on an unhealthy cluster we can umount promptly even if e.g.
    // some PGs were inaccessible.
    objecter->op_cancel_writes(-EBLACKLISTED);

  } else if (blacklisted) {
    // Handle case where we were blacklisted but no longer are
    blacklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
	return o.is_blacklisted(myaddrs);});
  }

  // Always subscribe to next osdmap for blacklisted client
  // until this client is not blacklisted.
  if (blacklisted) {
    objecter->maybe_request_map();
  }

  if (objecter->osdmap_full_flag()) {
    // Whole cluster is full.
    _handle_full_flag(-1);
  } else {
    // Accumulate local list of full pools so that I can drop
    // the objecter lock before re-entering objecter in
    // cancel_writes
    std::vector<int64_t> full_pools;

    objecter->with_osdmap([&full_pools](const OSDMap &o) {
	for (const auto& kv : o.get_pools()) {
	  if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
	    full_pools.push_back(kv.first);
	  }
	}
      });

    for (auto p : full_pools)
      _handle_full_flag(p);

    // Subscribe to subsequent maps to watch for the full flag going
    // away. For the global full flag objecter does this for us, but
    // it pays no attention to the per-pool full flag so in this branch
    // we do it ourselves.
    if (!full_pools.empty()) {
      objecter->maybe_request_map();
    }
  }
}
2557
2558
2559// ------------------------
2560// incoming messages
2561
2562
// Messenger entry point: route every incoming message to its handler
// under client_lock.  Returns false for message types we don't own so
// another dispatcher can claim them.
bool Client::ms_dispatch2(const MessageRef &m)
{
  std::lock_guard l(client_lock);
  if (!initialized) {
    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
    return true;
  }

  switch (m->get_type()) {
  // mounting and mds sessions
  case CEPH_MSG_MDS_MAP:
    handle_mds_map(MMDSMap::msgref_cast(m));
    break;
  case CEPH_MSG_FS_MAP:
    handle_fs_map(MFSMap::msgref_cast(m));
    break;
  case CEPH_MSG_FS_MAP_USER:
    handle_fs_map_user(MFSMapUser::msgref_cast(m));
    break;
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(MClientSession::msgref_cast(m));
    break;

  case CEPH_MSG_OSD_MAP:
    handle_osd_map(MOSDMap::msgref_cast(m));
    break;

  // requests
  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
    handle_client_request_forward(MClientRequestForward::msgref_cast(m));
    break;
  case CEPH_MSG_CLIENT_REPLY:
    handle_client_reply(MClientReply::msgref_cast(m));
    break;

  // reclaim reply
  case CEPH_MSG_CLIENT_RECLAIM_REPLY:
    handle_client_reclaim_reply(MClientReclaimReply::msgref_cast(m));
    break;

  case CEPH_MSG_CLIENT_SNAP:
    handle_snap(MClientSnap::msgref_cast(m));
    break;
  case CEPH_MSG_CLIENT_CAPS:
    handle_caps(MClientCaps::msgref_cast(m));
    break;
  case CEPH_MSG_CLIENT_LEASE:
    handle_lease(MClientLease::msgref_cast(m));
    break;
  case MSG_COMMAND_REPLY:
    // Command replies can also come from non-MDS daemons; only claim
    // the MDS ones.
    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
      handle_command_reply(MCommandReply::msgref_cast(m));
    } else {
      return false;
    }
    break;
  case CEPH_MSG_CLIENT_QUOTA:
    handle_quota(MClientQuota::msgref_cast(m));
    break;

  default:
    return false;
  }

  // unmounting?
  if (unmounting) {
    // Opportunistically trim the cache on every message so unmount can
    // make progress, and poke the unmount thread if anything changed.
    ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
		   << "+" << inode_map.size() << dendl;
    long unsigned size = lru.lru_get_size() + inode_map.size();
    trim_cache();
    // NOTE(review): `size <` compares the pre-trim total against the
    // post-trim total, so this branch fires only if the cache GREW while
    // the message said "shrank".  Looks inverted (`>` expected) — confirm
    // against upstream before changing behavior.
    if (size < lru.lru_get_size() + inode_map.size()) {
      ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
      mount_cond.Signal();
    } else {
      ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
		     << "+" << inode_map.size() << dendl;
    }
  }

  return true;
}
2644
11fdf7f2 2645void Client::handle_fs_map(const MConstRef<MFSMap>& m)
7c673cae
FG
2646{
2647 fsmap.reset(new FSMap(m->get_fsmap()));
7c673cae
FG
2648
2649 signal_cond_list(waiting_for_fsmap);
2650
2651 monclient->sub_got("fsmap", fsmap->get_epoch());
2652}
2653
11fdf7f2 2654void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
7c673cae
FG
2655{
2656 fsmap_user.reset(new FSMapUser);
2657 *fsmap_user = m->get_fsmap();
7c673cae
FG
2658
2659 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2660 signal_cond_list(waiting_for_fsmap);
2661}
2662
// Process a new MDSMap: cancel commands aimed at vanished/laggy MDS
// daemons, then walk our sessions and react to each rank's state change
// (reconnect, takeover by a new instance, activation, shutdown).
void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
{
  mds_gid_t old_inc, new_inc;
  // Ignore stale or duplicate maps.
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
		  << " is identical to or older than our "
		  << mdsmap->get_epoch() << dendl;
    return;
  }

  ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;

  // Keep the previous map so we can compare per-rank states below.
  std::unique_ptr<MDSMap> oldmap(new MDSMap);
  oldmap.swap(mdsmap);

  mdsmap->decode(m->get_encoded());

  // Cancel any commands for missing or laggy GIDs
  std::list<ceph_tid_t> cancel_ops;
  auto &commands = command_table.get_commands();
  for (const auto &i : commands) {
    auto &op = i.second;
    const mds_gid_t op_mds_gid = op.mds_gid;
    if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
      cancel_ops.push_back(i.first);
      if (op.outs) {
	std::ostringstream ss;
	ss << "MDS " << op_mds_gid << " went away";
	*(op.outs) = ss.str();
      }
      op.con->mark_down();
      if (op.on_finish) {
	op.on_finish->complete(-ETIMEDOUT);
      }
    }
  }

  for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
       i != cancel_ops.end(); ++i) {
    command_table.erase(*i);
  }

  // reset session
  for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSession *session = &p->second;
    // Advance before acting: _closed_mds_session() below erases the
    // current entry, which would invalidate `p`.
    ++p;

    int oldstate = oldmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_addrs(mds) != session->addrs) {
      // The rank moved to a different instance/address.
      old_inc = oldmap->get_incarnation(mds);
      new_inc = mdsmap->get_incarnation(mds);
      if (old_inc != new_inc) {
	ldout(cct, 1) << "mds incarnation changed from "
		      << old_inc << " to " << new_inc << dendl;
	// Force the state-transition logic below to treat this as a
	// brand-new daemon.
	oldstate = MDSMap::STATE_NULL;
      }
      session->con->mark_down();
      session->addrs = mdsmap->get_addrs(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session);
    } else if (oldstate == newstate)
      continue;  // no change

    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      session->con = messenger->connect_to_mds(session->addrs);
      send_reconnect(session);
    } else if (newstate > MDSMap::STATE_RECONNECT) {
      // The MDS advanced past RECONNECT without us participating; our
      // session on it is unrecoverable.
      if (oldstate < MDSMap::STATE_RECONNECT) {
	ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
	_closed_mds_session(session);
	continue;
      }
      if (newstate >= MDSMap::STATE_ACTIVE) {
	if (oldstate < MDSMap::STATE_ACTIVE) {
	  // kick new requests
	  kick_requests(session);
	  kick_flushing_caps(session);
	  signal_context_list(session->waiting_for_open);
	  wake_up_session_caps(session, true);
	}
	connect_mds_targets(mds);
      }
    } else if (newstate == MDSMap::STATE_NULL &&
	       mds >= mdsmap->get_max_mds()) {
      // Rank no longer exists (cluster shrank).
      _closed_mds_session(session);
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
2764
// Send our cap/snaprealm state to an MDS entering RECONNECT so it can
// rebuild its session for us.  May split the payload across several
// messages when the MDS supports multi-message reconnect.
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  // Drop any queued (now obsolete) cap-release message.
  session->release.reset();

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  early_kick_flushing_caps(session);

  auto m = MClientReconnect::create();
  bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      // If the message is getting huge and the MDS can take a split
      // reconnect, flush what we have and start a new message.
      if (allow_multi &&
	  m->get_approx_size() >= (std::numeric_limits<int>::max() >> 1)) {
	m->mark_more();
	session->con->send_message2(std::move(m));

	m = MClientReconnect::create();
      }

      Cap &cap = it->second;
      ldout(cct, 10) << " caps on " << p->first
		     << " " << ccap_string(cap.issued)
		     << " wants " << ccap_string(in->caps_wanted())
		     << dendl;
      filepath path;
      in->make_long_path(path);
      ldout(cct, 10) << " path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      cap.seq = 0;  // reset seq.
      cap.issue_seq = 0;  // reset seq.
      cap.mseq = 0;  // reset seq.
      // cap gen should catch up with session cap_gen
      if (cap.gen < session->cap_gen) {
	cap.gen = session->cap_gen;
	// Stale cap: claim only PIN; the MDS will re-issue what we need.
	cap.issued = cap.implemented = CEPH_CAP_PIN;
      } else {
	cap.issued = cap.implemented;
      }
      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
	snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
		 cap.cap_id,
		 path.get_ino(), path.get_path(),   // ino
		 in->caps_wanted(), // wanted
		 cap.issued,     // issued
		 in->snaprealm->ino,
		 snap_follows,
		 flockbl);

      // Describe each snaprealm only once per reconnect.
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
	ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
	m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
	did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  if (!allow_multi)
    m->set_encoding_version(0); // use connection features to choose encoding
  session->con->send_message2(std::move(m));

  mount_cond.Signal();

  if (session->reclaim_state == MetaSession::RECLAIMING)
    signal_cond_list(waiting_for_reclaim);
}
2857
2858
2859void Client::kick_requests(MetaSession *session)
2860{
11fdf7f2 2861 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
7c673cae
FG
2862 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2863 p != mds_requests.end();
2864 ++p) {
31f18b77
FG
2865 MetaRequest *req = p->second;
2866 if (req->got_unsafe)
2867 continue;
2868 if (req->aborted()) {
2869 if (req->caller_cond) {
2870 req->kick = true;
2871 req->caller_cond->Signal();
2872 }
7c673cae 2873 continue;
31f18b77
FG
2874 }
2875 if (req->retry_attempt > 0)
7c673cae 2876 continue; // new requests only
31f18b77 2877 if (req->mds == session->mds_num) {
7c673cae
FG
2878 send_request(p->second, session);
2879 }
2880 }
2881}
2882
2883void Client::resend_unsafe_requests(MetaSession *session)
2884{
2885 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2886 !iter.end();
2887 ++iter)
2888 send_request(*iter, session);
2889
2890 // also re-send old requests when MDS enters reconnect stage. So that MDS can
2891 // process completed requests in clientreplay stage.
2892 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2893 p != mds_requests.end();
2894 ++p) {
2895 MetaRequest *req = p->second;
2896 if (req->got_unsafe)
2897 continue;
31f18b77
FG
2898 if (req->aborted())
2899 continue;
7c673cae
FG
2900 if (req->retry_attempt == 0)
2901 continue; // old requests only
2902 if (req->mds == session->mds_num)
2903 send_request(req, session, true);
2904 }
2905}
2906
2907void Client::wait_unsafe_requests()
2908{
2909 list<MetaRequest*> last_unsafe_reqs;
11fdf7f2
TL
2910 for (const auto &p : mds_sessions) {
2911 const MetaSession &s = p.second;
2912 if (!s.unsafe_requests.empty()) {
2913 MetaRequest *req = s.unsafe_requests.back();
7c673cae
FG
2914 req->get();
2915 last_unsafe_reqs.push_back(req);
2916 }
2917 }
2918
2919 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2920 p != last_unsafe_reqs.end();
2921 ++p) {
2922 MetaRequest *req = *p;
2923 if (req->unsafe_item.is_on_list())
2924 wait_on_list(req->waitfor_safe);
2925 put_request(req);
2926 }
2927}
2928
2929void Client::kick_requests_closed(MetaSession *session)
2930{
11fdf7f2 2931 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
7c673cae
FG
2932 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2933 p != mds_requests.end(); ) {
2934 MetaRequest *req = p->second;
2935 ++p;
2936 if (req->mds == session->mds_num) {
2937 if (req->caller_cond) {
2938 req->kick = true;
2939 req->caller_cond->Signal();
2940 }
2941 req->item.remove_myself();
2942 if (req->got_unsafe) {
11fdf7f2 2943 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
7c673cae
FG
2944 req->unsafe_item.remove_myself();
2945 req->unsafe_dir_item.remove_myself();
2946 req->unsafe_target_item.remove_myself();
2947 signal_cond_list(req->waitfor_safe);
2948 unregister_request(req);
2949 }
2950 }
2951 }
11fdf7f2
TL
2952 ceph_assert(session->requests.empty());
2953 ceph_assert(session->unsafe_requests.empty());
7c673cae
FG
2954}
2955
2956
2957
2958
2959/************
2960 * leases
2961 */
2962
2963void Client::got_mds_push(MetaSession *s)
2964{
2965 s->seq++;
2966 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2967 if (s->state == MetaSession::STATE_CLOSING) {
11fdf7f2 2968 s->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_CLOSE, s->seq));
7c673cae
FG
2969 }
2970}
2971
11fdf7f2 2972void Client::handle_lease(const MConstRef<MClientLease>& m)
7c673cae 2973{
11fdf7f2 2974 ldout(cct, 10) << __func__ << " " << *m << dendl;
7c673cae 2975
11fdf7f2 2976 ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
7c673cae
FG
2977
2978 mds_rank_t mds = mds_rank_t(m->get_source().num());
2979 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
2980 if (!session) {
7c673cae
FG
2981 return;
2982 }
2983
2984 got_mds_push(session);
2985
2986 ceph_seq_t seq = m->get_seq();
2987
2988 Inode *in;
2989 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
2990 if (inode_map.count(vino) == 0) {
2991 ldout(cct, 10) << " don't have vino " << vino << dendl;
2992 goto revoke;
2993 }
2994 in = inode_map[vino];
2995
2996 if (m->get_mask() & CEPH_LOCK_DN) {
2997 if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
2998 ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
2999 goto revoke;
3000 }
3001 Dentry *dn = in->dir->dentries[m->dname];
3002 ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
3003 dn->lease_mds = -1;
3004 }
3005
3006 revoke:
11fdf7f2
TL
3007 {
3008 auto reply = MClientLease::create(CEPH_MDS_LEASE_RELEASE, seq, m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname);
3009 m->get_connection()->send_message2(std::move(reply));
3010 }
7c673cae
FG
3011}
3012
3013void Client::put_inode(Inode *in, int n)
3014{
11fdf7f2 3015 ldout(cct, 10) << __func__ << " on " << *in << dendl;
7c673cae
FG
3016 int left = in->_put(n);
3017 if (left == 0) {
3018 // release any caps
3019 remove_all_caps(in);
3020
11fdf7f2 3021 ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
7c673cae 3022 bool unclean = objectcacher->release_set(&in->oset);
11fdf7f2 3023 ceph_assert(!unclean);
7c673cae
FG
3024 inode_map.erase(in->vino());
3025 if (use_faked_inos())
3026 _release_faked_ino(in);
3027
3028 if (in == root) {
3029 root = 0;
3030 root_ancestor = 0;
3031 while (!root_parents.empty())
3032 root_parents.erase(root_parents.begin());
3033 }
3034
3035 delete in;
3036 }
3037}
3038
3039void Client::close_dir(Dir *dir)
3040{
3041 Inode *in = dir->parent_inode;
11fdf7f2
TL
3042 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3043 ceph_assert(dir->is_empty());
3044 ceph_assert(in->dir == dir);
3045 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3046 if (!in->dentries.empty())
7c673cae
FG
3047 in->get_first_parent()->put(); // unpin dentry
3048
3049 delete in->dir;
3050 in->dir = 0;
3051 put_inode(in); // unpin inode
3052}
3053
3054 /**
3055 * Don't call this with in==NULL, use get_or_create for that
3056 * leave dn set to default NULL unless you're trying to add
3057 * a new inode to a pre-created Dentry
3058 */
3059Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
3060{
3061 if (!dn) {
3062 // create a new Dentry
11fdf7f2
TL
3063 dn = new Dentry(dir, name);
3064
7c673cae
FG
3065 lru.lru_insert_mid(dn); // mid or top?
3066
3067 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3068 << " dn " << dn << " (new dn)" << dendl;
3069 } else {
11fdf7f2 3070 ceph_assert(!dn->inode);
7c673cae
FG
3071 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3072 << " dn " << dn << " (old dn)" << dendl;
3073 }
3074
3075 if (in) { // link to inode
11fdf7f2 3076 InodeRef tmp_ref;
7c673cae 3077 // only one parent for directories!
11fdf7f2
TL
3078 if (in->is_dir() && !in->dentries.empty()) {
3079 tmp_ref = in; // prevent unlink below from freeing the inode.
7c673cae 3080 Dentry *olddn = in->get_first_parent();
11fdf7f2 3081 ceph_assert(olddn->dir != dir || olddn->name != name);
7c673cae
FG
3082 Inode *old_diri = olddn->dir->parent_inode;
3083 old_diri->dir_release_count++;
3084 clear_dir_complete_and_ordered(old_diri, true);
3085 unlink(olddn, true, true); // keep dir, dentry
3086 }
3087
11fdf7f2
TL
3088 dn->link(in);
3089 ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
7c673cae
FG
3090 }
3091
3092 return dn;
3093}
3094
3095void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3096{
11fdf7f2 3097 InodeRef in(dn->inode);
7c673cae
FG
3098 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3099 << " inode " << dn->inode << dendl;
3100
3101 // unlink from inode
11fdf7f2
TL
3102 if (dn->inode) {
3103 dn->unlink();
3104 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
7c673cae
FG
3105 }
3106
3107 if (keepdentry) {
3108 dn->lease_mds = -1;
3109 } else {
3110 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3111
3112 // unlink from dir
11fdf7f2
TL
3113 Dir *dir = dn->dir;
3114 dn->detach();
7c673cae
FG
3115
3116 // delete den
3117 lru.lru_remove(dn);
3118 dn->put();
11fdf7f2
TL
3119
3120 if (dir->is_empty() && !keepdir)
3121 close_dir(dir);
7c673cae
FG
3122 }
3123}
3124
3125/**
3126 * For asynchronous flushes, check for errors from the IO and
3127 * update the inode if necessary
3128 */
3129class C_Client_FlushComplete : public Context {
3130private:
3131 Client *client;
3132 InodeRef inode;
3133public:
3134 C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
3135 void finish(int r) override {
11fdf7f2 3136 ceph_assert(client->client_lock.is_locked_by_me());
7c673cae
FG
3137 if (r != 0) {
3138 client_t const whoami = client->whoami; // For the benefit of ldout prefix
3139 ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3140 << " 0x" << std::hex << inode->ino << std::dec
3141 << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3142 inode->set_async_err(r);
3143 }
3144 }
3145};
3146
3147
3148/****
3149 * caps
3150 */
3151
3152void Client::get_cap_ref(Inode *in, int cap)
3153{
3154 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3155 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
11fdf7f2 3156 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
7c673cae
FG
3157 in->get();
3158 }
3159 if ((cap & CEPH_CAP_FILE_CACHE) &&
3160 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
11fdf7f2 3161 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
7c673cae
FG
3162 in->get();
3163 }
3164 in->get_cap_ref(cap);
3165}
3166
3167void Client::put_cap_ref(Inode *in, int cap)
3168{
3169 int last = in->put_cap_ref(cap);
3170 if (last) {
3171 int put_nref = 0;
3172 int drop = last & ~in->caps_issued();
3173 if (in->snapid == CEPH_NOSNAP) {
3174 if ((last & CEPH_CAP_FILE_WR) &&
3175 !in->cap_snaps.empty() &&
3176 in->cap_snaps.rbegin()->second.writing) {
11fdf7f2 3177 ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
7c673cae
FG
3178 in->cap_snaps.rbegin()->second.writing = 0;
3179 finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
3180 signal_cond_list(in->waitfor_caps); // wake up blocked sync writers
3181 }
3182 if (last & CEPH_CAP_FILE_BUFFER) {
3183 for (auto &p : in->cap_snaps)
3184 p.second.dirty_data = 0;
3185 signal_cond_list(in->waitfor_commit);
11fdf7f2 3186 ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
7c673cae
FG
3187 ++put_nref;
3188 }
3189 }
3190 if (last & CEPH_CAP_FILE_CACHE) {
11fdf7f2 3191 ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
7c673cae
FG
3192 ++put_nref;
3193 }
3194 if (drop)
3195 check_caps(in, 0);
3196 if (put_nref)
3197 put_inode(in, put_nref);
3198 }
3199}
3200
3201int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
3202{
3203 int r = check_pool_perm(in, need);
3204 if (r < 0)
3205 return r;
3206
3207 while (1) {
3208 int file_wanted = in->caps_file_wanted();
3209 if ((file_wanted & need) != need) {
3210 ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
3211 << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
3212 << dendl;
3213 return -EBADF;
3214 }
3215
3216 int implemented;
3217 int have = in->caps_issued(&implemented);
3218
3219 bool waitfor_caps = false;
3220 bool waitfor_commit = false;
3221
3222 if (have & need & CEPH_CAP_FILE_WR) {
3223 if (endoff > 0 &&
3224 (endoff >= (loff_t)in->max_size ||
3225 endoff > (loff_t)(in->size << 1)) &&
3226 endoff > (loff_t)in->wanted_max_size) {
3227 ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
3228 in->wanted_max_size = endoff;
3229 check_caps(in, 0);
3230 }
3231
3232 if (endoff >= 0 && endoff > (loff_t)in->max_size) {
3233 ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
3234 waitfor_caps = true;
3235 }
3236 if (!in->cap_snaps.empty()) {
3237 if (in->cap_snaps.rbegin()->second.writing) {
3238 ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
3239 waitfor_caps = true;
3240 }
3241 for (auto &p : in->cap_snaps) {
3242 if (p.second.dirty_data) {
3243 waitfor_commit = true;
3244 break;
3245 }
3246 }
3247 if (waitfor_commit) {
3248 _flush(in, new C_Client_FlushComplete(this, in));
3249 ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
3250 }
3251 }
3252 }
3253
3254 if (!waitfor_caps && !waitfor_commit) {
3255 if ((have & need) == need) {
7c673cae
FG
3256 int revoking = implemented & ~have;
3257 ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
3258 << " need " << ccap_string(need) << " want " << ccap_string(want)
c07f9fc5 3259 << " revoking " << ccap_string(revoking)
7c673cae 3260 << dendl;
c07f9fc5 3261 if ((revoking & want) == 0) {
7c673cae
FG
3262 *phave = need | (have & want);
3263 in->get_cap_ref(need);
3264 return 0;
3265 }
3266 }
3267 ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
3268 waitfor_caps = true;
3269 }
3270
3271 if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
3272 in->auth_cap->session->readonly)
3273 return -EROFS;
3274
3275 if (in->flags & I_CAP_DROPPED) {
3276 int mds_wanted = in->caps_mds_wanted();
3277 if ((mds_wanted & need) != need) {
3278 int ret = _renew_caps(in);
3279 if (ret < 0)
3280 return ret;
3281 continue;
3282 }
a8e16298 3283 if (!(file_wanted & ~mds_wanted))
7c673cae 3284 in->flags &= ~I_CAP_DROPPED;
7c673cae
FG
3285 }
3286
3287 if (waitfor_caps)
3288 wait_on_list(in->waitfor_caps);
3289 else if (waitfor_commit)
3290 wait_on_list(in->waitfor_commit);
3291 }
3292}
3293
3294int Client::get_caps_used(Inode *in)
3295{
3296 unsigned used = in->caps_used();
3297 if (!(used & CEPH_CAP_FILE_CACHE) &&
3298 !objectcacher->set_is_empty(&in->oset))
3299 used |= CEPH_CAP_FILE_CACHE;
3300 return used;
3301}
3302
3303void Client::cap_delay_requeue(Inode *in)
3304{
11fdf7f2 3305 ldout(cct, 10) << __func__ << " on " << *in << dendl;
7c673cae
FG
3306 in->hold_caps_until = ceph_clock_now();
3307 in->hold_caps_until += cct->_conf->client_caps_release_delay;
28e407b8 3308 delayed_list.push_back(&in->delay_cap_item);
7c673cae
FG
3309}
3310
3311void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
3312 bool sync, int used, int want, int retain,
3313 int flush, ceph_tid_t flush_tid)
3314{
3315 int held = cap->issued | cap->implemented;
3316 int revoking = cap->implemented & ~cap->issued;
3317 retain &= ~revoking;
3318 int dropping = cap->issued & ~retain;
3319 int op = CEPH_CAP_OP_UPDATE;
3320
11fdf7f2 3321 ldout(cct, 10) << __func__ << " " << *in
7c673cae
FG
3322 << " mds." << session->mds_num << " seq " << cap->seq
3323 << (sync ? " sync " : " async ")
3324 << " used " << ccap_string(used)
3325 << " want " << ccap_string(want)
3326 << " flush " << ccap_string(flush)
3327 << " retain " << ccap_string(retain)
3328 << " held "<< ccap_string(held)
3329 << " revoking " << ccap_string(revoking)
3330 << " dropping " << ccap_string(dropping)
3331 << dendl;
3332
3333 if (cct->_conf->client_inject_release_failure && revoking) {
3334 const int would_have_issued = cap->issued & retain;
3335 const int would_have_implemented = cap->implemented & (cap->issued | used);
3336 // Simulated bug:
3337 // - tell the server we think issued is whatever they issued plus whatever we implemented
3338 // - leave what we have implemented in place
3339 ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
3340 cap->issued = cap->issued | cap->implemented;
3341
3342 // Make an exception for revoking xattr caps: we are injecting
3343 // failure to release other caps, but allow xattr because client
3344 // will block on xattr ops if it can't release these to MDS (#9800)
3345 const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
3346 cap->issued ^= xattr_mask & revoking;
3347 cap->implemented ^= xattr_mask & revoking;
3348
3349 ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
3350 ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
3351 } else {
3352 // Normal behaviour
3353 cap->issued &= retain;
3354 cap->implemented &= cap->issued | used;
3355 }
3356
3357 snapid_t follows = 0;
3358
3359 if (flush)
3360 follows = in->snaprealm->get_snap_context().seq;
3361
11fdf7f2 3362 auto m = MClientCaps::create(op,
7c673cae
FG
3363 in->ino,
3364 0,
3365 cap->cap_id, cap->seq,
3366 cap->implemented,
3367 want,
3368 flush,
3369 cap->mseq,
3370 cap_epoch_barrier);
3371 m->caller_uid = in->cap_dirtier_uid;
3372 m->caller_gid = in->cap_dirtier_gid;
3373
3374 m->head.issue_seq = cap->issue_seq;
3375 m->set_tid(flush_tid);
3376
3377 m->head.uid = in->uid;
3378 m->head.gid = in->gid;
3379 m->head.mode = in->mode;
3380
3381 m->head.nlink = in->nlink;
3382
3383 if (flush & CEPH_CAP_XATTR_EXCL) {
11fdf7f2 3384 encode(in->xattrs, m->xattrbl);
7c673cae
FG
3385 m->head.xattr_version = in->xattr_version;
3386 }
3387
3388 m->size = in->size;
3389 m->max_size = in->max_size;
3390 m->truncate_seq = in->truncate_seq;
3391 m->truncate_size = in->truncate_size;
3392 m->mtime = in->mtime;
3393 m->atime = in->atime;
3394 m->ctime = in->ctime;
3395 m->btime = in->btime;
3396 m->time_warp_seq = in->time_warp_seq;
3397 m->change_attr = in->change_attr;
3398 if (sync)
11fdf7f2
TL
3399 m->flags |= MClientCaps::FLAG_SYNC;
3400 if (!in->cap_snaps.empty())
3401 m->flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
7c673cae
FG
3402
3403 if (flush & CEPH_CAP_FILE_WR) {
3404 m->inline_version = in->inline_version;
3405 m->inline_data = in->inline_data;
3406 }
3407
3408 in->reported_size = in->size;
3409 m->set_snap_follows(follows);
3410 cap->wanted = want;
3411 if (cap == in->auth_cap) {
3412 m->set_max_size(in->wanted_max_size);
3413 in->requested_max_size = in->wanted_max_size;
3414 ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
3415 }
3416
3417 if (!session->flushing_caps_tids.empty())
3418 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3419
11fdf7f2 3420 session->con->send_message2(std::move(m));
7c673cae
FG
3421}
3422
31f18b77
FG
3423static bool is_max_size_approaching(Inode *in)
3424{
3425 /* mds will adjust max size according to the reported size */
3426 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3427 return false;
3428 if (in->size >= in->max_size)
3429 return true;
3430 /* half of previous max_size increment has been used */
3431 if (in->max_size > in->reported_size &&
3432 (in->size << 1) >= in->max_size + in->reported_size)
3433 return true;
3434 return false;
3435}
7c673cae 3436
11fdf7f2
TL
3437static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3438{
3439 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3440 return used;
3441 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3442 return used;
3443
3444 if (issued & CEPH_CAP_FILE_LAZYIO) {
3445 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3446 used &= ~CEPH_CAP_FILE_CACHE;
3447 used |= CEPH_CAP_FILE_LAZYIO;
3448 }
3449 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3450 used &= ~CEPH_CAP_FILE_BUFFER;
3451 used |= CEPH_CAP_FILE_LAZYIO;
3452 }
3453 } else {
3454 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3455 used &= ~CEPH_CAP_FILE_CACHE;
3456 used |= CEPH_CAP_FILE_LAZYIO;
3457 }
3458 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3459 used &= ~CEPH_CAP_FILE_BUFFER;
3460 used |= CEPH_CAP_FILE_LAZYIO;
3461 }
3462 }
3463 return used;
3464}
3465
7c673cae
FG
3466/**
3467 * check_caps
3468 *
3469 * Examine currently used and wanted versus held caps. Release, flush or ack
3470 * revoked caps to the MDS as appropriate.
3471 *
3472 * @param in the inode to check
3473 * @param flags flags to apply to cap check
3474 */
3475void Client::check_caps(Inode *in, unsigned flags)
3476{
3477 unsigned wanted = in->caps_wanted();
3478 unsigned used = get_caps_used(in);
3479 unsigned cap_used;
3480
7c673cae
FG
3481 int implemented;
3482 int issued = in->caps_issued(&implemented);
3483 int revoking = implemented & ~issued;
3484
11fdf7f2
TL
3485 int orig_used = used;
3486 used = adjust_caps_used_for_lazyio(used, issued, implemented);
3487
7c673cae 3488 int retain = wanted | used | CEPH_CAP_PIN;
a8e16298
TL
3489 if (!unmounting && in->nlink > 0) {
3490 if (wanted) {
7c673cae 3491 retain |= CEPH_CAP_ANY;
a8e16298
TL
3492 } else if (in->is_dir() &&
3493 (issued & CEPH_CAP_FILE_SHARED) &&
3494 (in->flags & I_COMPLETE)) {
3495 // we do this here because we don't want to drop to Fs (and then
3496 // drop the Fs if we do a create!) if that alone makes us send lookups
3497 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3498 wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
3499 retain |= wanted;
3500 } else {
7c673cae 3501 retain |= CEPH_CAP_ANY_SHARED;
a8e16298
TL
3502 // keep RD only if we didn't have the file open RW,
3503 // because then the mds would revoke it anyway to
3504 // journal max_size=0.
3505 if (in->max_size == 0)
3506 retain |= CEPH_CAP_ANY_RD;
3507 }
7c673cae
FG
3508 }
3509
11fdf7f2 3510 ldout(cct, 10) << __func__ << " on " << *in
7c673cae
FG
3511 << " wanted " << ccap_string(wanted)
3512 << " used " << ccap_string(used)
3513 << " issued " << ccap_string(issued)
3514 << " revoking " << ccap_string(revoking)
3515 << " flags=" << flags
3516 << dendl;
3517
3518 if (in->snapid != CEPH_NOSNAP)
3519 return; //snap caps last forever, can't write
3520
3521 if (in->caps.empty())
3522 return; // guard if at end of func
3523
11fdf7f2
TL
3524 if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
3525 (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
94b18763 3526 if (_release(in))
11fdf7f2 3527 used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
94b18763 3528 }
7c673cae
FG
3529
3530 if (!in->cap_snaps.empty())
3531 flush_snaps(in);
3532
11fdf7f2
TL
3533 for (auto &p : in->caps) {
3534 mds_rank_t mds = p.first;
3535 Cap &cap = p.second;
7c673cae 3536
11fdf7f2 3537 MetaSession *session = &mds_sessions.at(mds);
7c673cae
FG
3538
3539 cap_used = used;
11fdf7f2 3540 if (in->auth_cap && &cap != in->auth_cap)
7c673cae
FG
3541 cap_used &= ~in->auth_cap->issued;
3542
11fdf7f2 3543 revoking = cap.implemented & ~cap.issued;
7c673cae
FG
3544
3545 ldout(cct, 10) << " cap mds." << mds
11fdf7f2
TL
3546 << " issued " << ccap_string(cap.issued)
3547 << " implemented " << ccap_string(cap.implemented)
7c673cae
FG
3548 << " revoking " << ccap_string(revoking) << dendl;
3549
3550 if (in->wanted_max_size > in->max_size &&
3551 in->wanted_max_size > in->requested_max_size &&
11fdf7f2 3552 &cap == in->auth_cap)
7c673cae
FG
3553 goto ack;
3554
3555 /* approaching file_max? */
11fdf7f2
TL
3556 if ((cap.issued & CEPH_CAP_FILE_WR) &&
3557 &cap == in->auth_cap &&
31f18b77 3558 is_max_size_approaching(in)) {
7c673cae 3559 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
31f18b77 3560 << ", reported " << in->reported_size << dendl;
7c673cae
FG
3561 goto ack;
3562 }
3563
3564 /* completed revocation? */
3565 if (revoking && (revoking & cap_used) == 0) {
11fdf7f2 3566 ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
7c673cae
FG
3567 goto ack;
3568 }
3569
3570 /* want more caps from mds? */
11fdf7f2 3571 if (wanted & ~(cap.wanted | cap.issued))
7c673cae
FG
3572 goto ack;
3573
3574 if (!revoking && unmounting && (cap_used == 0))
3575 goto ack;
3576
11fdf7f2 3577 if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
a8e16298 3578 !in->dirty_caps) // and we have no dirty caps
7c673cae
FG
3579 continue;
3580
11fdf7f2 3581 if (!(flags & CHECK_CAPS_NODELAY)) {
7c673cae 3582 ldout(cct, 10) << "delaying cap release" << dendl;
11fdf7f2 3583 cap_delay_requeue(in);
7c673cae
FG
3584 continue;
3585 }
3586
3587 ack:
3588 // re-send old cap/snapcap flushes first.
3589 if (session->mds_state >= MDSMap::STATE_RECONNECT &&
3590 session->mds_state < MDSMap::STATE_ACTIVE &&
3591 session->early_flushing_caps.count(in) == 0) {
3592 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3593 << " to mds." << session->mds_num << dendl;
3594 session->early_flushing_caps.insert(in);
3595 if (in->cap_snaps.size())
3596 flush_snaps(in, true);
3597 if (in->flushing_caps)
3598 flush_caps(in, session, flags & CHECK_CAPS_SYNCHRONOUS);
3599 }
3600
3601 int flushing;
3602 ceph_tid_t flush_tid;
11fdf7f2 3603 if (in->auth_cap == &cap && in->dirty_caps) {
7c673cae
FG
3604 flushing = mark_caps_flushing(in, &flush_tid);
3605 } else {
3606 flushing = 0;
3607 flush_tid = 0;
3608 }
3609
11fdf7f2 3610 send_cap(in, session, &cap, flags & CHECK_CAPS_SYNCHRONOUS, cap_used, wanted,
7c673cae
FG
3611 retain, flushing, flush_tid);
3612 }
3613}
3614
3615
3616void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3617{
3618 int used = get_caps_used(in);
3619 int dirty = in->caps_dirty();
11fdf7f2 3620 ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
7c673cae
FG
3621
3622 if (in->cap_snaps.size() &&
3623 in->cap_snaps.rbegin()->second.writing) {
11fdf7f2 3624 ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
7c673cae
FG
3625 return;
3626 } else if (in->caps_dirty() ||
3627 (used & CEPH_CAP_FILE_WR) ||
3628 (dirty & CEPH_CAP_ANY_WR)) {
3629 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
11fdf7f2 3630 ceph_assert(capsnapem.second); /* element inserted */
7c673cae
FG
3631 CapSnap &capsnap = capsnapem.first->second;
3632 capsnap.context = old_snapc;
3633 capsnap.issued = in->caps_issued();
3634 capsnap.dirty = in->caps_dirty();
3635
3636 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
3637
3638 capsnap.uid = in->uid;
3639 capsnap.gid = in->gid;
3640 capsnap.mode = in->mode;
3641 capsnap.btime = in->btime;
3642 capsnap.xattrs = in->xattrs;
3643 capsnap.xattr_version = in->xattr_version;
11fdf7f2
TL
3644 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3645 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
7c673cae
FG
3646
3647 if (used & CEPH_CAP_FILE_WR) {
11fdf7f2 3648 ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
7c673cae
FG
3649 capsnap.writing = 1;
3650 } else {
3651 finish_cap_snap(in, capsnap, used);
3652 }
3653 } else {
11fdf7f2 3654 ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
7c673cae
FG
3655 }
3656}
3657
3658void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
3659{
11fdf7f2 3660 ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
7c673cae
FG
3661 capsnap.size = in->size;
3662 capsnap.mtime = in->mtime;
3663 capsnap.atime = in->atime;
3664 capsnap.ctime = in->ctime;
3665 capsnap.time_warp_seq = in->time_warp_seq;
3666 capsnap.change_attr = in->change_attr;
7c673cae
FG
3667 capsnap.dirty |= in->caps_dirty();
3668
11fdf7f2
TL
3669 /* Only reset it if it wasn't set before */
3670 if (capsnap.cap_dirtier_uid == -1) {
3671 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3672 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3673 }
3674
7c673cae
FG
3675 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3676 capsnap.inline_data = in->inline_data;
3677 capsnap.inline_version = in->inline_version;
3678 }
3679
3680 if (used & CEPH_CAP_FILE_BUFFER) {
11fdf7f2 3681 ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
7c673cae
FG
3682 << " WRBUFFER, delaying" << dendl;
3683 } else {
3684 capsnap.dirty_data = 0;
3685 flush_snaps(in);
3686 }
3687}
3688
3689void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3690{
11fdf7f2 3691 ldout(cct, 10) << __func__ << " seq " << seq << " on " << *in << dendl;
7c673cae
FG
3692 in->cap_snaps.at(seq).dirty_data = 0;
3693 flush_snaps(in);
3694}
3695
3696void Client::flush_snaps(Inode *in, bool all_again)
3697{
3698 ldout(cct, 10) << "flush_snaps on " << *in << " all_again " << all_again << dendl;
11fdf7f2 3699 ceph_assert(in->cap_snaps.size());
7c673cae
FG
3700
3701 // pick auth mds
11fdf7f2 3702 ceph_assert(in->auth_cap);
7c673cae
FG
3703 MetaSession *session = in->auth_cap->session;
3704 int mseq = in->auth_cap->mseq;
3705
3706 for (auto &p : in->cap_snaps) {
3707 CapSnap &capsnap = p.second;
3708 if (!all_again) {
3709 // only flush once per session
3710 if (capsnap.flush_tid > 0)
3711 continue;
3712 }
3713
3714 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
3715 << " follows " << p.first
3716 << " size " << capsnap.size
3717 << " mtime " << capsnap.mtime
3718 << " dirty_data=" << capsnap.dirty_data
3719 << " writing=" << capsnap.writing
3720 << " on " << *in << dendl;
3721 if (capsnap.dirty_data || capsnap.writing)
3722 continue;
3723
3724 if (capsnap.flush_tid == 0) {
3725 capsnap.flush_tid = ++last_flush_tid;
3726 if (!in->flushing_cap_item.is_on_list())
3727 session->flushing_caps.push_back(&in->flushing_cap_item);
3728 session->flushing_caps_tids.insert(capsnap.flush_tid);
3729 }
3730
11fdf7f2 3731 auto m = MClientCaps::create(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
7c673cae 3732 cap_epoch_barrier);
11fdf7f2
TL
3733 m->caller_uid = capsnap.cap_dirtier_uid;
3734 m->caller_gid = capsnap.cap_dirtier_gid;
7c673cae
FG
3735
3736 m->set_client_tid(capsnap.flush_tid);
3737 m->head.snap_follows = p.first;
3738
3739 m->head.caps = capsnap.issued;
3740 m->head.dirty = capsnap.dirty;
3741
3742 m->head.uid = capsnap.uid;
3743 m->head.gid = capsnap.gid;
3744 m->head.mode = capsnap.mode;
3745 m->btime = capsnap.btime;
3746
3747 m->size = capsnap.size;
3748
3749 m->head.xattr_version = capsnap.xattr_version;
11fdf7f2 3750 encode(capsnap.xattrs, m->xattrbl);
7c673cae
FG
3751
3752 m->ctime = capsnap.ctime;
3753 m->btime = capsnap.btime;
3754 m->mtime = capsnap.mtime;
3755 m->atime = capsnap.atime;
3756 m->time_warp_seq = capsnap.time_warp_seq;
3757 m->change_attr = capsnap.change_attr;
3758
3759 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3760 m->inline_version = in->inline_version;
3761 m->inline_data = in->inline_data;
3762 }
3763
11fdf7f2 3764 ceph_assert(!session->flushing_caps_tids.empty());
7c673cae
FG
3765 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3766
11fdf7f2 3767 session->con->send_message2(std::move(m));
7c673cae
FG
3768 }
3769}
3770
3771
3772
3773void Client::wait_on_list(list<Cond*>& ls)
3774{
3775 Cond cond;
3776 ls.push_back(&cond);
3777 cond.Wait(client_lock);
3778 ls.remove(&cond);
3779}
3780
3781void Client::signal_cond_list(list<Cond*>& ls)
3782{
3783 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3784 (*it)->Signal();
3785}
3786
3787void Client::wait_on_context_list(list<Context*>& ls)
3788{
3789 Cond cond;
3790 bool done = false;
3791 int r;
3792 ls.push_back(new C_Cond(&cond, &done, &r));
3793 while (!done)
3794 cond.Wait(client_lock);
3795}
3796
3797void Client::signal_context_list(list<Context*>& ls)
3798{
3799 while (!ls.empty()) {
3800 ls.front()->complete(0);
3801 ls.pop_front();
3802 }
3803}
3804
a8e16298 3805void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
7c673cae 3806{
11fdf7f2
TL
3807 for (const auto &cap : s->caps) {
3808 auto &in = cap->inode;
a8e16298 3809 if (reconnect) {
11fdf7f2
TL
3810 in.requested_max_size = 0;
3811 in.wanted_max_size = 0;
a8e16298
TL
3812 } else {
3813 if (cap->gen < s->cap_gen) {
3814 // mds did not re-issue stale cap.
3815 cap->issued = cap->implemented = CEPH_CAP_PIN;
3816 // make sure mds knows what we want.
11fdf7f2
TL
3817 if (in.caps_file_wanted() & ~cap->wanted)
3818 in.flags |= I_CAP_DROPPED;
a8e16298
TL
3819 }
3820 }
11fdf7f2 3821 signal_cond_list(in.waitfor_caps);
7c673cae
FG
3822 }
3823}
3824
3825
3826// flush dirty data (from objectcache)
3827
3828class C_Client_CacheInvalidate : public Context {
3829private:
3830 Client *client;
3831 vinodeno_t ino;
3832 int64_t offset, length;
3833public:
3834 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
3835 client(c), offset(off), length(len) {
3836 if (client->use_faked_inos())
3837 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
3838 else
3839 ino = in->vino();
3840 }
3841 void finish(int r) override {
3842 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
11fdf7f2 3843 ceph_assert(!client->client_lock.is_locked_by_me());
7c673cae
FG
3844 client->_async_invalidate(ino, offset, length);
3845 }
3846};
3847
3848void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
3849{
3850 if (unmounting)
3851 return;
11fdf7f2 3852 ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
7c673cae
FG
3853 ino_invalidate_cb(callback_handle, ino, off, len);
3854}
3855
3856void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3857
3858 if (ino_invalidate_cb)
3859 // we queue the invalidate, which calls the callback and decrements the ref
3860 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3861}
3862
3863void Client::_invalidate_inode_cache(Inode *in)
3864{
11fdf7f2 3865 ldout(cct, 10) << __func__ << " " << *in << dendl;
7c673cae
FG
3866
3867 // invalidate our userspace inode cache
94b18763 3868 if (cct->_conf->client_oc) {
7c673cae 3869 objectcacher->release_set(&in->oset);
94b18763
FG
3870 if (!objectcacher->set_is_empty(&in->oset))
3871 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
3872 }
7c673cae
FG
3873
3874 _schedule_invalidate_callback(in, 0, 0);
3875}
3876
3877void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
3878{
11fdf7f2 3879 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
7c673cae
FG
3880
3881 // invalidate our userspace inode cache
3882 if (cct->_conf->client_oc) {
3883 vector<ObjectExtent> ls;
3884 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
28e407b8 3885 objectcacher->discard_writeback(&in->oset, ls, nullptr);
7c673cae
FG
3886 }
3887
3888 _schedule_invalidate_callback(in, off, len);
3889}
3890
3891bool Client::_release(Inode *in)
3892{
3893 ldout(cct, 20) << "_release " << *in << dendl;
3894 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3895 _invalidate_inode_cache(in);
3896 return true;
3897 }
3898 return false;
3899}
3900
3901bool Client::_flush(Inode *in, Context *onfinish)
3902{
3903 ldout(cct, 10) << "_flush " << *in << dendl;
3904
3905 if (!in->oset.dirty_or_tx) {
3906 ldout(cct, 10) << " nothing to flush" << dendl;
3907 onfinish->complete(0);
3908 return true;
3909 }
3910
3911 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
1adf2230 3912 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
7c673cae
FG
3913 objectcacher->purge_set(&in->oset);
3914 if (onfinish) {
3915 onfinish->complete(-ENOSPC);
3916 }
3917 return true;
3918 }
3919
3920 return objectcacher->flush_set(&in->oset, onfinish);
3921}
3922
// Synchronously flush one byte range of the inode's buffered data.
// Caller must hold client_lock; it is dropped while waiting so dispatch
// (which completes the flush) can make progress.
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  ceph_assert(client_lock.is_locked());
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  C_SaferCond onflush("Client::_flush_range flock");
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
				      offset, size, &onflush);
  if (!ret) {
    // wait for flush
    client_lock.Unlock();
    onflush.wait();
    client_lock.Lock();
  }
}
3941
3942void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
3943{
11fdf7f2
TL
3944 // std::lock_guard l(client_lock);
3945 ceph_assert(client_lock.is_locked()); // will be called via dispatch() -> objecter -> ...
7c673cae 3946 Inode *in = static_cast<Inode *>(oset->parent);
11fdf7f2 3947 ceph_assert(in);
7c673cae
FG
3948 _flushed(in);
3949}
3950
3951void Client::_flushed(Inode *in)
3952{
3953 ldout(cct, 10) << "_flushed " << *in << dendl;
3954
3955 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
3956}
3957
3958
3959
3960// checks common to add_update_cap, handle_cap_grant
11fdf7f2 3961void Client::check_cap_issue(Inode *in, unsigned issued)
7c673cae
FG
3962{
3963 unsigned had = in->caps_issued();
3964
3965 if ((issued & CEPH_CAP_FILE_CACHE) &&
3966 !(had & CEPH_CAP_FILE_CACHE))
3967 in->cache_gen++;
3968
3969 if ((issued & CEPH_CAP_FILE_SHARED) &&
3970 !(had & CEPH_CAP_FILE_SHARED)) {
3971 in->shared_gen++;
3972
3973 if (in->is_dir())
3974 clear_dir_complete_and_ordered(in, true);
3975 }
3976}
3977
// Install or refresh the cap this inode holds via mds_session.  Handles
// first-cap snaprealm attachment, realm moves on auth updates, stale-cap
// revalidation across session generations, and auth-cap migration.
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
			    unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
			    inodeno_t realm, int flags, const UserPerm& cap_perms)
{
  if (!in->is_any_caps()) {
    // First cap on this inode: attach it to its snap realm.
    ceph_assert(in->snaprealm == 0);
    in->snaprealm = get_snap_realm(realm);
    in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
    ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
  } else {
    ceph_assert(in->snaprealm);
    // The auth MDS may report a different realm; move the inode over.
    if ((flags & CEPH_CAP_FLAG_AUTH) &&
	realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
      in->snaprealm_item.remove_myself();
      auto oldrealm = in->snaprealm;
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      put_snap_realm(oldrealm);
    }
  }

  mds_rank_t mds = mds_session->mds_num;
  const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
  Cap &cap = capem.first->second;
  if (!capem.second) {
    // Pre-existing cap.  A cap from an older session generation conveys
    // nothing beyond the pin.
    if (cap.gen < mds_session->cap_gen)
      cap.issued = cap.implemented = CEPH_CAP_PIN;

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap.seq) <= 0) {
      ceph_assert(&cap == in->auth_cap);
      ceph_assert(cap.cap_id == cap_id);
      seq = cap.seq;
      mseq = cap.mseq;
      issued |= cap.issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  }

  check_cap_issue(in, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // Adopt this cap as the auth cap if it is newer by migration seq,
    // carrying any in-flight flushes over to the new auth session.
    if (in->auth_cap != &cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
	ldout(cct, 10) << __func__ << " changing auth cap: "
		       << "add myself to new auth MDS' flushing caps list" << dendl;
	adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = &cap;
    }
  }

  unsigned old_caps = cap.issued;
  cap.cap_id = cap_id;
  cap.issued = issued;
  cap.implemented |= issued;
  // A newer migration seq resets wanted outright; otherwise accumulate.
  if (ceph_seq_cmp(mseq, cap.mseq) > 0)
    cap.wanted = wanted;
  else
    cap.wanted |= wanted;
  cap.seq = seq;
  cap.issue_seq = seq;
  cap.mseq = mseq;
  cap.gen = mds_session->cap_gen;
  cap.latest_perms = cap_perms;
  ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
	   << " from mds." << mds
	   << " on " << *in
	   << dendl;

  if ((issued & ~old_caps) && in->auth_cap == &cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (auto &p : in->caps) {
      if (&p.second == &cap)
	continue;
      if (p.second.implemented & ~p.second.issued & issued) {
	check_caps(in, CHECK_CAPS_NODELAY);
	break;
      }
    }
  }

  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
4072
// Remove one cap from its inode and session.  With queue_release, the MDS
// is told about the drop via the session's batched cap-release message.
// Note: erasing from in.caps destroys *cap, so the pointer is dead after.
void Client::remove_cap(Cap *cap, bool queue_release)
{
  auto &in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;

  if (queue_release) {
    session->enqueue_cap_release(
      in.ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  }

  if (in.auth_cap == cap) {
    // Losing the auth cap: any flush-list membership goes with it.
    if (in.flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in.flushing_cap_item.remove_myself();
    }
    in.auth_cap = NULL;
  }
  size_t n = in.caps.erase(mds);
  ceph_assert(n == 1);
  cap = nullptr;   // *cap was destroyed by the erase above

  if (!in.is_any_caps()) {
    // Last cap gone: detach the inode from its snap realm.
    ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
    in.snaprealm_item.remove_myself();
    put_snap_realm(in.snaprealm);
    in.snaprealm = 0;
  }
}
4108
4109void Client::remove_all_caps(Inode *in)
4110{
4111 while (!in->caps.empty())
11fdf7f2 4112 remove_cap(&in->caps.begin()->second, true);
7c673cae
FG
4113}
4114
// Drop every cap held via session s (session teardown path).  Dirty or
// flushing state is discarded with an error, since it can no longer be
// written back through this session.
void Client::remove_session_caps(MetaSession *s)
{
  ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;

  while (s->caps.size()) {
    Cap *cap = *s->caps.begin();
    InodeRef in(&cap->inode);   // keep the inode alive across remove_cap()
    bool dirty_caps = false, cap_snaps = false;
    if (in->auth_cap == cap) {
      cap_snaps = !in->cap_snaps.empty();
      dirty_caps = in->dirty_caps | in->flushing_caps;
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
    // Remember that the MDS had granted/seen wants for this inode so a
    // later reconnect can report the dropped cap.
    if (cap->wanted | cap->issued)
      in->flags |= I_CAP_DROPPED;
    remove_cap(cap, false);
    if (cap_snaps) {
      in->cap_snaps.clear();
    }
    if (dirty_caps) {
      lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
      if (in->flushing_caps) {
	num_flushing_caps--;
	in->flushing_cap_tids.clear();
      }
      in->flushing_caps = 0;
      in->mark_caps_clean();
      put_inode(in.get());   // drop the ref that the dirty state held
    }
    signal_cond_list(in->waitfor_caps);
  }
  s->flushing_caps_tids.clear();
  sync_cond.Signal();   // wake wait_sync_caps() waiters
}
4150
91327a77 4151int Client::_do_remount(bool retry_on_error)
b32b8144 4152{
11fdf7f2 4153 uint64_t max_retries = g_conf().get_val<uint64_t>("mds_max_retries_on_remount_failure");
91327a77 4154
b32b8144
FG
4155 errno = 0;
4156 int r = remount_cb(callback_handle);
91327a77
AA
4157 if (r == 0) {
4158 retries_on_invalidate = 0;
4159 } else {
b32b8144
FG
4160 int e = errno;
4161 client_t whoami = get_nodeid();
4162 if (r == -1) {
4163 lderr(cct) <<
4164 "failed to remount (to trim kernel dentries): "
4165 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4166 } else {
4167 lderr(cct) <<
4168 "failed to remount (to trim kernel dentries): "
4169 "return code = " << r << dendl;
4170 }
91327a77 4171 bool should_abort =
11fdf7f2
TL
4172 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4173 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
91327a77 4174 !(retry_on_error && (++retries_on_invalidate < max_retries));
b32b8144
FG
4175 if (should_abort && !unmounting) {
4176 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
4177 ceph_abort();
4178 }
4179 }
4180 return r;
4181}
4182
7c673cae
FG
4183class C_Client_Remount : public Context {
4184private:
4185 Client *client;
4186public:
4187 explicit C_Client_Remount(Client *c) : client(c) {}
4188 void finish(int r) override {
11fdf7f2 4189 ceph_assert(r == 0);
91327a77 4190 client->_do_remount(true);
7c673cae
FG
4191 }
4192};
4193
4194void Client::_invalidate_kernel_dcache()
4195{
4196 if (unmounting)
4197 return;
94b18763
FG
4198 if (can_invalidate_dentries) {
4199 if (dentry_invalidate_cb && root->dir) {
4200 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4201 p != root->dir->dentries.end();
4202 ++p) {
4203 if (p->second->inode)
4204 _schedule_invalidate_dentry_callback(p->second, false);
4205 }
7c673cae
FG
4206 }
4207 } else if (remount_cb) {
4208 // Hacky:
4209 // when remounting a file system, linux kernel trims all unused dentries in the fs
4210 remount_finisher.queue(new C_Client_Remount(this));
4211 }
4212}
4213
91327a77
AA
// If every dentry under this directory is a null (negative) dentry, unlink
// the expirable ones and close the directory once it empties.  Also
// recurses into an open snapdir, which caches its own child dentries.
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;   // advance before unlink() removes this entry from the map
      ceph_assert(!dn->inode);
      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4238
28e407b8 4239void Client::trim_caps(MetaSession *s, uint64_t max)
7c673cae
FG
4240{
4241 mds_rank_t mds = s->mds_num;
28e407b8 4242 size_t caps_size = s->caps.size();
11fdf7f2 4243 ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
7c673cae
FG
4244 << " caps " << caps_size << dendl;
4245
28e407b8
AA
4246 uint64_t trimmed = 0;
4247 auto p = s->caps.begin();
4248 std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
4249 * looking at from getting deleted during traversal. */
7c673cae
FG
4250 while ((caps_size - trimmed) > max && !p.end()) {
4251 Cap *cap = *p;
11fdf7f2 4252 InodeRef in(&cap->inode);
7c673cae
FG
4253
4254 // Increment p early because it will be invalidated if cap
4255 // is deleted inside remove_cap
4256 ++p;
4257
4258 if (in->caps.size() > 1 && cap != in->auth_cap) {
4259 int mine = cap->issued | cap->implemented;
4260 int oissued = in->auth_cap ? in->auth_cap->issued : 0;
4261 // disposable non-auth cap
b32b8144 4262 if (!(get_caps_used(in.get()) & ~oissued & mine)) {
7c673cae 4263 ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
28e407b8 4264 cap = (remove_cap(cap, true), nullptr);
7c673cae
FG
4265 trimmed++;
4266 }
4267 } else {
4268 ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
91327a77 4269 _trim_negative_child_dentries(in);
7c673cae 4270 bool all = true;
11fdf7f2
TL
4271 auto q = in->dentries.begin();
4272 while (q != in->dentries.end()) {
4273 Dentry *dn = *q;
4274 ++q;
7c673cae
FG
4275 if (dn->lru_is_expireable()) {
4276 if (can_invalidate_dentries &&
4277 dn->dir->parent_inode->ino == MDS_INO_ROOT) {
4278 // Only issue one of these per DN for inodes in root: handle
4279 // others more efficiently by calling for root-child DNs at
4280 // the end of this function.
4281 _schedule_invalidate_dentry_callback(dn, true);
4282 }
28e407b8
AA
4283 ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
4284 to_trim.insert(dn);
7c673cae
FG
4285 } else {
4286 ldout(cct, 20) << " not expirable: " << dn->name << dendl;
4287 all = false;
4288 }
4289 }
4290 if (all && in->ino != MDS_INO_ROOT) {
4291 ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
4292 trimmed++;
4293 }
4294 }
4295 }
28e407b8
AA
4296 ldout(cct, 20) << " trimming queued dentries: " << dendl;
4297 for (const auto &dn : to_trim) {
4298 trim_dentry(dn);
4299 }
4300 to_trim.clear();
7c673cae 4301
b32b8144 4302 caps_size = s->caps.size();
11fdf7f2 4303 if (caps_size > (size_t)max)
7c673cae
FG
4304 _invalidate_kernel_dcache();
4305}
4306
4307void Client::force_session_readonly(MetaSession *s)
4308{
4309 s->readonly = true;
4310 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
11fdf7f2
TL
4311 auto &in = (*p)->inode;
4312 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4313 signal_cond_list(in.waitfor_caps);
7c673cae
FG
4314 }
4315}
4316
7c673cae
FG
// Move the inode's dirty caps into the flushing state under a new flush
// tid, registering the tid with the auth session.  Returns the cap bits
// being flushed; the tid is stored via *ptid.
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  ceph_assert(flushing);

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  if (!in->flushing_caps) {
    ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  // dirty -> flushing
  in->flushing_caps |= flushing;
  in->mark_caps_clean();

  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4344
4345void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4346{
4347 for (auto &p : in->cap_snaps) {
4348 CapSnap &capsnap = p.second;
4349 if (capsnap.flush_tid > 0) {
4350 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4351 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4352 }
4353 }
4354 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4355 it != in->flushing_cap_tids.end();
4356 ++it) {
4357 old_s->flushing_caps_tids.erase(it->first);
4358 new_s->flushing_caps_tids.insert(it->first);
4359 }
4360 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4361}
4362
4363/*
4364 * Flush all caps back to the MDS. Because the callers generally wait on the
4365 * result of this function (syncfs and umount cases), we set
4366 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4367 */
4368void Client::flush_caps_sync()
4369{
4370 ldout(cct, 10) << __func__ << dendl;
28e407b8 4371 xlist<Inode*>::iterator p = delayed_list.begin();
7c673cae
FG
4372 while (!p.end()) {
4373 unsigned flags = CHECK_CAPS_NODELAY;
4374 Inode *in = *p;
4375
4376 ++p;
28e407b8
AA
4377 delayed_list.pop_front();
4378 if (p.end() && dirty_list.empty())
7c673cae
FG
4379 flags |= CHECK_CAPS_SYNCHRONOUS;
4380 check_caps(in, flags);
4381 }
4382
4383 // other caps, too
28e407b8 4384 p = dirty_list.begin();
7c673cae
FG
4385 while (!p.end()) {
4386 unsigned flags = CHECK_CAPS_NODELAY;
4387 Inode *in = *p;
4388
4389 ++p;
4390 if (p.end())
4391 flags |= CHECK_CAPS_SYNCHRONOUS;
4392 check_caps(in, flags);
4393 }
4394}
4395
// Re-send a cap flush message for every pending flush tid on this inode to
// its auth session.  With sync=true, the journal flush is requested on the
// last (highest) tid only.
void Client::flush_caps(Inode *in, MetaSession *session, bool sync)
{
  ldout(cct, 10) << __func__ << " " << in << " mds." << session->mds_num << dendl;
  Cap *cap = in->auth_cap;
  ceph_assert(cap->session == session);

  for (map<ceph_tid_t,int>::iterator p = in->flushing_cap_tids.begin();
       p != in->flushing_cap_tids.end();
       ++p) {
    bool req_sync = false;

    /* If this is a synchronous request, then flush the journal on last one */
    if (sync && (p->first == in->flushing_cap_tids.rbegin()->first))
      req_sync = true;

    send_cap(in, session, cap, req_sync,
	     (get_caps_used(in) | in->caps_dirty()),
	     in->caps_wanted(), (cap->issued | cap->implemented),
	     p->second, p->first);
  }
}
4417
4418void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4419{
4420 while (in->flushing_caps) {
4421 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
11fdf7f2 4422 ceph_assert(it != in->flushing_cap_tids.end());
7c673cae
FG
4423 if (it->first > want)
4424 break;
11fdf7f2 4425 ldout(cct, 10) << __func__ << " on " << *in << " flushing "
7c673cae
FG
4426 << ccap_string(it->second) << " want " << want
4427 << " last " << it->first << dendl;
4428 wait_on_list(in->waitfor_caps);
4429 }
4430}
4431
4432void Client::wait_sync_caps(ceph_tid_t want)
4433{
4434 retry:
11fdf7f2 4435 ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
7c673cae 4436 << num_flushing_caps << " total flushing)" << dendl;
11fdf7f2
TL
4437 for (auto &p : mds_sessions) {
4438 MetaSession *s = &p.second;
7c673cae
FG
4439 if (s->flushing_caps_tids.empty())
4440 continue;
4441 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4442 if (oldest_tid <= want) {
11fdf7f2 4443 ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
7c673cae
FG
4444 << " (want " << want << ")" << dendl;
4445 sync_cond.Wait(client_lock);
4446 goto retry;
4447 }
4448 }
4449}
4450
// After session recovery, re-send any flushes that were in flight,
// skipping inodes already handled by early_kick_flushing_caps().
void Client::kick_flushing_caps(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    if (session->early_flushing_caps.count(in))
      continue;
    ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }

  session->early_flushing_caps.clear();
}
4469
// Before reconnect: re-send flushes whose caps were revoked, so the MDS
// processes the flush before issuing those caps to another client.  Cap
// sequence numbers are zeroed to match the upcoming reconnect message.
void Client::early_kick_flushing_caps(MetaSession *session)
{
  session->early_flushing_caps.clear();

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    Cap *cap = in->auth_cap;
    ceph_assert(cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps)
      continue;

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
		   << " to mds." << session->mds_num << dendl;

    session->early_flushing_caps.insert(in);

    // send_reconnect() also will reset these sequence numbers. make sure
    // sequence numbers in cap flush message match later reconnect message.
    cap->seq = 0;
    cap->issue_seq = 0;
    cap->mseq = 0;
    cap->issued = cap->implemented;

    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);

  }
}
4504
7c673cae
FG
4505void SnapRealm::build_snap_context()
4506{
4507 set<snapid_t> snaps;
4508 snapid_t max_seq = seq;
4509
4510 // start with prior_parents?
4511 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4512 snaps.insert(prior_parent_snaps[i]);
4513
4514 // current parent's snaps
4515 if (pparent) {
4516 const SnapContext& psnapc = pparent->get_snap_context();
4517 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4518 if (psnapc.snaps[i] >= parent_since)
4519 snaps.insert(psnapc.snaps[i]);
4520 if (psnapc.seq > max_seq)
4521 max_seq = psnapc.seq;
4522 }
4523
4524 // my snaps
4525 for (unsigned i=0; i<my_snaps.size(); i++)
4526 snaps.insert(my_snaps[i]);
4527
4528 // ok!
4529 cached_snap_context.seq = max_seq;
4530 cached_snap_context.snaps.resize(0);
4531 cached_snap_context.snaps.reserve(snaps.size());
4532 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4533 cached_snap_context.snaps.push_back(*p);
4534}
4535
4536void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4537{
4538 list<SnapRealm*> q;
4539 q.push_back(realm);
4540
4541 while (!q.empty()) {
4542 realm = q.front();
4543 q.pop_front();
4544
11fdf7f2 4545 ldout(cct, 10) << __func__ << " " << *realm << dendl;
7c673cae
FG
4546 realm->invalidate_cache();
4547
4548 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4549 p != realm->pchildren.end();
4550 ++p)
4551 q.push_back(*p);
4552 }
4553}
4554
4555SnapRealm *Client::get_snap_realm(inodeno_t r)
4556{
4557 SnapRealm *realm = snap_realms[r];
4558 if (!realm)
4559 snap_realms[r] = realm = new SnapRealm(r);
11fdf7f2 4560 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4561 realm->nref++;
4562 return realm;
4563}
4564
4565SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4566{
4567 if (snap_realms.count(r) == 0) {
11fdf7f2 4568 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
7c673cae
FG
4569 return NULL;
4570 }
4571 SnapRealm *realm = snap_realms[r];
11fdf7f2 4572 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4573 realm->nref++;
4574 return realm;
4575}
4576
4577void Client::put_snap_realm(SnapRealm *realm)
4578{
11fdf7f2 4579 ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
7c673cae
FG
4580 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4581 if (--realm->nref == 0) {
4582 snap_realms.erase(realm->ino);
4583 if (realm->pparent) {
4584 realm->pparent->pchildren.erase(realm);
4585 put_snap_realm(realm->pparent);
4586 }
4587 delete realm;
4588 }
4589}
4590
4591bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4592{
4593 if (realm->parent != parent) {
11fdf7f2 4594 ldout(cct, 10) << __func__ << " " << *realm
7c673cae
FG
4595 << " " << realm->parent << " -> " << parent << dendl;
4596 realm->parent = parent;
4597 if (realm->pparent) {
4598 realm->pparent->pchildren.erase(realm);
4599 put_snap_realm(realm->pparent);
4600 }
4601 realm->pparent = get_snap_realm(parent);
4602 realm->pparent->pchildren.insert(realm);
4603 return true;
4604 }
4605 return false;
4606}
4607
4608static bool has_new_snaps(const SnapContext& old_snapc,
4609 const SnapContext& new_snapc)
4610{
4611 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4612}
4613
4614
11fdf7f2 4615void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
7c673cae
FG
4616{
4617 SnapRealm *first_realm = NULL;
11fdf7f2 4618 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
7c673cae
FG
4619
4620 map<SnapRealm*, SnapContext> dirty_realms;
4621
11fdf7f2 4622 auto p = bl.cbegin();
7c673cae
FG
4623 while (!p.end()) {
4624 SnapRealmInfo info;
11fdf7f2 4625 decode(info, p);
7c673cae
FG
4626 SnapRealm *realm = get_snap_realm(info.ino());
4627
4628 bool invalidate = false;
4629
4630 if (info.seq() > realm->seq) {
11fdf7f2 4631 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
7c673cae
FG
4632 << dendl;
4633
4634 if (flush) {
4635 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4636 // flush me + children
4637 list<SnapRealm*> q;
4638 q.push_back(realm);
4639 while (!q.empty()) {
4640 SnapRealm *realm = q.front();
4641 q.pop_front();
4642
4643 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4644 p != realm->pchildren.end();
4645 ++p)
4646 q.push_back(*p);
4647
4648 if (dirty_realms.count(realm) == 0) {
4649 realm->nref++;
4650 dirty_realms[realm] = realm->get_snap_context();
4651 }
4652 }
4653 }
4654
4655 // update
4656 realm->seq = info.seq();
4657 realm->created = info.created();
4658 realm->parent_since = info.parent_since();
4659 realm->prior_parent_snaps = info.prior_parent_snaps;
4660 realm->my_snaps = info.my_snaps;
4661 invalidate = true;
4662 }
4663
4664 // _always_ verify parent
4665 if (adjust_realm_parent(realm, info.parent()))
4666 invalidate = true;
4667
4668 if (invalidate) {
4669 invalidate_snaprealm_and_children(realm);
11fdf7f2 4670 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
7c673cae
FG
4671 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4672 } else {
11fdf7f2 4673 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
7c673cae
FG
4674 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4675 }
4676
4677 if (!first_realm)
4678 first_realm = realm;
4679 else
4680 put_snap_realm(realm);
4681 }
4682
4683 for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
4684 q != dirty_realms.end();
4685 ++q) {
4686 SnapRealm *realm = q->first;
4687 // if there are new snaps ?
4688 if (has_new_snaps(q->second, realm->get_snap_context())) {
4689 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4690 xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
4691 while (!r.end()) {
4692 Inode *in = *r;
4693 ++r;
4694 queue_cap_snap(in, q->second);
4695 }
4696 } else {
4697 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4698 }
4699 put_snap_realm(realm);
4700 }
4701
4702 if (realm_ret)
4703 *realm_ret = first_realm;
4704 else
4705 put_snap_realm(first_realm);
4706}
4707
11fdf7f2 4708void Client::handle_snap(const MConstRef<MClientSnap>& m)
7c673cae 4709{
11fdf7f2 4710 ldout(cct, 10) << __func__ << " " << *m << dendl;
7c673cae
FG
4711 mds_rank_t mds = mds_rank_t(m->get_source().num());
4712 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4713 if (!session) {
7c673cae
FG
4714 return;
4715 }
4716
4717 got_mds_push(session);
4718
4719 map<Inode*, SnapContext> to_move;
4720 SnapRealm *realm = 0;
4721
4722 if (m->head.op == CEPH_SNAP_OP_SPLIT) {
11fdf7f2 4723 ceph_assert(m->head.split);
7c673cae 4724 SnapRealmInfo info;
11fdf7f2
TL
4725 auto p = m->bl.cbegin();
4726 decode(info, p);
4727 ceph_assert(info.ino() == m->head.split);
7c673cae
FG
4728
4729 // flush, then move, ino's.
4730 realm = get_snap_realm(info.ino());
4731 ldout(cct, 10) << " splitting off " << *realm << dendl;
11fdf7f2
TL
4732 for (auto& ino : m->split_inos) {
4733 vinodeno_t vino(ino, CEPH_NOSNAP);
7c673cae
FG
4734 if (inode_map.count(vino)) {
4735 Inode *in = inode_map[vino];
4736 if (!in->snaprealm || in->snaprealm == realm)
4737 continue;
4738 if (in->snaprealm->created > info.created()) {
4739 ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
4740 << *in->snaprealm << dendl;
4741 continue;
4742 }
4743 ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;
4744
4745
4746 in->snaprealm_item.remove_myself();
4747 to_move[in] = in->snaprealm->get_snap_context();
4748 put_snap_realm(in->snaprealm);
4749 }
4750 }
4751
4752 // move child snaprealms, too
11fdf7f2
TL
4753 for (auto& child_realm : m->split_realms) {
4754 ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
4755 SnapRealm *child = get_snap_realm_maybe(child_realm);
7c673cae
FG
4756 if (!child)
4757 continue;
4758 adjust_realm_parent(child, realm->ino);
4759 put_snap_realm(child);
4760 }
4761 }
4762
4763 update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);
4764
4765 if (realm) {
4766 for (auto p = to_move.begin(); p != to_move.end(); ++p) {
4767 Inode *in = p->first;
4768 in->snaprealm = realm;
4769 realm->inodes_with_caps.push_back(&in->snaprealm_item);
4770 realm->nref++;
4771 // queue for snap writeback
4772 if (has_new_snaps(p->second, realm->get_snap_context()))
4773 queue_cap_snap(in, p->second);
4774 }
4775 put_snap_realm(realm);
4776 }
7c673cae
FG
4777}
4778
11fdf7f2 4779void Client::handle_quota(const MConstRef<MClientQuota>& m)
7c673cae
FG
4780{
4781 mds_rank_t mds = mds_rank_t(m->get_source().num());
4782 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4783 if (!session) {
7c673cae
FG
4784 return;
4785 }
4786
4787 got_mds_push(session);
4788
11fdf7f2 4789 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
7c673cae
FG
4790
4791 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4792 if (inode_map.count(vino)) {
4793 Inode *in = NULL;
4794 in = inode_map[vino];
4795
4796 if (in) {
4797 in->quota = m->quota;
4798 in->rstat = m->rstat;
4799 }
4800 }
7c673cae
FG
4801}
4802
// Entry point for all MClientCaps messages from an MDS.  Validates the
// session, applies any OSD epoch barrier carried by the message, resolves
// the target inode, then dispatches to the per-op handler
// (import/export/grant/revoke/trunc/flush-ack/...).
void Client::handle_caps(const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());
  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    // message from an MDS we no longer have a session with; ignore it
    return;
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session);

  Inode *in;
  // caps are only tracked for head inodes (snapid CEPH_NOSNAP)
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (auto it = inode_map.find(vino); it != inode_map.end()) {
    in = it->second;
  } else {
    if (m->get_op() == CEPH_CAP_OP_IMPORT) {
      // unknown inode: tell the MDS to drop the imported cap right away
      ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
      session->enqueue_cap_release(
        m->get_ino(),
        m->get_cap_id(),
        m->get_seq(),
        m->get_mseq(),
        cap_epoch_barrier);
    } else {
      ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
    }

    // in case the mds is waiting on e.g. a revocation
    flush_cap_releases();
    return;
  }

  // ops that do not require us to already hold a cap from this MDS
  switch (m->get_op()) {
  case CEPH_CAP_OP_EXPORT: return handle_cap_export(session, in, m);
  case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session, in, m);
  case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session, in, m);
  }

  if (auto it = in->caps.find(mds); it != in->caps.end()) {
    Cap &cap = in->caps.at(mds);

    switch (m->get_op()) {
    case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
    case CEPH_CAP_OP_IMPORT:   // after handle_cap_import above, IMPORT is handled as a grant
    case CEPH_CAP_OP_REVOKE:
    case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, &cap, m);
    case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, &cap, m);
    }
  } else {
    ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
    return;
  }
}
4866
// Handle CEPH_CAP_OP_IMPORT: another MDS has migrated a cap to this
// session's MDS.  Installs/updates the cap from the importing MDS,
// removes the old cap held from the exporting peer (if any), and
// reflushes dirty caps/snaps if we just became the auth-cap holder.
void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
                << " IMPORT from mds." << mds << dendl;

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  // remember the cap held from the exporting MDS so the new cap can
  // inherit its credentials, and so it can be removed below
  if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
    cap = &it->second;
    cap_perms = cap->latest_perms;
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  add_update_cap(in, session, m->get_cap_id(),
                 m->get_caps(), m->get_wanted(), m->get_seq(), m->get_mseq(),
                 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);

  // drop the exported cap only if it still matches the peer's cap id
  if (cap && cap->cap_id == m->peer.cap_id) {
    remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session->mds_num == mds) {
    // reflush any/all caps (if we are now the auth_cap)
    if (in->cap_snaps.size())
      flush_snaps(in, true);
    if (in->flushing_caps)
      flush_caps(in, session);
  }
}
4905
// Handle CEPH_CAP_OP_EXPORT: this session's MDS is migrating our cap
// away.  If the message names a peer MDS, mirror the cap's state onto
// (or merge it into) the cap for that peer; otherwise record that caps
// were dropped (I_CAP_DROPPED).  Either way the exported cap is removed.
void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
                << " EXPORT from mds." << mds << dendl;

  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    if (cap.cap_id == m->get_cap_id()) {  // ignore EXPORT for a stale cap id
      if (m->peer.cap_id) {
        // cap is moving to peer_mds
        const auto peer_mds = mds_rank_t(m->peer.mds);
        MetaSession *tsession = _get_or_open_mds_session(peer_mds);
        auto it = in->caps.find(peer_mds);
        if (it != in->caps.end()) {
          Cap &tcap = it->second;
          // merge into the existing target cap only if the peer info is newer
          if (tcap.cap_id == m->peer.cap_id &&
              ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
            tcap.cap_id = m->peer.cap_id;
            tcap.seq = m->peer.seq - 1;
            tcap.issue_seq = tcap.seq;
            tcap.issued |= cap.issued;
            tcap.implemented |= cap.issued;
            if (&cap == in->auth_cap)
              in->auth_cap = &tcap;
            // keep flushing-caps bookkeeping on the session of the auth cap
            if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
              adjust_session_flushing_caps(in, session, tsession);
          }
        } else {
          // no cap on the peer yet: create one carrying our issued bits
          add_update_cap(in, tsession, m->peer.cap_id, cap.issued, 0,
                         m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
                         &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
                         cap.latest_perms);
        }
      } else {
        // no peer: the MDS is simply dropping our caps
        if (cap.wanted | cap.issued)
          in->flags |= I_CAP_DROPPED;
      }

      remove_cap(&cap, false);
    }
  }
}
4950
11fdf7f2 4951void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae
FG
4952{
4953 mds_rank_t mds = session->mds_num;
11fdf7f2 4954 ceph_assert(in->caps.count(mds));
7c673cae 4955
11fdf7f2 4956 ldout(cct, 10) << __func__ << " on ino " << *in
7c673cae
FG
4957 << " size " << in->size << " -> " << m->get_size()
4958 << dendl;
4959
1adf2230
AA
4960 int issued;
4961 in->caps_issued(&issued);
4962 issued |= in->caps_dirty();
4963 update_inode_file_size(in, issued, m->get_size(),
4964 m->get_truncate_seq(), m->get_truncate_size());
7c673cae
FG
4965}
4966
// Handle CEPH_CAP_OP_FLUSH_ACK: the MDS has persisted dirty metadata we
// flushed earlier.  Retires every flush tid up to and including the acked
// tid, clears the corresponding flushing_caps bits, and wakes flush/sync
// waiters.
void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  int dirty = m->get_dirty();
  int cleaned = 0;
  int flushed = 0;

  // NOTE(review): this dereferences begin() unconditionally, assuming
  // in->flushing_cap_tids is non-empty whenever a FLUSH_ACK arrives --
  // confirm the MDS cannot send an ack with no flush outstanding.
  auto it = in->flushing_cap_tids.begin();
  if (it->first < flush_ack_tid) {
    // acks should arrive in tid order; log (but tolerate) a gap
    ldout(cct, 0) << __func__ << " mds." << session->mds_num
                   << " got unexpected flush ack tid " << flush_ack_tid
                   << " expected is " << it->first << dendl;
  }
  for (; it != in->flushing_cap_tids.end(); ) {
    if (it->first == flush_ack_tid)
      cleaned = it->second;  // cap bits covered by the acked flush
    if (it->first <= flush_ack_tid) {
      // everything at or below the acked tid is now complete
      session->flushing_caps_tids.erase(it->first);
      in->flushing_cap_tids.erase(it++);
      ++flushed;
      continue;
    }
    // later, still-pending flushes keep these bits "dirty in flight"
    cleaned &= ~it->second;
    if (!cleaned)
      break;
    ++it;
  }

  ldout(cct, 5) << __func__ << " mds." << session->mds_num
          << " cleaned " << ccap_string(cleaned) << " on " << *in
          << " with " << ccap_string(dirty) << dendl;

  if (flushed) {
    signal_cond_list(in->waitfor_caps);
    // wake syncfs waiters once this session has nothing older in flight
    if (session->flushing_caps_tids.empty() ||
        *session->flushing_caps_tids.begin() > flush_ack_tid)
      sync_cond.Signal();
  }

  if (!dirty) {
    // nothing dirty remains; forget who dirtied the inode
    in->cap_dirtier_uid = -1;
    in->cap_dirtier_gid = -1;
  }

  if (!cleaned) {
    ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
  } else {
    if (in->flushing_caps) {
      ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
                    << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
      in->flushing_caps &= ~cleaned;
      if (in->flushing_caps == 0) {
        ldout(cct, 10) << " " << *in << " !flushing" << dendl;
        num_flushing_caps--;
        if (in->cap_snaps.empty())
          in->flushing_cap_item.remove_myself();
      }
      if (!in->caps_dirty())
        put_inode(in);  // drop the ref held while caps were dirty/flushing
    }
  }
}
5029
5030
// Handle CEPH_CAP_OP_FLUSHSNAP_ACK: the MDS has persisted a snapped-cap
// flush.  Erases the matching CapSnap (matched by snap "follows" and
// flush tid); duplicate or unknown acks are logged and ignored.
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));
  snapid_t follows = m->get_snap_follows();

  if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
    auto& capsnap = it->second;
    if (m->get_client_tid() != capsnap.flush_tid) {
      // ack for a different (older) flush of this capsnap; ignore
      ldout(cct, 10) << " tid " << m->get_client_tid() << " != " << capsnap.flush_tid << dendl;
    } else {
      ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
              << " on " << *in << dendl;
      InodeRef tmp_ref;
      if (in->get_num_ref() == 1)
        tmp_ref = in; // make sure inode not get freed while erasing item from in->cap_snaps
      if (in->flushing_caps == 0 && in->cap_snaps.empty())
        in->flushing_cap_item.remove_myself();
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->cap_snaps.erase(it);
    }
  } else {
    ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
            << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }
}
5058
// Deferred-work context that invalidates one dentry in the kernel's
// dcache from the async_dentry_invalidator thread.  All identifying
// state (dir ino, target ino, name) is captured by value at construction
// time because the Dentry may be freed before finish() runs.
class C_Client_DentryInvalidate : public Context {
private:
  Client *client;
  vinodeno_t dirino;  // parent directory of the dentry
  vinodeno_t ino;     // target inode; zeroed when del == false
  string name;        // dentry name within dirino
public:
  C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
    client(c), name(dn->name) {
    if (client->use_faked_inos()) {
      dirino.ino = dn->dir->parent_inode->faked_ino;
      if (del)
	ino.ino = dn->inode->faked_ino;
    } else {
      dirino = dn->dir->parent_inode->vino();
      if (del)
	ino = dn->inode->vino();
    }
    if (!del)
      ino.ino = inodeno_t();  // "no inode": invalidate only, don't unlink
  }
  void finish(int r) override {
    // _async_dentry_invalidate is responsible for its own locking
    ceph_assert(!client->client_lock.is_locked_by_me());
    client->_async_dentry_invalidate(dirino, ino, name);
  }
};
5086
5087void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
5088{
5089 if (unmounting)
5090 return;
11fdf7f2 5091 ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
7c673cae
FG
5092 << " in dir " << dirino << dendl;
5093 dentry_invalidate_cb(callback_handle, dirino, ino, name);
5094}
5095
5096void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5097{
5098 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5099 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5100}
5101
// Try to drop cached references pinning an inode so it can be trimmed:
// expire child dentries (recursing through snapshotted directories),
// close an empty Dir, trim an open snapdir, and finally unlink the
// inode's own dentries -- optionally scheduling kernel dcache
// invalidation for them.
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_num_ref();
  ldout(cct, 5) << __func__ << " in " << *in <<dendl;

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() may erase dn from the map
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // closing the Dir released the reference it held on us
    }
  }

  if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
    // an open .snap dir pins us too; trim it as well
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 0) {
    auto q = in->dentries.begin();
    while (q != in->dentries.end()) {
      Dentry *dn = *q;
      ++q;  // advance before unlink() erases dn from in->dentries
      if( in->ll_ref > 0 && sched_inval) {
	// FIXME: we play lots of unlink/link tricks when handling MDS replies,
	// so in->dentries doesn't always reflect the state of kernel's dcache.
	_schedule_invalidate_dentry_callback(dn, true);
      }
      unlink(dn, true, true);
    }
  }
}
5148
11fdf7f2 5149void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
7c673cae
FG
5150{
5151 mds_rank_t mds = session->mds_num;
5152 int used = get_caps_used(in);
5153 int wanted = in->caps_wanted();
5154
a8e16298
TL
5155 const unsigned new_caps = m->get_caps();
5156 const bool was_stale = session->cap_gen > cap->gen;
11fdf7f2 5157 ldout(cct, 5) << __func__ << " on in " << m->get_ino()
7c673cae
FG
5158 << " mds." << mds << " seq " << m->get_seq()
5159 << " caps now " << ccap_string(new_caps)
a8e16298
TL
5160 << " was " << ccap_string(cap->issued)
5161 << (was_stale ? "" : " (stale)") << dendl;
5162
5163 if (was_stale)
5164 cap->issued = cap->implemented = CEPH_CAP_PIN;
7c673cae 5165 cap->seq = m->get_seq();
28e407b8 5166 cap->gen = session->cap_gen;
7c673cae 5167
11fdf7f2 5168 check_cap_issue(in, new_caps);
a8e16298 5169
7c673cae 5170 // update inode
1adf2230
AA
5171 int issued;
5172 in->caps_issued(&issued);
5173 issued |= in->caps_dirty();
7c673cae 5174
1adf2230
AA
5175 if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
5176 !(issued & CEPH_CAP_AUTH_EXCL)) {
7c673cae
FG
5177 in->mode = m->head.mode;
5178 in->uid = m->head.uid;
5179 in->gid = m->head.gid;
5180 in->btime = m->btime;
5181 }
5182 bool deleted_inode = false;
1adf2230
AA
5183 if ((new_caps & CEPH_CAP_LINK_SHARED) &&
5184 !(issued & CEPH_CAP_LINK_EXCL)) {
7c673cae
FG
5185 in->nlink = m->head.nlink;
5186 if (in->nlink == 0 &&
5187 (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
5188 deleted_inode = true;
5189 }
1adf2230 5190 if (!(issued & CEPH_CAP_XATTR_EXCL) &&
7c673cae
FG
5191 m->xattrbl.length() &&
5192 m->head.xattr_version > in->xattr_version) {
11fdf7f2
TL
5193 auto p = m->xattrbl.cbegin();
5194 decode(in->xattrs, p);
7c673cae
FG
5195 in->xattr_version = m->head.xattr_version;
5196 }
28e407b8
AA
5197
5198 if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
5199 in->dirstat.nfiles = m->get_nfiles();
5200 in->dirstat.nsubdirs = m->get_nsubdirs();
5201 }
5202
1adf2230
AA
5203 if (new_caps & CEPH_CAP_ANY_RD) {
5204 update_inode_file_time(in, issued, m->get_time_warp_seq(),
5205 m->get_ctime(), m->get_mtime(), m->get_atime());
5206 }
5207
5208 if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
5209 in->layout = m->get_layout();
5210 update_inode_file_size(in, issued, m->get_size(),
5211 m->get_truncate_seq(), m->get_truncate_size());
5212 }
5213
5214 if (m->inline_version > in->inline_version) {
5215 in->inline_data = m->inline_data;
5216 in->inline_version = m->inline_version;
5217 }
5218
5219 /* always take a newer change attr */
5220 if (m->get_change_attr() > in->change_attr)
5221 in->change_attr = m->get_change_attr();
7c673cae
FG
5222
5223 // max_size
5224 if (cap == in->auth_cap &&
1adf2230
AA
5225 (new_caps & CEPH_CAP_ANY_FILE_WR) &&
5226 (m->get_max_size() != in->max_size)) {
7c673cae
FG
5227 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5228 in->max_size = m->get_max_size();
5229 if (in->max_size > in->wanted_max_size) {
5230 in->wanted_max_size = 0;
5231 in->requested_max_size = 0;
5232 }
5233 }
5234
5235 bool check = false;
a8e16298
TL
5236 if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
5237 (wanted & ~(cap->wanted | new_caps))) {
5238 // If mds is importing cap, prior cap messages that update 'wanted'
5239 // may get dropped by mds (migrate seq mismatch).
5240 //
5241 // We don't send cap message to update 'wanted' if what we want are
5242 // already issued. If mds revokes caps, cap message that releases caps
5243 // also tells mds what we want. But if caps got revoked by mds forcedly
5244 // (session stale). We may haven't told mds what we want.
7c673cae 5245 check = true;
a8e16298 5246 }
7c673cae 5247
7c673cae
FG
5248
5249 // update caps
a8e16298 5250 auto revoked = cap->issued & ~new_caps;
b32b8144
FG
5251 if (revoked) {
5252 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
7c673cae
FG
5253 cap->issued = new_caps;
5254 cap->implemented |= new_caps;
5255
b32b8144
FG
5256 // recall delegations if we're losing caps necessary for them
5257 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5258 in->recall_deleg(false);
5259 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5260 in->recall_deleg(true);
5261
11fdf7f2
TL
5262 used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
5263 if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
28e407b8 5264 !_flush(in, new C_Client_FlushComplete(this, in))) {
7c673cae 5265 // waitin' for flush
11fdf7f2 5266 } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
7c673cae
FG
5267 if (_release(in))
5268 check = true;
5269 } else {
5270 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5271 check = true;
5272 }
a8e16298
TL
5273 } else if (cap->issued == new_caps) {
5274 ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
7c673cae 5275 } else {
a8e16298 5276 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
7c673cae
FG
5277 cap->issued = new_caps;
5278 cap->implemented |= new_caps;
5279
5280 if (cap == in->auth_cap) {
5281 // non-auth MDS is revoking the newly grant caps ?
11fdf7f2
TL
5282 for (const auto &p : in->caps) {
5283 if (&p.second == cap)
7c673cae 5284 continue;
11fdf7f2 5285 if (p.second.implemented & ~p.second.issued & new_caps) {
7c673cae
FG
5286 check = true;
5287 break;
5288 }
5289 }
5290 }
5291 }
5292
5293 if (check)
5294 check_caps(in, 0);
5295
5296 // wake up waiters
5297 if (new_caps)
5298 signal_cond_list(in->waitfor_caps);
5299
5300 // may drop inode's last ref
5301 if (deleted_inode)
5302 _try_to_trim_inode(in, true);
7c673cae
FG
5303}
5304
7c673cae
FG
// Check whether `perms` may perform `want` (MAY_* bits) on `in`.
// Returns 0 on success or a negative errno (-EACCES).  root (uid 0)
// bypasses all checks.
int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
{
  if (perms.uid() == 0)
    return 0;

  // Non-owner and group bits present: consult POSIX ACLs first.
  // -EAGAIN means no ACL applied; fall through to mode-bit checking.
  if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
    int ret = _posix_acl_permission(in, perms, want);
    if (ret != -EAGAIN)
      return ret;
  }

  // check permissions before doing anything else
  if (!in->check_mode(perms, want))
    return -EACCES;
  return 0;
}
5321
// Permission check for xattr access: "system." attributes may only be
// written by root or the file owner; all other attributes follow the
// normal inode permission rules for `want`.
int Client::xattr_permission(Inode *in, const char *name, unsigned want,
			     const UserPerm& perms)
{
  int r = _getattr_for_perm(in, perms);  // make sure mode/ACLs are fresh
  if (r < 0)
    goto out;

  r = 0;
  if (strncmp(name, "system.", 7) == 0) {
    if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
      r = -EPERM;
  } else {
    r = inode_permission(in, perms, want);
  }
out:
  ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5340
5341ostream& operator<<(ostream &out, const UserPerm& perm) {
5342 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5343 return out;
5344}
5345
// Check whether `perms` may apply the setattr described by (stx, mask)
// to `in`, mirroring kernel setattr permission semantics.  May clear
// S_ISGID in stx->stx_mode as a side effect.  Returns 0 if allowed,
// -EPERM/-EACCES otherwise.
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    // truncation requires write access
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  r = -EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // only root may chown; the owner may "change" uid to the same value
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // owner may chgrp to a group they belong to (or keep the same gid)
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
      			   (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // clear setgid when the caller isn't in the file's (new) group
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // a non-owner may only set times to "now", and then only with
      // write permission on the inode
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5402
// Check whether `perms` may open `in` with the given open(2) flags.
// Maps O_ACCMODE/O_TRUNC to MAY_* bits, rejects opening symlinks
// (-ELOOP) and write-opens of directories (-EISDIR), then defers to
// inode_permission().
int Client::may_open(Inode *in, int flags, const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  unsigned want = 0;

  if ((flags & O_ACCMODE) == O_WRONLY)
    want = MAY_WRITE;
  else if ((flags & O_ACCMODE) == O_RDWR)
    want = MAY_READ | MAY_WRITE;
  else if ((flags & O_ACCMODE) == O_RDONLY)
    want = MAY_READ;
  if (flags & O_TRUNC)
    want |= MAY_WRITE;

  int r = 0;
  switch (in->mode & S_IFMT) {
  case S_IFLNK:
    r = -ELOOP;
    goto out;
  case S_IFDIR:
    if (want & MAY_WRITE) {
      r = -EISDIR;
      goto out;
    }
    break;
  }

  r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  r = inode_permission(in, perms, want);
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5439
5440int Client::may_lookup(Inode *dir, const UserPerm& perms)
5441{
181888fb 5442 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5443 int r = _getattr_for_perm(dir, perms);
5444 if (r < 0)
5445 goto out;
5446
5447 r = inode_permission(dir, perms, MAY_EXEC);
5448out:
5449 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5450 return r;
5451}
5452
5453int Client::may_create(Inode *dir, const UserPerm& perms)
5454{
181888fb 5455 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5456 int r = _getattr_for_perm(dir, perms);
5457 if (r < 0)
5458 goto out;
5459
5460 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5461out:
5462 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5463 return r;
5464}
5465
5466int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5467{
181888fb 5468 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
7c673cae
FG
5469 int r = _getattr_for_perm(dir, perms);
5470 if (r < 0)
5471 goto out;
5472
5473 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5474 if (r < 0)
5475 goto out;
5476
5477 /* 'name == NULL' means rmsnap */
5478 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5479 InodeRef otherin;
5480 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5481 if (r < 0)
5482 goto out;
5483 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5484 r = -EPERM;
5485 }
5486out:
5487 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5488 return r;
5489}
5490
5491int Client::may_hardlink(Inode *in, const UserPerm& perms)
5492{
181888fb 5493 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5494 int r = _getattr_for_perm(in, perms);
5495 if (r < 0)
5496 goto out;
5497
5498 if (perms.uid() == 0 || perms.uid() == in->uid) {
5499 r = 0;
5500 goto out;
5501 }
5502
5503 r = -EPERM;
5504 if (!S_ISREG(in->mode))
5505 goto out;
5506
5507 if (in->mode & S_ISUID)
5508 goto out;
5509
5510 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5511 goto out;
5512
5513 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5514out:
5515 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5516 return r;
5517}
5518
5519int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5520{
5521 int mask = CEPH_STAT_CAP_MODE;
5522 bool force = false;
5523 if (acl_type != NO_ACL) {
5524 mask |= CEPH_STAT_CAP_XATTR;
5525 force = in->xattr_version == 0;
5526 }
5527 return _getattr(in, mask, perms, force);
5528}
5529
5530vinodeno_t Client::_get_vino(Inode *in)
5531{
5532 /* The caller must hold the client lock */
5533 return vinodeno_t(in->ino, in->snapid);
5534}
5535
7c673cae
FG
/**
 * Resolve an MDS spec to a list of MDS daemon GIDs.
 *
 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
 * It may be '*' in which case it matches all GIDs.
 *
 * If no error is returned, the `targets` vector will be populated with at least
 * one MDS.
 */
int Client::resolve_mds(
    const std::string &mds_spec,
    std::vector<mds_gid_t> *targets)
{
  ceph_assert(fsmap);
  ceph_assert(targets != nullptr);

  // Resolution order: role (rank / fs:rank), then numeric GID, then the
  // '*' wildcard, and finally a daemon name.
  mds_role_t role;
  std::stringstream ss;
  int role_r = fsmap->parse_role(mds_spec, &role, ss);
  if (role_r == 0) {
    // We got a role, resolve it to a GID
    ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
      << role << "'" << dendl;
    targets->push_back(
	fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
    return 0;
  }

  std::string strtol_err;
  long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
  if (strtol_err.empty()) {
    // It is a possible GID
    const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
    if (fsmap->gid_exists(mds_gid)) {
      ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
      targets->push_back(mds_gid);
    } else {
      lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
                 << dendl;
      return -ENOENT;
    }
  } else if (mds_spec == "*") {
    // It is a wildcard: use all MDSs
    const auto mds_info = fsmap->get_mds_info();

    if (mds_info.empty()) {
      lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
      return -ENOENT;
    }

    for (const auto i : mds_info) {
      targets->push_back(i.first);
    }
  } else {
    // It did not parse as an integer, it is not a wildcard, it must be a name
    const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
    if (mds_gid == 0) {
      lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;

      lderr(cct) << "FSMap: " << *fsmap << dendl;

      return -ENOENT;
    } else {
      ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
		     << "' to GID " << mds_gid << dendl;
      targets->push_back(mds_gid);
    }
  }

  return 0;
}
5607
5608
/**
 * Authenticate with mon and establish global ID
 *
 * Must be called with client_lock held; the lock is temporarily dropped
 * around the blocking monclient->authenticate() call.  On success, sets
 * our entity name from the mon-assigned global id.  Returns 0 or a
 * negative errno.
 */
int Client::authenticate()
{
  ceph_assert(client_lock.is_locked_by_me());

  if (monclient->is_authenticated()) {
    return 0;
  }

  // authenticate() blocks on the network; release client_lock meanwhile
  client_lock.Unlock();
  int r = monclient->authenticate(cct->_conf->client_mount_timeout);
  client_lock.Lock();
  if (r < 0) {
    return r;
  }

  whoami = monclient->get_global_id();
  messenger->set_myname(entity_name_t::CLIENT(whoami.v));

  return 0;
}
5632
// Fetch the latest FSMap (user == true: the trimmed "fsmap.user"
// variant; otherwise the full FSMap) via a mon subscription, blocking
// until a map at least as new as the mon's current version arrives.
// Called with client_lock held; drops it while waiting on the mon.
int Client::fetch_fsmap(bool user)
{
  int r;
  // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  do {
    C_SaferCond cond;
    monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
    client_lock.Unlock();
    r = cond.wait();
    client_lock.Lock();
  } while (r == -EAGAIN);

  if (r < 0) {
    lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
    return r;
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      // one-shot subscription; handle_fsmap_user wakes waiting_for_fsmap
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap_user);
    ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap);
    ceph_assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5676
/**
 * Send an administrative command to one or more MDS daemons.
 *
 * @mds_spec one of ID, rank, GID, "*"
 *
 * Authenticates, fetches the FSMap to resolve the spec, filters out
 * laggy daemons, and sends an MCommand to each remaining target.
 * `onfinish` fires once every target has replied (handle_command_reply
 * fills *outbl / *outs).  Returns 0 on dispatch, negative errno on
 * failure to resolve/authenticate.
 */
int Client::mds_command(
    const std::string &mds_spec,
    const vector<string>& cmd,
    const bufferlist& inbl,
    bufferlist *outbl,
    string *outs,
    Context *onfinish)
{
  std::lock_guard lock(client_lock);

  if (!initialized)
    return -ENOTCONN;

  int r;
  r = authenticate();
  if (r < 0) {
    return r;
  }

  r = fetch_fsmap(false);
  if (r < 0) {
    return r;
  }

  // Look up MDS target(s) of the command
  std::vector<mds_gid_t> targets;
  r = resolve_mds(mds_spec, &targets);
  if (r < 0) {
    return r;
  }

  // If daemons are laggy, we won't send them commands. If all
  // are laggy then we fail.
  std::vector<mds_gid_t> non_laggy;
  for (const auto gid : targets) {
    const auto info = fsmap->get_info_gid(gid);
    if (!info.laggy()) {
      non_laggy.push_back(gid);
    }
  }
  if (non_laggy.size() == 0) {
    *outs = "All targeted MDS daemons are laggy";
    return -ENOENT;
  }

  if (metadata.empty()) {
    // We are called on an unmounted client, so metadata
    // won't be initialized yet.
    populate_metadata("");
  }

  // Send commands to targets
  C_GatherBuilder gather(cct, onfinish);
  for (const auto target_gid : non_laggy) {
    const auto info = fsmap->get_info_gid(target_gid);

    // Open a connection to the target MDS
    ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());

    // Generate MDSCommandOp state
    auto &op = command_table.start_command();

    op.on_finish = gather.new_sub();
    op.cmd = cmd;
    op.outbl = outbl;
    op.outs = outs;
    op.inbl = inbl;
    op.mds_gid = target_gid;
    op.con = conn;

    ldout(cct, 4) << __func__ << ": new command op to " << target_gid
      << " tid=" << op.tid << cmd << dendl;

    // Construct and send MCommand
    auto m = op.get_message(monclient->get_fsid());
    conn->send_message2(std::move(m));
  }
  gather.activate();

  return 0;
}
5763
11fdf7f2 5764void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
7c673cae
FG
5765{
5766 ceph_tid_t const tid = m->get_tid();
5767
5768 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5769
5770 if (!command_table.exists(tid)) {
5771 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
7c673cae
FG
5772 return;
5773 }
5774
5775 auto &op = command_table.get_command(tid);
5776 if (op.outbl) {
11fdf7f2 5777 *op.outbl = m->get_data();
7c673cae
FG
5778 }
5779 if (op.outs) {
5780 *op.outs = m->rs;
5781 }
5782
5783 if (op.on_finish) {
5784 op.on_finish->complete(m->r);
5785 }
5786
5787 command_table.erase(tid);
7c673cae
FG
5788}
5789
5790// -------------------
5791// MOUNT
5792
// Authenticate and subscribe to MDSMap updates from the monitors.  If a
// filesystem name is given (argument or client_mds_namespace config),
// resolves it via the user FSMap and subscribes to that filesystem's
// map ("mdsmap.<fscid>"); otherwise subscribes to the default "mdsmap".
// Returns 0 or a negative errno (-ENOENT if the fs name is unknown).
int Client::subscribe_mdsmap(const std::string &fs_name)
{
  int r = authenticate();
  if (r < 0) {
    lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  // explicit argument wins over the config option
  std::string resolved_fs_name;
  if (fs_name.empty()) {
    resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
  } else {
    resolved_fs_name = fs_name;
  }

  std::string want = "mdsmap";
  if (!resolved_fs_name.empty()) {
    r = fetch_fsmap(true);
    if (r < 0)
      return r;
    fscid = fsmap_user->get_fs_cid(resolved_fs_name);
    if (fscid == FS_CLUSTER_ID_NONE) {
      return -ENOENT;  // no filesystem by that name
    }

    std::ostringstream oss;
    oss << want << "." << fscid;
    want = oss.str();
  }
  ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;

  monclient->sub_want(want, 0, 0);
  monclient->renew_subs();

  return 0;
}
5829
/**
 * Mount the filesystem: subscribe to the MDS map, optionally wait for an
 * available MDS cluster, then resolve the mount root and pin it.
 *
 * @param mount_root path to use as the root of this mount ("" means "/")
 * @param perms      credentials used for the root getattr walk
 * @param require_mds if true, block until the MDS cluster is available
 *                    (or fail with CEPH_FUSE_NO_MDS_UP if it is stuck)
 * @param fs_name    filesystem name passed through to subscribe_mdsmap()
 * @return 0 on success (idempotent if already mounted), negative errno
 *         or CEPH_FUSE_NO_MDS_UP on failure.
 */
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds, const std::string &fs_name)
{
  std::lock_guard lock(client_lock);

  if (mounted) {
    ldout(cct, 5) << "already mounted" << dendl;
    return 0;
  }

  unmounting = false;

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  tick(); // start tick

  if (require_mds) {
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  // getattr the mount point, then walk up toward "/" one component at a
  // time so that the ancestors of the mount point are cached too (needed
  // for quota handling); an EACCES on an ancestor is tolerated.
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -EACCES && root) {
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  // the getattr above must have instantiated the root inode
  ceph_assert(root);
  _ll_get(root);

  mounted = true;

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */
  return 0;
}
5923
5924// UNMOUNT
5925
// Close every MDS session and block until all are gone.  Caller holds
// client_lock.  We re-scan mds_sessions on each pass because entries are
// removed asynchronously (by the message handlers) as close-acks arrive,
// and those handlers signal mount_cond to wake us.
void Client::_close_sessions()
{
  while (!mds_sessions.empty()) {
    // send session closes!
    for (auto &p : mds_sessions) {
      // skip sessions we already asked to close on a previous pass
      if (p.second.state != MetaSession::STATE_CLOSING) {
	_close_mds_session(&p.second);
      }
    }

    // wait for sessions to close
    ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
    mount_cond.Wait(client_lock);
  }
}
5941
31f18b77
FG
5942void Client::flush_mdlog_sync()
5943{
5944 if (mds_requests.empty())
5945 return;
11fdf7f2
TL
5946 for (auto &p : mds_sessions) {
5947 flush_mdlog(&p.second);
31f18b77
FG
5948 }
5949}
5950
5951void Client::flush_mdlog(MetaSession *session)
5952{
5953 // Only send this to Luminous or newer MDS daemons, older daemons
5954 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5955 const uint64_t features = session->con->get_features();
5956 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
11fdf7f2
TL
5957 auto m = MClientSession::create(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
5958 session->con->send_message2(std::move(m));
31f18b77
FG
5959 }
5960}
5961
5962
11fdf7f2
TL
// Abort all in-flight MDS requests with `err` and force-close every
// session without the usual close handshake.  Used when tearing down an
// unusable connection (e.g. blacklisted / abort_conn).
void Client::_abort_mds_sessions(int err)
{
  for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
    auto req = p->second;
    // advance before touching the request: aborting may erase it
    ++p;
    // unsafe requests will be removed during close session below.
    if (req->got_unsafe)
      continue;

    req->abort(err);
    if (req->caller_cond) {
      req->kick = true;
      req->caller_cond->Signal();
    }
  }

  // Process aborts on any requests that were on this waitlist.
  // Any requests that were on a waiting_for_open session waitlist
  // will get kicked during close session below.
  signal_cond_list(waiting_for_mdsmap);

  // Force-close all sessions
  while(!mds_sessions.empty()) {
    // _closed_mds_session erases the entry, so always take begin()
    auto& session = mds_sessions.begin()->second;
    _closed_mds_session(&session);
  }
}
5990
/**
 * Tear down the mount.
 *
 * @param abort when true (or when blacklisted), skip flushing: in-flight
 *        requests and dirty caps/data are discarded instead of written
 *        back, and sessions are force-closed.
 *
 * Caller holds client_lock.  The teardown order matters: requests are
 * drained first, then open files/dirs are released, then buffered data
 * and dirty caps are flushed (or purged), then the cache is emptied, and
 * only then are the MDS sessions closed.
 */
void Client::_unmount(bool abort)
{
  if (unmounting)
    return;

  if (abort || blacklisted) {
    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blacklisted)") << dendl;
  } else {
    ldout(cct, 2) << "unmounting" << dendl;
  }
  unmounting = true;

  deleg_timeout = 0;

  if (abort) {
    // Abort all mds sessions
    _abort_mds_sessions(-ENOTCONN);

    objecter->op_cancel_writes(-ENOTCONN);
  } else {
    // flush the mdlog for pending requests, if any
    flush_mdlog_sync();
  }

  // wait for outstanding requests to drain (they signal mount_cond)
  while (!mds_requests.empty()) {
    ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
    mount_cond.Wait(client_lock);
  }

  if (tick_event)
    timer.cancel_event(tick_event);
  tick_event = 0;

  cwd.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  while (unsafe_sync_write > 0) {
    ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
    mount_cond.Wait(client_lock);
  }

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    std::list<InodeRef> anchor;
    for (auto& p : inode_map) {
      Inode *in = p.second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
	ceph_assert(in);
      }

      // prevent inode from getting freed
      anchor.emplace_back(in);

      if (abort || blacklisted) {
        objectcacher->purge_set(&in->oset);
      } else if (!in->caps.empty()) {
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  if (abort || blacklisted) {
    // connection is dead: dirty caps can never be written back, drop them
    for (auto p = dirty_list.begin(); !p.end(); ) {
      Inode *in = *p;
      ++p;
      if (in->dirty_caps) {
	ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
	in->mark_caps_clean();
	put_inode(in);
      }
    }
  } else {
    flush_caps_sync();
    wait_sync_caps(last_flush_tid);
  }

  // empty lru cache
  trim_cache();

  // wait for remaining inodes to be released; periodically dump the
  // cache for debugging if this takes longer than 5 seconds per pass
  while (lru.lru_get_size() > 0 ||
	 !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
		  << "+" << inode_map.size() << " items"
		  << ", waiting (for caps to release?)"
		  << dendl;
    utime_t until = ceph_clock_now() + utime_t(5, 0);
    int r = mount_cond.WaitUntil(client_lock, until);
    if (r == ETIMEDOUT) {
      dump_cache(NULL);
    }
  }
  ceph_assert(lru.lru_get_size() == 0);
  ceph_assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  _close_sessions();

  mounted = false;

  ldout(cct, 2) << "unmounted." << dendl;
}
6122
b32b8144
FG
// Clean unmount: flush dirty data/caps and close sessions gracefully.
void Client::unmount()
{
  std::lock_guard lock(client_lock);
  _unmount(false);
}
6128
// Abortive teardown: discard in-flight requests and dirty state instead
// of flushing (used when the connection is unusable, e.g. blacklisted).
void Client::abort_conn()
{
  std::lock_guard lock(client_lock);
  _unmount(true);
}
6134
7c673cae
FG
6135void Client::flush_cap_releases()
6136{
6137 // send any cap releases
11fdf7f2
TL
6138 for (auto &p : mds_sessions) {
6139 auto &session = p.second;
6140 if (session.release && mdsmap->is_clientreplay_or_active_or_stopping(
6141 p.first)) {
7c673cae
FG
6142 if (cct->_conf->client_inject_release_failure) {
6143 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
7c673cae 6144 } else {
11fdf7f2 6145 session.con->send_message2(std::move(session.release));
7c673cae 6146 }
11fdf7f2 6147 session.release.reset();
7c673cae
FG
6148 }
6149 }
6150}
6151
/**
 * Periodic housekeeping, re-armed via the timer every
 * client_tick_interval seconds.  Responsibilities: time out mount-phase
 * requests, renew caps when a third of the session timeout has elapsed,
 * push queued cap releases, flush delayed caps, and trim the cache.
 * Runs under client_lock (the Timer takes it for the callback).
 */
void Client::tick()
{
  if (cct->_conf->client_debug_inject_tick_delay > 0) {
    // test hook: stall once, then self-reset the option so the delay
    // only happens for a single tick
    sleep(cct->_conf->client_debug_inject_tick_delay);
    ceph_assert(0 == cct->_conf.set_val("client_debug_inject_tick_delay", "0"));
    cct->_conf.apply_changes(nullptr);
  }

  ldout(cct, 21) << "tick" << dendl;
  tick_event = timer.add_event_after(
    cct->_conf->client_tick_interval,
    new FunctionContext([this](int) {
	// Called back via Timer, which takes client_lock for us
	ceph_assert(client_lock.is_locked_by_me());
	tick();
      }));
  utime_t now = ceph_clock_now();

  if (!mounted && !mds_requests.empty()) {
    // while mounting, abort the oldest request if it has exceeded
    // client_mount_timeout and wake anyone waiting on it
    MetaRequest *req = mds_requests.begin()->second;
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->Signal();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (auto &p : mds_sessions) {
	signal_context_list(p.second.waiting_for_open);
      }
    }
  }

  if (mdsmap->get_epoch()) {
    // renew caps?
    utime_t el = now - last_cap_renew;
    if (el > mdsmap->get_session_timeout() / 3.0)
      renew_caps();

    flush_cap_releases();
  }

  // delayed caps: list is ordered by hold_caps_until, so stop at the
  // first inode whose hold time is still in the future
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    check_caps(in, CHECK_CAPS_NODELAY);
  }

  trim_cache(true);
}
6207
6208void Client::renew_caps()
6209{
6210 ldout(cct, 10) << "renew_caps()" << dendl;
6211 last_cap_renew = ceph_clock_now();
6212
11fdf7f2
TL
6213 for (auto &p : mds_sessions) {
6214 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6215 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
6216 renew_caps(&p.second);
7c673cae
FG
6217 }
6218}
6219
6220void Client::renew_caps(MetaSession *session)
6221{
6222 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6223 session->last_cap_renew_request = ceph_clock_now();
6224 uint64_t seq = ++session->cap_renew_seq;
11fdf7f2 6225 session->con->send_message2(MClientSession::create(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
7c673cae
FG
6226}
6227
6228
6229// ===============================================================
6230// high level (POSIXy) interface
6231
6232int Client::_do_lookup(Inode *dir, const string& name, int mask,
6233 InodeRef *target, const UserPerm& perms)
6234{
6235 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6236 MetaRequest *req = new MetaRequest(op);
6237 filepath path;
6238 dir->make_nosnap_relative_path(path);
6239 path.push_dentry(name);
6240 req->set_filepath(path);
6241 req->set_inode(dir);
6242 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6243 mask |= DEBUG_GETATTR_CAPS;
6244 req->head.args.getattr.mask = mask;
6245
11fdf7f2 6246 ldout(cct, 10) << __func__ << " on " << path << dendl;
7c673cae
FG
6247
6248 int r = make_request(req, perms, target);
11fdf7f2 6249 ldout(cct, 10) << __func__ << " res is " << r << dendl;
7c673cae
FG
6250 return r;
6251}
6252
/**
 * Resolve one path component `dname` under `dir`, preferring the local
 * dentry cache when our leases/caps make it trustworthy, and falling
 * back to an MDS lookup (_do_lookup) otherwise.
 *
 * Handles the special names "." and "..", the snapdir, and may conclude
 * ENOENT locally when we hold FILE_SHARED on a complete directory.
 *
 * @param mask caps required to trust a cached inode's attributes
 * @param target receives the resolved inode on success
 * @return 0 on success; -ENOENT, -ENOTDIR, -ENAMETOOLONG or an MDS
 *         error otherwise.
 */
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
		    const UserPerm& perms)
{
  int r = 0;
  Dentry *dn = NULL;

  if (dname == "..") {
    if (dir->dentries.empty()) {
      // no cached parent link: ask a (random) MDS for the parent
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
      filepath path(dir->ino);
      req->set_filepath(path);

      InodeRef tmptarget;
      int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());

      if (r == 0) {
	Inode *tempino = tmptarget.get();
	_ll_get(tempino);
	*target = tempino;
	ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
      } else {
	// on failure, ".." of an unlinked dir resolves to the dir itself
	*target = dir;
      }
    }
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (!dir->is_dir()) {
    r = -ENOTDIR;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -ENAMETOOLONG;
    goto done;
  }

  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << __func__ << " have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
		   << " seq " << dn->lease_seq
		   << dendl;

    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      // is dn lease valid?
      utime_t now = ceph_clock_now();
      if (dn->lease_mds >= 0 &&
	  dn->lease_ttl > now &&
	  mds_sessions.count(dn->lease_mds)) {
	MetaSession &s = mds_sessions.at(dn->lease_mds);
	// the lease is only valid if the issuing session's caps are
	// still live and its generation matches the lease's
	if (s.cap_ttl > now &&
	    s.cap_gen == dn->lease_gen) {
	  // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
	  // make trim_caps() behave.
	  dir->try_touch_cap(dn->lease_mds);
	  goto hit_dn;
	}
	ldout(cct, 20) << " bad lease, cap_ttl " << s.cap_ttl << ", cap_gen " << s.cap_gen
		       << " vs lease_gen " << dn->lease_gen << dendl;
      }
      // dir lease?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
	if (dn->cap_shared_gen == dir->shared_gen &&
	    (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
	  goto hit_dn;
	// a null dentry in a complete dir proves the name is absent
	if (!dn->inode && (dir->flags & I_COMPLETE)) {
	  ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
			 << *dir << " dn '" << dname << "'" << dendl;
	  return -ENOENT;
	}
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
	(dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -ENOENT;
    }
  }

  // cache miss or untrusted cache: go to the MDS
  r = _do_lookup(dir, dname, mask, target, perms);
  goto done;

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
  } else {
    r = -ENOENT;
  }
  touch_dn(dn);

 done:
  if (r < 0)
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6369
6370int Client::get_or_create(Inode *dir, const char* name,
6371 Dentry **pdn, bool expect_null)
6372{
6373 // lookup
11fdf7f2 6374 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
7c673cae
FG
6375 dir->open_dir();
6376 if (dir->dir->dentries.count(name)) {
6377 Dentry *dn = dir->dir->dentries[name];
6378
6379 // is dn lease valid?
6380 utime_t now = ceph_clock_now();
6381 if (dn->inode &&
6382 dn->lease_mds >= 0 &&
6383 dn->lease_ttl > now &&
6384 mds_sessions.count(dn->lease_mds)) {
11fdf7f2
TL
6385 MetaSession &s = mds_sessions.at(dn->lease_mds);
6386 if (s.cap_ttl > now &&
6387 s.cap_gen == dn->lease_gen) {
7c673cae
FG
6388 if (expect_null)
6389 return -EEXIST;
6390 }
6391 }
6392 *pdn = dn;
6393 } else {
6394 // otherwise link up a new one
6395 *pdn = link(dir->dir, name, NULL, NULL);
6396 }
6397
6398 // success
6399 return 0;
6400}
6401
/**
 * Walk `origpath` component by component from root (absolute) or cwd
 * (relative), resolving symlinks along the way.
 *
 * @param end receives the final inode (may be NULL if caller only wants
 *        the walk's result code)
 * @param followsym whether to follow a symlink in the *final* component;
 *        symlinks in intermediate (directory) positions are always
 *        followed
 * @param mask extra caps requested on the last component's lookup
 * @return 0 on success; -ELOOP after MAXSYMLINKS resolutions, -ENOENT,
 *         or any error from may_lookup/_lookup.
 */
int Client::path_walk(const filepath& origpath, InodeRef *end,
		      const UserPerm& perms, bool followsym, int mask)
{
  filepath path = origpath;
  InodeRef cur;
  if (origpath.absolute())
    cur = root;
  else
    cur = cwd;
  ceph_assert(cur);

  ldout(cct, 10) << __func__ << " " << path << dendl;

  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym.  always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	// an absolute target restarts the walk from root
	if (next->symlink[0] == '/') {
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to.  remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -ENOENT;
  if (end)
    end->swap(cur);
  return 0;
}
6483
6484
6485// namespace ops
6486
6487int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6488{
11fdf7f2 6489 std::lock_guard lock(client_lock);
7c673cae
FG
6490 tout(cct) << "link" << std::endl;
6491 tout(cct) << relexisting << std::endl;
6492 tout(cct) << relpath << std::endl;
6493
181888fb
FG
6494 if (unmounting)
6495 return -ENOTCONN;
6496
7c673cae
FG
6497 filepath existing(relexisting);
6498
6499 InodeRef in, dir;
6500 int r = path_walk(existing, &in, perm, true);
6501 if (r < 0)
6502 return r;
6503 if (std::string(relpath) == "/") {
6504 r = -EEXIST;
6505 return r;
6506 }
6507 filepath path(relpath);
6508 string name = path.last_dentry();
6509 path.pop_dentry();
6510
6511 r = path_walk(path, &dir, perm, true);
6512 if (r < 0)
6513 return r;
6514 if (cct->_conf->client_permissions) {
6515 if (S_ISDIR(in->mode)) {
6516 r = -EPERM;
6517 return r;
6518 }
6519 r = may_hardlink(in.get(), perm);
6520 if (r < 0)
6521 return r;
6522 r = may_create(dir.get(), perm);
6523 if (r < 0)
6524 return r;
6525 }
6526 r = _link(in.get(), dir.get(), name.c_str(), perm);
6527 return r;
6528}
6529
6530int Client::unlink(const char *relpath, const UserPerm& perm)
6531{
11fdf7f2
TL
6532 std::lock_guard lock(client_lock);
6533 tout(cct) << __func__ << std::endl;
7c673cae
FG
6534 tout(cct) << relpath << std::endl;
6535
181888fb
FG
6536 if (unmounting)
6537 return -ENOTCONN;
6538
7c673cae
FG
6539 if (std::string(relpath) == "/")
6540 return -EISDIR;
6541
6542 filepath path(relpath);
6543 string name = path.last_dentry();
6544 path.pop_dentry();
6545 InodeRef dir;
6546 int r = path_walk(path, &dir, perm);
6547 if (r < 0)
6548 return r;
6549 if (cct->_conf->client_permissions) {
6550 r = may_delete(dir.get(), name.c_str(), perm);
6551 if (r < 0)
6552 return r;
6553 }
6554 return _unlink(dir.get(), name.c_str(), perm);
6555}
6556
6557int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6558{
11fdf7f2
TL
6559 std::lock_guard lock(client_lock);
6560 tout(cct) << __func__ << std::endl;
7c673cae
FG
6561 tout(cct) << relfrom << std::endl;
6562 tout(cct) << relto << std::endl;
6563
181888fb
FG
6564 if (unmounting)
6565 return -ENOTCONN;
6566
7c673cae
FG
6567 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6568 return -EBUSY;
6569
6570 filepath from(relfrom);
6571 filepath to(relto);
6572 string fromname = from.last_dentry();
6573 from.pop_dentry();
6574 string toname = to.last_dentry();
6575 to.pop_dentry();
6576
6577 InodeRef fromdir, todir;
6578 int r = path_walk(from, &fromdir, perm);
6579 if (r < 0)
6580 goto out;
6581 r = path_walk(to, &todir, perm);
6582 if (r < 0)
6583 goto out;
6584
6585 if (cct->_conf->client_permissions) {
6586 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6587 if (r < 0)
6588 return r;
6589 r = may_delete(todir.get(), toname.c_str(), perm);
6590 if (r < 0 && r != -ENOENT)
6591 return r;
6592 }
6593 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6594out:
6595 return r;
6596}
6597
6598// dirs
6599
6600int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6601{
11fdf7f2
TL
6602 std::lock_guard lock(client_lock);
6603 tout(cct) << __func__ << std::endl;
7c673cae
FG
6604 tout(cct) << relpath << std::endl;
6605 tout(cct) << mode << std::endl;
11fdf7f2 6606 ldout(cct, 10) << __func__ << ": " << relpath << dendl;
7c673cae 6607
181888fb
FG
6608 if (unmounting)
6609 return -ENOTCONN;
6610
7c673cae
FG
6611 if (std::string(relpath) == "/")
6612 return -EEXIST;
6613
6614 filepath path(relpath);
6615 string name = path.last_dentry();
6616 path.pop_dentry();
6617 InodeRef dir;
6618 int r = path_walk(path, &dir, perm);
6619 if (r < 0)
6620 return r;
6621 if (cct->_conf->client_permissions) {
6622 r = may_create(dir.get(), perm);
6623 if (r < 0)
6624 return r;
6625 }
6626 return _mkdir(dir.get(), name.c_str(), mode, perm);
6627}
6628
/**
 * Create `relpath` and all missing intermediate directories
 * ("mkdir -p"): first walk as far as the path already exists, then
 * create the remaining components one by one, tolerating races with
 * other creators (-EEXIST on intermediate components).
 *
 * @return 0 on success, negative errno on the first hard failure.
 */
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  if (unmounting)
    return -ENOTCONN;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;
  // NOTE(review): the walk starts at cwd and path_walk is not used, so an
  // absolute relpath appears to be resolved relative to cwd as well —
  // confirm against callers before relying on absolute-path behavior.
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
	break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  // only -ENOENT means "stop walking, start creating"; anything else
  // (including 0, when the whole path already exists) is returned as-is
  if (r!=-ENOENT) return r;
  ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
	return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    // a racing creator made an intermediate component: look it up and
    // continue (a racing -EEXIST on the *last* component is an error)
    if(-EEXIST == r && i < path.depth() - 1) {
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << __func__ << ": successfully created directory "
		   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
6683
6684int Client::rmdir(const char *relpath, const UserPerm& perms)
6685{
11fdf7f2
TL
6686 std::lock_guard lock(client_lock);
6687 tout(cct) << __func__ << std::endl;
7c673cae
FG
6688 tout(cct) << relpath << std::endl;
6689
181888fb
FG
6690 if (unmounting)
6691 return -ENOTCONN;
6692
7c673cae
FG
6693 if (std::string(relpath) == "/")
6694 return -EBUSY;
6695
6696 filepath path(relpath);
6697 string name = path.last_dentry();
6698 path.pop_dentry();
6699 InodeRef dir;
6700 int r = path_walk(path, &dir, perms);
6701 if (r < 0)
6702 return r;
6703 if (cct->_conf->client_permissions) {
6704 int r = may_delete(dir.get(), name.c_str(), perms);
6705 if (r < 0)
6706 return r;
6707 }
6708 return _rmdir(dir.get(), name.c_str(), perms);
6709}
6710
6711int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6712{
11fdf7f2
TL
6713 std::lock_guard lock(client_lock);
6714 tout(cct) << __func__ << std::endl;
7c673cae
FG
6715 tout(cct) << relpath << std::endl;
6716 tout(cct) << mode << std::endl;
6717 tout(cct) << rdev << std::endl;
6718
181888fb
FG
6719 if (unmounting)
6720 return -ENOTCONN;
6721
7c673cae
FG
6722 if (std::string(relpath) == "/")
6723 return -EEXIST;
6724
6725 filepath path(relpath);
6726 string name = path.last_dentry();
6727 path.pop_dentry();
6728 InodeRef dir;
6729 int r = path_walk(path, &dir, perms);
6730 if (r < 0)
6731 return r;
6732 if (cct->_conf->client_permissions) {
6733 int r = may_create(dir.get(), perms);
6734 if (r < 0)
6735 return r;
6736 }
6737 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6738}
6739
6740// symlinks
6741
6742int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6743{
11fdf7f2
TL
6744 std::lock_guard lock(client_lock);
6745 tout(cct) << __func__ << std::endl;
7c673cae
FG
6746 tout(cct) << target << std::endl;
6747 tout(cct) << relpath << std::endl;
6748
181888fb
FG
6749 if (unmounting)
6750 return -ENOTCONN;
6751
7c673cae
FG
6752 if (std::string(relpath) == "/")
6753 return -EEXIST;
6754
6755 filepath path(relpath);
6756 string name = path.last_dentry();
6757 path.pop_dentry();
6758 InodeRef dir;
6759 int r = path_walk(path, &dir, perms);
6760 if (r < 0)
6761 return r;
6762 if (cct->_conf->client_permissions) {
6763 int r = may_create(dir.get(), perms);
6764 if (r < 0)
6765 return r;
6766 }
6767 return _symlink(dir.get(), name.c_str(), target, perms);
6768}
6769
6770int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6771{
11fdf7f2
TL
6772 std::lock_guard lock(client_lock);
6773 tout(cct) << __func__ << std::endl;
7c673cae
FG
6774 tout(cct) << relpath << std::endl;
6775
181888fb
FG
6776 if (unmounting)
6777 return -ENOTCONN;
6778
7c673cae
FG
6779 filepath path(relpath);
6780 InodeRef in;
6781 int r = path_walk(path, &in, perms, false);
6782 if (r < 0)
6783 return r;
6784
6785 return _readlink(in.get(), buf, size);
6786}
6787
6788int Client::_readlink(Inode *in, char *buf, size_t size)
6789{
6790 if (!in->is_symlink())
6791 return -EINVAL;
6792
6793 // copy into buf (at most size bytes)
6794 int r = in->symlink.length();
6795 if (r > (int)size)
6796 r = size;
6797 memcpy(buf, in->symlink.c_str(), r);
6798 return r;
6799}
6800
6801
6802// inode stuff
6803
6804int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6805{
94b18763 6806 bool yes = in->caps_issued_mask(mask, true);
7c673cae 6807
11fdf7f2 6808 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
7c673cae
FG
6809 if (yes && !force)
6810 return 0;
6811
6812 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6813 filepath path;
6814 in->make_nosnap_relative_path(path);
6815 req->set_filepath(path);
6816 req->set_inode(in);
6817 req->head.args.getattr.mask = mask;
6818
6819 int res = make_request(req, perms);
11fdf7f2 6820 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7c673cae
FG
6821 return res;
6822}
6823
6824int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
6825 const UserPerm& perms, InodeRef *inp)
6826{
6827 int issued = in->caps_issued();
6828
11fdf7f2 6829 ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
7c673cae
FG
6830 ccap_string(issued) << dendl;
6831
6832 if (in->snapid != CEPH_NOSNAP) {
6833 return -EROFS;
6834 }
6835 if ((mask & CEPH_SETATTR_SIZE) &&
6836 (unsigned long)stx->stx_size > in->size &&
6837 is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
6838 perms)) {
6839 return -EDQUOT;
6840 }
6841
6842 // make the change locally?
6843 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
6844 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
6845 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
6846 << " != cap dirtier " << in->cap_dirtier_uid << ":"
6847 << in->cap_dirtier_gid << ", forcing sync setattr"
6848 << dendl;
6849 /*
6850 * This works because we implicitly flush the caps as part of the
6851 * request, so the cap update check will happen with the writeback
6852 * cap context, and then the setattr check will happen with the
6853 * caller's context.
6854 *
6855 * In reality this pattern is likely pretty rare (different users
6856 * setattr'ing the same file). If that turns out not to be the
6857 * case later, we can build a more complex pipelined cap writeback
6858 * infrastructure...
6859 */
6860 if (!mask)
6861 mask |= CEPH_SETATTR_CTIME;
6862 goto force_request;
6863 }
6864
6865 if (!mask) {
6866 // caller just needs us to bump the ctime
6867 in->ctime = ceph_clock_now();
6868 in->cap_dirtier_uid = perms.uid();
6869 in->cap_dirtier_gid = perms.gid();
6870 if (issued & CEPH_CAP_AUTH_EXCL)
28e407b8 6871 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae 6872 else if (issued & CEPH_CAP_FILE_EXCL)
28e407b8 6873 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7c673cae 6874 else if (issued & CEPH_CAP_XATTR_EXCL)
28e407b8 6875 in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
7c673cae
FG
6876 else
6877 mask |= CEPH_SETATTR_CTIME;
6878 }
6879
6880 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
6881 bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
6882
6883 mask &= ~CEPH_SETATTR_KILL_SGUID;
6884
6885 if (mask & CEPH_SETATTR_UID) {
6886 in->ctime = ceph_clock_now();
6887 in->cap_dirtier_uid = perms.uid();
6888 in->cap_dirtier_gid = perms.gid();
6889 in->uid = stx->stx_uid;
28e407b8 6890 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6891 mask &= ~CEPH_SETATTR_UID;
6892 kill_sguid = true;
6893 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
6894 }
6895 if (mask & CEPH_SETATTR_GID) {
6896 in->ctime = ceph_clock_now();
6897 in->cap_dirtier_uid = perms.uid();
6898 in->cap_dirtier_gid = perms.gid();
6899 in->gid = stx->stx_gid;
28e407b8 6900 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6901 mask &= ~CEPH_SETATTR_GID;
6902 kill_sguid = true;
6903 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
6904 }
6905
6906 if (mask & CEPH_SETATTR_MODE) {
6907 in->ctime = ceph_clock_now();
6908 in->cap_dirtier_uid = perms.uid();
6909 in->cap_dirtier_gid = perms.gid();
6910 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
28e407b8 6911 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6912 mask &= ~CEPH_SETATTR_MODE;
6913 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
181888fb 6914 } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
7c673cae 6915 /* Must squash the any setuid/setgid bits with an ownership change */
181888fb 6916 in->mode &= ~(S_ISUID|S_ISGID);
28e407b8 6917 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6918 }
6919
6920 if (mask & CEPH_SETATTR_BTIME) {
6921 in->ctime = ceph_clock_now();
6922 in->cap_dirtier_uid = perms.uid();
6923 in->cap_dirtier_gid = perms.gid();
6924 in->btime = utime_t(stx->stx_btime);
28e407b8 6925 in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
7c673cae
FG
6926 mask &= ~CEPH_SETATTR_BTIME;
6927 ldout(cct,10) << "changing btime to " << in->btime << dendl;
6928 }
6929 } else if (mask & CEPH_SETATTR_SIZE) {
6930 /* If we don't have Ax, then we must ask the server to clear them on truncate */
6931 mask |= CEPH_SETATTR_KILL_SGUID;
6932 }
6933
6934 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
6935 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
6936 if (mask & CEPH_SETATTR_MTIME)
6937 in->mtime = utime_t(stx->stx_mtime);
6938 if (mask & CEPH_SETATTR_ATIME)
6939 in->atime = utime_t(stx->stx_atime);
6940 in->ctime = ceph_clock_now();
6941 in->cap_dirtier_uid = perms.uid();
6942 in->cap_dirtier_gid = perms.gid();
6943 in->time_warp_seq++;
28e407b8 6944 in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
7c673cae
FG
6945 mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
6946 }
6947 }
6948 if (!mask) {
6949 in->change_attr++;
6950 return 0;
6951 }
6952
6953force_request:
6954 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
6955
6956 filepath path;
6957
6958 in->make_nosnap_relative_path(path);
6959 req->set_filepath(path);
6960 req->set_inode(in);
6961
6962 if (mask & CEPH_SETATTR_KILL_SGUID) {
6963 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6964 }
6965 if (mask & CEPH_SETATTR_MODE) {
6966 req->head.args.setattr.mode = stx->stx_mode;
6967 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6968 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
6969 }
6970 if (mask & CEPH_SETATTR_UID) {
6971 req->head.args.setattr.uid = stx->stx_uid;
6972 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6973 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
6974 }
6975 if (mask & CEPH_SETATTR_GID) {
6976 req->head.args.setattr.gid = stx->stx_gid;
6977 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6978 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
6979 }
6980 if (mask & CEPH_SETATTR_BTIME) {
6981 req->head.args.setattr.btime = utime_t(stx->stx_btime);
6982 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6983 }
6984 if (mask & CEPH_SETATTR_MTIME) {
6985 req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
94b18763 6986 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7c673cae
FG
6987 CEPH_CAP_FILE_WR;
6988 }
6989 if (mask & CEPH_SETATTR_ATIME) {
6990 req->head.args.setattr.atime = utime_t(stx->stx_atime);
6991 req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
6992 CEPH_CAP_FILE_WR;
6993 }
6994 if (mask & CEPH_SETATTR_SIZE) {
6995 if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
6996 req->head.args.setattr.size = stx->stx_size;
6997 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
6998 } else { //too big!
6999 put_request(req);
7000 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
7001 return -EFBIG;
7002 }
94b18763 7003 req->inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
7c673cae
FG
7004 CEPH_CAP_FILE_WR;
7005 }
7006 req->head.args.setattr.mask = mask;
7007
7008 req->regetattr_mask = mask;
7009
7010 int res = make_request(req, perms, inp);
7011 ldout(cct, 10) << "_setattr result=" << res << dendl;
7012 return res;
7013}
7014
7015/* Note that we only care about attrs that setattr cares about */
7016void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
7017{
7018 stx->stx_size = st->st_size;
7019 stx->stx_mode = st->st_mode;
7020 stx->stx_uid = st->st_uid;
7021 stx->stx_gid = st->st_gid;
11fdf7f2
TL
7022#ifdef __APPLE__
7023 stx->stx_mtime = st->st_mtimespec;
7024 stx->stx_atime = st->st_atimespec;
7025#else
7c673cae
FG
7026 stx->stx_mtime = st->st_mtim;
7027 stx->stx_atime = st->st_atim;
11fdf7f2 7028#endif
7c673cae
FG
7029}
7030
7031int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7032 const UserPerm& perms, InodeRef *inp)
7033{
7034 int ret = _do_setattr(in, stx, mask, perms, inp);
7035 if (ret < 0)
7036 return ret;
7037 if (mask & CEPH_SETATTR_MODE)
7038 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7039 return ret;
7040}
7041
7042int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
7043 const UserPerm& perms)
7044{
7045 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
7046 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
7047 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
7048 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
7049 if (cct->_conf->client_permissions) {
7050 int r = may_setattr(in.get(), stx, mask, perms);
7051 if (r < 0)
7052 return r;
7053 }
7054 return __setattrx(in.get(), stx, mask, perms);
7055}
7056
7057int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7058 const UserPerm& perms)
7059{
7060 struct ceph_statx stx;
7061
7062 stat_to_statx(attr, &stx);
7063 mask &= ~CEPH_SETATTR_BTIME;
181888fb
FG
7064
7065 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7066 mask &= ~CEPH_SETATTR_UID;
7067 }
7068 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
7069 mask &= ~CEPH_SETATTR_GID;
7070 }
7071
7c673cae
FG
7072 return _setattrx(in, &stx, mask, perms);
7073}
7074
7075int Client::setattr(const char *relpath, struct stat *attr, int mask,
7076 const UserPerm& perms)
7077{
11fdf7f2
TL
7078 std::lock_guard lock(client_lock);
7079 tout(cct) << __func__ << std::endl;
7c673cae
FG
7080 tout(cct) << relpath << std::endl;
7081 tout(cct) << mask << std::endl;
7082
181888fb
FG
7083 if (unmounting)
7084 return -ENOTCONN;
7085
7c673cae
FG
7086 filepath path(relpath);
7087 InodeRef in;
7088 int r = path_walk(path, &in, perms);
7089 if (r < 0)
7090 return r;
7091 return _setattr(in, attr, mask, perms);
7092}
7093
7094int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
7095 const UserPerm& perms, int flags)
7096{
11fdf7f2
TL
7097 std::lock_guard lock(client_lock);
7098 tout(cct) << __func__ << std::endl;
7c673cae
FG
7099 tout(cct) << relpath << std::endl;
7100 tout(cct) << mask << std::endl;
7101
181888fb
FG
7102 if (unmounting)
7103 return -ENOTCONN;
7104
7c673cae
FG
7105 filepath path(relpath);
7106 InodeRef in;
7107 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
7108 if (r < 0)
7109 return r;
7110 return _setattrx(in, stx, mask, perms);
7111}
7112
7113int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
7114{
11fdf7f2
TL
7115 std::lock_guard lock(client_lock);
7116 tout(cct) << __func__ << std::endl;
7c673cae
FG
7117 tout(cct) << fd << std::endl;
7118 tout(cct) << mask << std::endl;
7119
181888fb
FG
7120 if (unmounting)
7121 return -ENOTCONN;
7122
7c673cae
FG
7123 Fh *f = get_filehandle(fd);
7124 if (!f)
7125 return -EBADF;
7126#if defined(__linux__) && defined(O_PATH)
7127 if (f->flags & O_PATH)
7128 return -EBADF;
7129#endif
7130 return _setattr(f->inode, attr, mask, perms);
7131}
7132
7133int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
7134{
11fdf7f2
TL
7135 std::lock_guard lock(client_lock);
7136 tout(cct) << __func__ << std::endl;
7c673cae
FG
7137 tout(cct) << fd << std::endl;
7138 tout(cct) << mask << std::endl;
7139
181888fb
FG
7140 if (unmounting)
7141 return -ENOTCONN;
7142
7c673cae
FG
7143 Fh *f = get_filehandle(fd);
7144 if (!f)
7145 return -EBADF;
7146#if defined(__linux__) && defined(O_PATH)
7147 if (f->flags & O_PATH)
7148 return -EBADF;
7149#endif
7150 return _setattrx(f->inode, stx, mask, perms);
7151}
7152
7153int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
7154 frag_info_t *dirstat, int mask)
7155{
11fdf7f2
TL
7156 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7157 std::lock_guard lock(client_lock);
7c673cae
FG
7158 tout(cct) << "stat" << std::endl;
7159 tout(cct) << relpath << std::endl;
181888fb
FG
7160
7161 if (unmounting)
7162 return -ENOTCONN;
7163
7c673cae
FG
7164 filepath path(relpath);
7165 InodeRef in;
7166 int r = path_walk(path, &in, perms, true, mask);
7167 if (r < 0)
7168 return r;
7169 r = _getattr(in, mask, perms);
7170 if (r < 0) {
11fdf7f2 7171 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
7172 return r;
7173 }
7174 fill_stat(in, stbuf, dirstat);
11fdf7f2 7175 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7c673cae
FG
7176 return r;
7177}
7178
7179unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7180{
7181 unsigned mask = 0;
7182
7183 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7184 if (flags & AT_NO_ATTR_SYNC)
7185 goto out;
7186
7187 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7188 mask |= CEPH_CAP_PIN;
7189 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7190 mask |= CEPH_CAP_AUTH_SHARED;
7191 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7192 mask |= CEPH_CAP_LINK_SHARED;
7193 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7194 mask |= CEPH_CAP_FILE_SHARED;
7195 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7196 mask |= CEPH_CAP_XATTR_SHARED;
7197out:
7198 return mask;
7199}
7200
7201int Client::statx(const char *relpath, struct ceph_statx *stx,
7202 const UserPerm& perms,
7203 unsigned int want, unsigned int flags)
7204{
11fdf7f2
TL
7205 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " want " << want << ")" << dendl;
7206 std::lock_guard lock(client_lock);
7c673cae
FG
7207 tout(cct) << "statx" << std::endl;
7208 tout(cct) << relpath << std::endl;
181888fb
FG
7209
7210 if (unmounting)
7211 return -ENOTCONN;
7212
7c673cae
FG
7213 filepath path(relpath);
7214 InodeRef in;
7215
7216 unsigned mask = statx_to_mask(flags, want);
7217
7218 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7219 if (r < 0)
7220 return r;
7221
7222 r = _getattr(in, mask, perms);
7223 if (r < 0) {
11fdf7f2 7224 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
7225 return r;
7226 }
7227
7228 fill_statx(in, mask, stx);
11fdf7f2 7229 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7c673cae
FG
7230 return r;
7231}
7232
7233int Client::lstat(const char *relpath, struct stat *stbuf,
7234 const UserPerm& perms, frag_info_t *dirstat, int mask)
7235{
11fdf7f2
TL
7236 ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7237 std::lock_guard lock(client_lock);
7238 tout(cct) << __func__ << std::endl;
7c673cae 7239 tout(cct) << relpath << std::endl;
181888fb
FG
7240
7241 if (unmounting)
7242 return -ENOTCONN;
7243
7c673cae
FG
7244 filepath path(relpath);
7245 InodeRef in;
7246 // don't follow symlinks
7247 int r = path_walk(path, &in, perms, false, mask);
7248 if (r < 0)
7249 return r;
7250 r = _getattr(in, mask, perms);
7251 if (r < 0) {
11fdf7f2 7252 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
7c673cae
FG
7253 return r;
7254 }
7255 fill_stat(in, stbuf, dirstat);
11fdf7f2 7256 ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7c673cae
FG
7257 return r;
7258}
7259
// Populate a classic struct stat from the cached inode state.
// Optionally copies the directory fragment stats (dirstat) and recursive
// stats (rstat) out to the caller.  Returns the caps currently issued on
// the inode so the caller can judge how fresh the data is.
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  // Faked inos remap ceph's 64-bit inos into a range usable by 32-bit apps.
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  // The snapshot id doubles as the device number to disambiguate snaps.
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    switch (in->nlink) {
    case 0:
      st->st_nlink = 0; /* dir is unlinked */
      break;
    case 1:
      // Synthesize the POSIX link count for directories: parent dentry,
      // the "." entry, and one per subdirectory ("..").
      st->st_nlink = 1 /* parent dentry */
	+ 1 /* <dir>/. */
	+ in->dirstat.nsubdirs; /* include <dir>/. self-reference */
      break;
    default:
      // Directories never have more than one hard link in CephFS.
      ceph_abort();
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // Report whichever of ctime/mtime is newer as the change time.
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // Directory "size" is either the recursive byte count or the entry
    // count, depending on configuration.
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
    st->st_blocks = 1;
  } else {
    st->st_size = in->size;
    // 512-byte blocks, rounded up.
    st->st_blocks = (in->size + 511) >> 9;
  }
  // Advertise the stripe unit as the preferred I/O size (min 4k).
  st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
7321
// Populate a ceph_statx from the cached inode state.  `mask` is the set of
// caps whose fields are considered valid; only the corresponding statx
// fields are filled and advertised via stx->stx_mask.
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  // Ownership, full mode and birth time are guarded by the AUTH cap.
  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  // Link count is guarded by the LINK cap.
  if (mask & CEPH_CAP_LINK_SHARED) {
    if (in->is_dir()) {
      switch (in->nlink) {
      case 0:
	stx->stx_nlink = 0; /* dir is unlinked */
	break;
      case 1:
	// Synthesize the POSIX link count for directories: parent dentry,
	// the "." entry, and one per subdirectory ("..").
	stx->stx_nlink = 1 /* parent dentry */
	  + 1 /* <dir>/. */
	  + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
	break;
      default:
	// Directories never have more than one hard link in CephFS.
	ceph_abort();
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  // Times and sizes are guarded by the FILE cap.
  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      // Directory "size" is either the recursive byte count or the entry
      // count, depending on configuration.
      if (cct->_conf->client_dirsize_rbytes)
	stx->stx_size = in->rstat.rbytes;
      else
	stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    // Report whichever of ctime/mtime is newer as the change time.
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
7404
// Mark a dentry as recently used so the LRU keeps it cached longer.
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
7409
7410int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7411{
11fdf7f2
TL
7412 std::lock_guard lock(client_lock);
7413 tout(cct) << __func__ << std::endl;
7c673cae
FG
7414 tout(cct) << relpath << std::endl;
7415 tout(cct) << mode << std::endl;
181888fb
FG
7416
7417 if (unmounting)
7418 return -ENOTCONN;
7419
7c673cae
FG
7420 filepath path(relpath);
7421 InodeRef in;
7422 int r = path_walk(path, &in, perms);
7423 if (r < 0)
7424 return r;
7425 struct stat attr;
7426 attr.st_mode = mode;
7427 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7428}
7429
7430int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7431{
11fdf7f2
TL
7432 std::lock_guard lock(client_lock);
7433 tout(cct) << __func__ << std::endl;
7c673cae
FG
7434 tout(cct) << fd << std::endl;
7435 tout(cct) << mode << std::endl;
181888fb
FG
7436
7437 if (unmounting)
7438 return -ENOTCONN;
7439
7c673cae
FG
7440 Fh *f = get_filehandle(fd);
7441 if (!f)
7442 return -EBADF;
7443#if defined(__linux__) && defined(O_PATH)
7444 if (f->flags & O_PATH)
7445 return -EBADF;
7446#endif
7447 struct stat attr;
7448 attr.st_mode = mode;
7449 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7450}
7451
7452int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7453{
11fdf7f2
TL
7454 std::lock_guard lock(client_lock);
7455 tout(cct) << __func__ << std::endl;
7c673cae
FG
7456 tout(cct) << relpath << std::endl;
7457 tout(cct) << mode << std::endl;
181888fb
FG
7458
7459 if (unmounting)
7460 return -ENOTCONN;
7461
7c673cae
FG
7462 filepath path(relpath);
7463 InodeRef in;
7464 // don't follow symlinks
7465 int r = path_walk(path, &in, perms, false);
7466 if (r < 0)
7467 return r;
7468 struct stat attr;
7469 attr.st_mode = mode;
7470 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7471}
7472
7473int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7474 const UserPerm& perms)
7475{
11fdf7f2
TL
7476 std::lock_guard lock(client_lock);
7477 tout(cct) << __func__ << std::endl;
7c673cae
FG
7478 tout(cct) << relpath << std::endl;
7479 tout(cct) << new_uid << std::endl;
7480 tout(cct) << new_gid << std::endl;
181888fb
FG
7481
7482 if (unmounting)
7483 return -ENOTCONN;
7484
7c673cae
FG
7485 filepath path(relpath);
7486 InodeRef in;
7487 int r = path_walk(path, &in, perms);
7488 if (r < 0)
7489 return r;
7490 struct stat attr;
7491 attr.st_uid = new_uid;
7492 attr.st_gid = new_gid;
181888fb 7493 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7c673cae
FG
7494}
7495
7496int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7497{
11fdf7f2
TL
7498 std::lock_guard lock(client_lock);
7499 tout(cct) << __func__ << std::endl;
7c673cae
FG
7500 tout(cct) << fd << std::endl;
7501 tout(cct) << new_uid << std::endl;
7502 tout(cct) << new_gid << std::endl;
181888fb
FG
7503
7504 if (unmounting)
7505 return -ENOTCONN;
7506
7c673cae
FG
7507 Fh *f = get_filehandle(fd);
7508 if (!f)
7509 return -EBADF;
7510#if defined(__linux__) && defined(O_PATH)
7511 if (f->flags & O_PATH)
7512 return -EBADF;
7513#endif
7514 struct stat attr;
7515 attr.st_uid = new_uid;
7516 attr.st_gid = new_gid;
7517 int mask = 0;
7518 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7519 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7520 return _setattr(f->inode, &attr, mask, perms);
7521}
7522
7523int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7524 const UserPerm& perms)
7525{
11fdf7f2
TL
7526 std::lock_guard lock(client_lock);
7527 tout(cct) << __func__ << std::endl;
7c673cae
FG
7528 tout(cct) << relpath << std::endl;
7529 tout(cct) << new_uid << std::endl;
7530 tout(cct) << new_gid << std::endl;
181888fb
FG
7531
7532 if (unmounting)
7533 return -ENOTCONN;
7534
7c673cae
FG
7535 filepath path(relpath);
7536 InodeRef in;
7537 // don't follow symlinks
7538 int r = path_walk(path, &in, perms, false);
7539 if (r < 0)
7540 return r;
7541 struct stat attr;
7542 attr.st_uid = new_uid;
7543 attr.st_gid = new_gid;
7544 int mask = 0;
7545 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7546 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7547 return _setattr(in, &attr, mask, perms);
7548}
7549
11fdf7f2
TL
// Helper: store atime/mtime into a struct stat via the portable
// stat_set_* accessors (member names differ across platforms).
static void attr_set_atime_and_mtime(struct stat *attr,
                                     const utime_t &atime,
                                     const utime_t &mtime)
{
  stat_set_atime_sec(attr, atime.tv.tv_sec);
  stat_set_atime_nsec(attr, atime.tv.tv_nsec);
  stat_set_mtime_sec(attr, mtime.tv.tv_sec);
  stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
}
7559
7560// for [l]utime() invoke the timeval variant as the timespec
7561// variant are not yet implemented. for futime[s](), invoke
7562// the timespec variant.
7c673cae
FG
7563int Client::utime(const char *relpath, struct utimbuf *buf,
7564 const UserPerm& perms)
7565{
11fdf7f2
TL
7566 struct timeval tv[2];
7567 tv[0].tv_sec = buf->actime;
7568 tv[0].tv_usec = 0;
7569 tv[1].tv_sec = buf->modtime;
7570 tv[1].tv_usec = 0;
7571
7572 return utimes(relpath, tv, perms);
7573}
7574
7575int Client::lutime(const char *relpath, struct utimbuf *buf,
7576 const UserPerm& perms)
7577{
7578 struct timeval tv[2];
7579 tv[0].tv_sec = buf->actime;
7580 tv[0].tv_usec = 0;
7581 tv[1].tv_sec = buf->modtime;
7582 tv[1].tv_usec = 0;
7583
7584 return lutimes(relpath, tv, perms);
7585}
7586
7587int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
7588{
7589 struct timespec ts[2];
7590 ts[0].tv_sec = buf->actime;
7591 ts[0].tv_nsec = 0;
7592 ts[1].tv_sec = buf->modtime;
7593 ts[1].tv_nsec = 0;
7594
7595 return futimens(fd, ts, perms);
7596}
7597
7598int Client::utimes(const char *relpath, struct timeval times[2],
7599 const UserPerm& perms)
7600{
7601 std::lock_guard lock(client_lock);
7602 tout(cct) << __func__ << std::endl;
7c673cae 7603 tout(cct) << relpath << std::endl;
11fdf7f2
TL
7604 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7605 << std::endl;
7606 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7607 << std::endl;
181888fb
FG
7608
7609 if (unmounting)
7610 return -ENOTCONN;
7611
7c673cae
FG
7612 filepath path(relpath);
7613 InodeRef in;
7614 int r = path_walk(path, &in, perms);
7615 if (r < 0)
7616 return r;
7617 struct stat attr;
11fdf7f2
TL
7618 utime_t atime(times[0]);
7619 utime_t mtime(times[1]);
7620
7621 attr_set_atime_and_mtime(&attr, atime, mtime);
7c673cae
FG
7622 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7623}
7624
11fdf7f2
TL
7625int Client::lutimes(const char *relpath, struct timeval times[2],
7626 const UserPerm& perms)
7c673cae 7627{
11fdf7f2
TL
7628 std::lock_guard lock(client_lock);
7629 tout(cct) << __func__ << std::endl;
7c673cae 7630 tout(cct) << relpath << std::endl;
11fdf7f2
TL
7631 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
7632 << std::endl;
7633 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
7634 << std::endl;
181888fb
FG
7635
7636 if (unmounting)
7637 return -ENOTCONN;
7638
7c673cae
FG
7639 filepath path(relpath);
7640 InodeRef in;
7c673cae
FG
7641 int r = path_walk(path, &in, perms, false);
7642 if (r < 0)
7643 return r;
7644 struct stat attr;
11fdf7f2
TL
7645 utime_t atime(times[0]);
7646 utime_t mtime(times[1]);
7647
7648 attr_set_atime_and_mtime(&attr, atime, mtime);
7c673cae
FG
7649 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7650}
7651
11fdf7f2
TL
7652int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
7653{
7654 struct timespec ts[2];
7655 ts[0].tv_sec = times[0].tv_sec;
7656 ts[0].tv_nsec = times[0].tv_usec * 1000;
7657 ts[1].tv_sec = times[1].tv_sec;
7658 ts[1].tv_nsec = times[1].tv_usec * 1000;
7659
7660 return futimens(fd, ts, perms);
7661}
7662
7663int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
7664{
7665 std::lock_guard lock(client_lock);
7666 tout(cct) << __func__ << std::endl;
7667 tout(cct) << fd << std::endl;
7668 tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
7669 << std::endl;
7670 tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
7671 << std::endl;
7672
7673 if (unmounting)
7674 return -ENOTCONN;
7675
7676 Fh *f = get_filehandle(fd);
7677 if (!f)
7678 return -EBADF;
7679#if defined(__linux__) && defined(O_PATH)
7680 if (f->flags & O_PATH)
7681 return -EBADF;
7682#endif
7683 struct stat attr;
7684 utime_t atime(times[0]);
7685 utime_t mtime(times[1]);
7686
7687 attr_set_atime_and_mtime(&attr, atime, mtime);
7688 return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7689}
7690
7c673cae
FG
7691int Client::flock(int fd, int operation, uint64_t owner)
7692{
11fdf7f2
TL
7693 std::lock_guard lock(client_lock);
7694 tout(cct) << __func__ << std::endl;
7c673cae
FG
7695 tout(cct) << fd << std::endl;
7696 tout(cct) << operation << std::endl;
7697 tout(cct) << owner << std::endl;
181888fb
FG
7698
7699 if (unmounting)
7700 return -ENOTCONN;
7701
7c673cae
FG
7702 Fh *f = get_filehandle(fd);
7703 if (!f)
7704 return -EBADF;
7705
7706 return _flock(f, operation, owner);
7707}
7708
7709int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7710{
11fdf7f2
TL
7711 std::lock_guard lock(client_lock);
7712 tout(cct) << __func__ << std::endl;
7c673cae 7713 tout(cct) << relpath << std::endl;
181888fb
FG
7714
7715 if (unmounting)
7716 return -ENOTCONN;
7717
7c673cae
FG
7718 filepath path(relpath);
7719 InodeRef in;
7720 int r = path_walk(path, &in, perms, true);
7721 if (r < 0)
7722 return r;
7723 if (cct->_conf->client_permissions) {
7724 int r = may_open(in.get(), O_RDONLY, perms);
7725 if (r < 0)
7726 return r;
7727 }
7728 r = _opendir(in.get(), dirpp, perms);
7729 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
7730 if (r != -ENOTDIR)
7731 tout(cct) << (unsigned long)*dirpp << std::endl;
7732 return r;
7733}
7734
7735int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7736{
7737 if (!in->is_dir())
7738 return -ENOTDIR;
7739 *dirpp = new dir_result_t(in, perms);
7740 opened_dirs.insert(*dirpp);
11fdf7f2 7741 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7c673cae
FG
7742 return 0;
7743}
7744
7745
7746int Client::closedir(dir_result_t *dir)
7747{
11fdf7f2
TL
7748 std::lock_guard lock(client_lock);
7749 tout(cct) << __func__ << std::endl;
7c673cae
FG
7750 tout(cct) << (unsigned long)dir << std::endl;
7751
11fdf7f2 7752 ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
7c673cae
FG
7753 _closedir(dir);
7754 return 0;
7755}
7756
// Tear down a dir_result_t: drop its inode reference, free any buffered
// dirents, deregister it, and delete it.
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;
  if (dirp->inode) {
    ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
7768
7769void Client::rewinddir(dir_result_t *dirp)
7770{
11fdf7f2
TL
7771 std::lock_guard lock(client_lock);
7772 ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;
181888fb
FG
7773
7774 if (unmounting)
7775 return;
7776
7c673cae
FG
7777 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7778 _readdir_drop_dirp_buffer(d);
7779 d->reset();
7780}
7781
7782loff_t Client::telldir(dir_result_t *dirp)
7783{
7784 dir_result_t *d = static_cast<dir_result_t*>(dirp);
11fdf7f2 7785 ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
7c673cae
FG
7786 return d->offset;
7787}
7788
// seekdir(3) analogue: move the directory iterator to `offset`.  The
// buffered dirents are kept when the target offset still falls inside the
// currently buffered fragment; otherwise the buffer is dropped and the
// iterator reset so the next readdir refetches from the MDS.
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  std::lock_guard lock(client_lock);

  ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;

  if (unmounting)
    return;

  if (offset == dirp->offset)
    return;

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // In hash order only a backward seek invalidates the buffer.
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // In frag order, invalidate when rewinding to the start, jumping to a
    // different fragment, or seeking backward within the buffered frag.
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
7822
7823
//struct dirent {
//  ino_t          d_ino;       /* inode number */
//  off_t          d_off;       /* offset to the next dirent */
//  unsigned short d_reclen;    /* length of this record */
//  unsigned char  d_type;      /* type of file */
//  char           d_name[256]; /* filename */
//};
// Fill a struct dirent from readdir state.  `type` is a mode-style file
// type (converted via IFTODT); `next_off` is the opaque offset of the
// following entry.  Platform ifdefs account for members missing on
// Cygwin/macOS/FreeBSD.
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  // Truncate to the d_name capacity and guarantee NUL termination.
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#ifndef __CYGWIN__
  de->d_ino = ino;
#if !defined(__APPLE__) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  de->d_reclen = 1;
  de->d_type = IFTODT(type);
  ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
	   << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
7846
// Advance the directory iterator to the next fragment, or mark the
// iteration finished if the current fragment is the rightmost one.
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    // In hash order, only move the offset forward -- never decrease it.
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    // In frag order, restart at the beginning of the new fragment and
    // re-resolve it against the current dirfragtree.
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);
  }
}
7872
// Re-resolve the iterator's current fragment against the directory's
// fragment tree (the tree may have changed since the offset was set).
// No-op in hash order, where offsets are independent of the frag tree.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  ceph_assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    // The fragment was split/merged; restart at the mapped fragment.
    ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
7889
// Discard any buffered dirents held by this iterator.
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << " " << dirp << dendl;
  dirp->buffer.clear();
}
7895
// Fetch one directory fragment's worth of entries from the MDS into the
// iterator's buffer.  Uses LSSNAP instead of READDIR when listing the
// snapshot directory.  On -EAGAIN (fragment changed under us) it
// re-resolves the fragment and retries recursively; on other errors the
// iterator is marked finished.
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  ceph_assert(dirp);
  ceph_assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
	   << " offset " << hex << dirp->offset << dec << dendl;

  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  // Build the MDS request for this fragment.
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  if (dirp->last_name.length()) {
    // Resume after the last entry we already returned.
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -EAGAIN) {
    // Fragment mapping changed; re-resolve and retry.
    // NOTE(review): recursion depth is bounded by how often the frag tree
    // can keep changing between retries.
    ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
	     << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
7950
7951struct dentry_off_lt {
7952 bool operator()(const Dentry* dn, int64_t off) const {
7953 return dir_result_t::fpos_cmp(dn->offset, off) < 0;
7954 }
7955};
7956
7957int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
7958 int caps, bool getref)
7959{
11fdf7f2
TL
7960 ceph_assert(client_lock.is_locked());
7961 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
7c673cae
FG
7962 << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
7963 << dendl;
7964 Dir *dir = dirp->inode->dir;
7965
7966 if (!dir) {
7967 ldout(cct, 10) << " dir is empty" << dendl;
7968 dirp->set_end();
7969 return 0;
7970 }
7971
7972 vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
7973 dir->readdir_cache.end(),
7974 dirp->offset, dentry_off_lt());
7975
7976 string dn_name;
7977 while (true) {
7978 if (!dirp->inode->is_complete_and_ordered())
7979 return -EAGAIN;
7980 if (pd == dir->readdir_cache.end())
7981 break;
7982 Dentry *dn = *pd;
7983 if (dn->inode == NULL) {
7984 ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
7985 ++pd;
7986 continue;
7987 }
7988 if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
7989 ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
7990 ++pd;
7991 continue;
7992 }
7993
7994 int r = _getattr(dn->inode, caps, dirp->perms);
7995 if (r < 0)
7996 return r;
7997
7998 struct ceph_statx stx;
7999 struct dirent de;
8000 fill_statx(dn->inode, caps, &stx);
8001
8002 uint64_t next_off = dn->offset + 1;
8003 ++pd;
8004 if (pd == dir->readdir_cache.end())
8005 next_off = dir_result_t::END;
8006
8007 Inode *in = NULL;
8008 fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8009 if (getref) {
8010 in = dn->inode.get();
8011 _ll_get(in);
8012 }
8013
8014 dn_name = dn->name; // fill in name while we have lock
8015
8016 client_lock.Unlock();
8017 r = cb(p, &de, &stx, next_off, in); // _next_ offset
8018 client_lock.Lock();
8019 ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
8020 << " = " << r << dendl;
8021 if (r < 0) {
8022 return r;
8023 }
8024
8025 dirp->offset = next_off;
8026 if (dirp->at_end())
8027 dirp->next_offset = 2;
8028 else
8029 dirp->next_offset = dirp->offset_low();
8030 dirp->last_name = dn_name; // we successfully returned this one; update!
28e407b8 8031 dirp->release_count = 0; // last_name no longer match cache index
7c673cae
FG
8032 if (r > 0)
8033 return r;
8034 }
8035
11fdf7f2 8036 ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
7c673cae
FG
8037 dirp->set_end();
8038 return 0;
8039}
8040
8041int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
8042 unsigned want, unsigned flags, bool getref)
8043{
8044 int caps = statx_to_mask(flags, want);
8045
11fdf7f2 8046 std::lock_guard lock(client_lock);
7c673cae 8047
181888fb
FG
8048 if (unmounting)
8049 return -ENOTCONN;
8050
7c673cae
FG
8051 dir_result_t *dirp = static_cast<dir_result_t*>(d);
8052
11fdf7f2 8053 ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
7c673cae
FG
8054 << dec << " at_end=" << dirp->at_end()
8055 << " hash_order=" << dirp->hash_order() << dendl;
8056
8057 struct dirent de;
8058 struct ceph_statx stx;
8059 memset(&de, 0, sizeof(de));
8060 memset(&stx, 0, sizeof(stx));
8061
8062 InodeRef& diri = dirp->inode;
8063
8064 if (dirp->at_end())
8065 return 0;
8066
8067 if (dirp->offset == 0) {
8068 ldout(cct, 15) << " including ." << dendl;
11fdf7f2 8069 ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
7c673cae
FG
8070 uint64_t next_off = 1;
8071
8072 int r;
8073 r = _getattr(diri, caps, dirp->perms);
8074 if (r < 0)
8075 return r;
8076
8077 fill_statx(diri, caps, &stx);
8078 fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);
8079
8080 Inode *inode = NULL;
8081 if (getref) {
8082 inode = diri.get();
8083 _ll_get(inode);
8084 }
8085
8086 client_lock.Unlock();
8087 r = cb(p, &de, &stx, next_off, inode);
8088 client_lock.Lock();
8089 if (r < 0)
8090 return r;
8091
8092 dirp->offset = next_off;
8093 if (r > 0)
8094 return r;
8095 }
8096 if (dirp->offset == 1) {
8097 ldout(cct, 15) << " including .." << dendl;
8098 uint64_t next_off = 2;
8099 InodeRef in;
11fdf7f2 8100 if (diri->dentries.empty())
7c673cae
FG
8101 in = diri;
8102 else
94b18763 8103 in = diri->get_first_parent()->dir->parent_inode;
7c673cae
FG
8104
8105 int r;
94b18763 8106 r = _getattr(in, caps, dirp->perms);
7c673cae
FG
8107 if (r < 0)
8108 return r;
8109
8110 fill_statx(in, caps, &stx);
8111 fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);
8112
8113 Inode *inode = NULL;
8114 if (getref) {
8115 inode = in.get();
8116 _ll_get(inode);
8117 }
8118
8119 client_lock.Unlock();
8120 r = cb(p, &de, &stx, next_off, inode);
8121 client_lock.Lock();
8122 if (r < 0)
8123 return r;
8124
8125 dirp->offset = next_off;
8126 if (r > 0)
8127 return r;
8128 }
8129
8130 // can we read from our cache?
8131 ldout(cct, 10) << "offset " << hex << dirp->offset << dec
8132 << " snapid " << dirp->inode->snapid << " (complete && ordered) "
8133 << dirp->inode->is_complete_and_ordered()
8134 << " issued " << ccap_string(dirp->inode->caps_issued())
8135 << dendl;
8136 if (dirp->inode->snapid != CEPH_SNAPDIR &&
8137 dirp->inode->is_complete_and_ordered() &&
94b18763 8138 dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
7c673cae
FG
8139 int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
8140 if (err != -EAGAIN)
8141 return err;
8142 }
8143
8144 while (1) {
8145 if (dirp->at_end())
8146 return 0;
8147
8148 bool check_caps = true;
8149 if (!dirp->is_cached()) {
8150 int r = _readdir_get_frag(dirp);
8151 if (r)
8152 return r;
8153 // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
8154 // different than the requested one. (our dirfragtree was outdated)
8155 check_caps = false;
8156 }
8157 frag_t fg = dirp->buffer_frag;
8158
8159 ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
8160 << " offset " << hex << dirp->offset << dendl;
8161
8162 for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
8163 dirp->offset, dir_result_t::dentry_off_lt());
8164 it != dirp->buffer.end();
8165 ++it) {
8166 dir_result_t::dentry &entry = *it;
8167
8168 uint64_t next_off = entry.offset + 1;
8169
8170 int r;
8171 if (check_caps) {
8172 r = _getattr(entry.inode, caps, dirp->perms);
8173 if (r < 0)
8174 return r;
8175 }
8176
8177 fill_statx(entry.inode, caps, &stx);
8178 fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
8179
8180 Inode *inode = NULL;
8181 if (getref) {
8182 inode = entry.inode.get();
8183 _ll_get(inode);
8184 }
8185
8186 client_lock.Unlock();
8187 r = cb(p, &de, &stx, next_off, inode); // _next_ offset
8188 client_lock.Lock();
8189
8190 ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
8191 << " = " << r << dendl;
8192 if (r < 0)
8193 return r;
8194
8195 dirp->offset = next_off;
8196 if (r > 0)
8197 return r;
8198 }
8199
8200 if (dirp->next_offset > 2) {
8201 ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
8202 _readdir_drop_dirp_buffer(dirp);
8203 continue; // more!
8204 }
8205
8206 if (!fg.is_rightmost()) {
8207 // next frag!
8208 _readdir_next_frag(dirp);
8209 continue;
8210 }
8211
8212 if (diri->shared_gen == dirp->start_shared_gen &&
8213 diri->dir_release_count == dirp->release_count) {
8214 if (diri->dir_ordered_count == dirp->ordered_count) {
8215 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
8216 if (diri->dir) {
11fdf7f2 8217 ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
7c673cae
FG
8218 diri->dir->readdir_cache.resize(dirp->cache_index);
8219 }
8220 diri->flags |= I_COMPLETE | I_DIR_ORDERED;
8221 } else {
8222 ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
8223 diri->flags |= I_COMPLETE;
8224 }
8225 }
8226
8227 dirp->set_end();
8228 return 0;
8229 }
8230 ceph_abort();
8231 return 0;
8232}
8233
8234
8235int Client::readdir_r(dir_result_t *d, struct dirent *de)
8236{
8237 return readdirplus_r(d, de, 0, 0, 0, NULL);
8238}
8239
8240/*
8241 * readdirplus_r
8242 *
8243 * returns
8244 * 1 if we got a dirent
8245 * 0 for end of directory
8246 * <0 on error
8247 */
8248
8249struct single_readdir {
8250 struct dirent *de;
8251 struct ceph_statx *stx;
8252 Inode *inode;
8253 bool full;
8254};
8255
8256static int _readdir_single_dirent_cb(void *p, struct dirent *de,
8257 struct ceph_statx *stx, off_t off,
8258 Inode *in)
8259{
8260 single_readdir *c = static_cast<single_readdir *>(p);
8261
8262 if (c->full)
8263 return -1; // already filled this dirent
8264
8265 *c->de = *de;
8266 if (c->stx)
8267 *c->stx = *stx;
8268 c->inode = in;
8269 c->full = true;
8270 return 1;
8271}
8272
8273struct dirent *Client::readdir(dir_result_t *d)
8274{
8275 int ret;
8276 static struct dirent de;
8277 single_readdir sr;
8278 sr.de = &de;
8279 sr.stx = NULL;
8280 sr.inode = NULL;
8281 sr.full = false;
8282
8283 // our callback fills the dirent and sets sr.full=true on first
8284 // call, and returns -1 the second time around.
8285 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
8286 if (ret < -1) {
8287 errno = -ret; // this sucks.
8288 return (dirent *) NULL;
8289 }
8290 if (sr.full) {
8291 return &de;
8292 }
8293 return (dirent *) NULL;
8294}
8295
8296int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
8297 struct ceph_statx *stx, unsigned want,
8298 unsigned flags, Inode **out)
8299{
8300 single_readdir sr;
8301 sr.de = de;
8302 sr.stx = stx;
8303 sr.inode = NULL;
8304 sr.full = false;
8305
8306 // our callback fills the dirent and sets sr.full=true on first
8307 // call, and returns -1 the second time around.
8308 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
8309 if (r < -1)
8310 return r;
8311 if (out)
8312 *out = sr.inode;
8313 if (sr.full)
8314 return 1;
8315 return 0;
8316}
8317
8318
8319/* getdents */
8320struct getdents_result {
8321 char *buf;
8322 int buflen;
8323 int pos;
8324 bool fullent;
8325};
8326
8327static int _readdir_getdent_cb(void *p, struct dirent *de,
8328 struct ceph_statx *stx, off_t off, Inode *in)
8329{
8330 struct getdents_result *c = static_cast<getdents_result *>(p);
8331
8332 int dlen;
8333 if (c->fullent)
8334 dlen = sizeof(*de);
8335 else
8336 dlen = strlen(de->d_name) + 1;
8337
8338 if (c->pos + dlen > c->buflen)
8339 return -1; // doesn't fit
8340
8341 if (c->fullent) {
8342 memcpy(c->buf + c->pos, de, sizeof(*de));
8343 } else {
8344 memcpy(c->buf + c->pos, de->d_name, dlen);
8345 }
8346 c->pos += dlen;
8347 return 0;
8348}
8349
8350int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8351{
8352 getdents_result gr;
8353 gr.buf = buf;
8354 gr.buflen = buflen;
8355 gr.fullent = fullent;
8356 gr.pos = 0;
8357
8358 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8359
8360 if (r < 0) { // some error
8361 if (r == -1) { // buffer ran out of space
8362 if (gr.pos) { // but we got some entries already!
8363 return gr.pos;
8364 } // or we need a larger buffer
8365 return -ERANGE;
8366 } else { // actual error, return it
8367 return r;
8368 }
8369 }
8370 return gr.pos;
8371}
8372
8373
8374/* getdir */
8375struct getdir_result {
8376 list<string> *contents;
8377 int num;
8378};
8379
8380static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8381{
8382 getdir_result *r = static_cast<getdir_result *>(p);
8383
8384 r->contents->push_back(de->d_name);
8385 r->num++;
8386 return 0;
8387}
8388
8389int Client::getdir(const char *relpath, list<string>& contents,
8390 const UserPerm& perms)
8391{
8392 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8393 {
11fdf7f2 8394 std::lock_guard lock(client_lock);
7c673cae
FG
8395 tout(cct) << "getdir" << std::endl;
8396 tout(cct) << relpath << std::endl;
8397 }
8398
8399 dir_result_t *d;
8400 int r = opendir(relpath, &d, perms);
8401 if (r < 0)
8402 return r;
8403
8404 getdir_result gr;
8405 gr.contents = &contents;
8406 gr.num = 0;
8407 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8408
8409 closedir(d);
8410
8411 if (r < 0)
8412 return r;
8413 return gr.num;
8414}
8415
8416
8417/****** file i/o **********/
8418int Client::open(const char *relpath, int flags, const UserPerm& perms,
8419 mode_t mode, int stripe_unit, int stripe_count,
8420 int object_size, const char *data_pool)
8421{
8422 ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
11fdf7f2 8423 std::lock_guard lock(client_lock);
7c673cae
FG
8424 tout(cct) << "open" << std::endl;
8425 tout(cct) << relpath << std::endl;
8426 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
8427
181888fb
FG
8428 if (unmounting)
8429 return -ENOTCONN;
8430
7c673cae
FG
8431 Fh *fh = NULL;
8432
8433#if defined(__linux__) && defined(O_PATH)
8434 /* When the O_PATH is being specified, others flags than O_DIRECTORY
8435 * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
8436 * in kernel (fs/open.c). */
8437 if (flags & O_PATH)
8438 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
8439#endif
8440
8441 filepath path(relpath);
8442 InodeRef in;
8443 bool created = false;
8444 /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
8445 bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
8446 int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));
8447
8448 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
8449 return -EEXIST;
8450
8451#if defined(__linux__) && defined(O_PATH)
8452 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
8453#else
8454 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
8455#endif
8456 return -ELOOP;
8457
8458 if (r == -ENOENT && (flags & O_CREAT)) {
8459 filepath dirpath = path;
8460 string dname = dirpath.last_dentry();
8461 dirpath.pop_dentry();
8462 InodeRef dir;
8463 r = path_walk(dirpath, &dir, perms, true,
8464 cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
8465 if (r < 0)
8466 goto out;
8467 if (cct->_conf->client_permissions) {
8468 r = may_create(dir.get(), perms);
8469 if (r < 0)
8470 goto out;
8471 }
8472 r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
8473 stripe_count, object_size, data_pool, &created, perms);
8474 }
8475 if (r < 0)
8476 goto out;
8477
8478 if (!created) {
8479 // posix says we can only check permissions of existing files
8480 if (cct->_conf->client_permissions) {
8481 r = may_open(in.get(), flags, perms);
8482 if (r < 0)
8483 goto out;
8484 }
8485 }
8486
8487 if (!fh)
8488 r = _open(in.get(), flags, mode, &fh, perms);
8489 if (r >= 0) {
8490 // allocate a integer file descriptor
11fdf7f2 8491 ceph_assert(fh);
7c673cae 8492 r = get_fd();
11fdf7f2 8493 ceph_assert(fd_map.count(r) == 0);
7c673cae
FG
8494 fd_map[r] = fh;
8495 }
8496
8497 out:
8498 tout(cct) << r << std::endl;
8499 ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
8500 return r;
8501}
8502
8503int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
8504{
8505 /* Use default file striping parameters */
8506 return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
8507}
8508
8509int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8510 const UserPerm& perms)
8511{
11fdf7f2
TL
8512 std::lock_guard lock(client_lock);
8513 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
7c673cae 8514
181888fb
FG
8515 if (unmounting)
8516 return -ENOTCONN;
8517
7c673cae
FG
8518 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8519 filepath path(ino);
8520 req->set_filepath(path);
8521
8522 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8523 char f[30];
8524 sprintf(f, "%u", h);
8525 filepath path2(dirino);
8526 path2.push_dentry(string(f));
8527 req->set_filepath2(path2);
8528
8529 int r = make_request(req, perms, NULL, NULL,
8530 rand() % mdsmap->get_num_in_mds());
11fdf7f2 8531 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
7c673cae
FG
8532 return r;
8533}
8534
8535
8536/**
8537 * Load inode into local cache.
8538 *
8539 * If inode pointer is non-NULL, and take a reference on
8540 * the resulting Inode object in one operation, so that caller
8541 * can safely assume inode will still be there after return.
8542 */
1adf2230 8543int Client::_lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
7c673cae 8544{
11fdf7f2 8545 ldout(cct, 8) << __func__ << " enter(" << ino << ")" << dendl;
7c673cae 8546
181888fb
FG
8547 if (unmounting)
8548 return -ENOTCONN;
8549
7c673cae
FG
8550 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
8551 filepath path(ino);
8552 req->set_filepath(path);
8553
8554 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8555 if (r == 0 && inode != NULL) {
8556 vinodeno_t vino(ino, CEPH_NOSNAP);
8557 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11fdf7f2 8558 ceph_assert(p != inode_map.end());
7c673cae
FG
8559 *inode = p->second;
8560 _ll_get(*inode);
8561 }
11fdf7f2 8562 ldout(cct, 8) << __func__ << " exit(" << ino << ") = " << r << dendl;
7c673cae
FG
8563 return r;
8564}
8565
1adf2230
AA
8566int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
8567{
11fdf7f2 8568 std::lock_guard lock(client_lock);
1adf2230
AA
8569 return _lookup_ino(ino, perms, inode);
8570}
7c673cae
FG
8571
8572/**
8573 * Find the parent inode of `ino` and insert it into
8574 * our cache. Conditionally also set `parent` to a referenced
8575 * Inode* if caller provides non-NULL value.
8576 */
1adf2230 8577int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
7c673cae 8578{
11fdf7f2 8579 ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;
7c673cae 8580
7c673cae
FG
8581 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
8582 filepath path(ino->ino);
8583 req->set_filepath(path);
8584
8585 InodeRef target;
8586 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
8587 // Give caller a reference to the parent ino if they provided a pointer.
8588 if (parent != NULL) {
8589 if (r == 0) {
8590 *parent = target.get();
8591 _ll_get(*parent);
11fdf7f2 8592 ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
7c673cae
FG
8593 } else {
8594 *parent = NULL;
8595 }
8596 }
11fdf7f2 8597 ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
7c673cae
FG
8598 return r;
8599}
8600
7c673cae
FG
8601/**
8602 * Populate the parent dentry for `ino`, provided it is
8603 * a child of `parent`.
8604 */
1adf2230 8605int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
7c673cae 8606{
11fdf7f2
TL
8607 ceph_assert(parent->is_dir());
8608 ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;
7c673cae 8609
181888fb
FG
8610 if (unmounting)
8611 return -ENOTCONN;
8612
7c673cae
FG
8613 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
8614 req->set_filepath2(filepath(parent->ino));
8615 req->set_filepath(filepath(ino->ino));
8616 req->set_inode(ino);
8617
8618 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
11fdf7f2 8619 ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
7c673cae
FG
8620 return r;
8621}
8622
1adf2230
AA
8623int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
8624{
11fdf7f2 8625 std::lock_guard lock(client_lock);
1adf2230
AA
8626 return _lookup_name(ino, parent, perms);
8627}
7c673cae 8628
11fdf7f2 8629Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
7c673cae 8630{
11fdf7f2
TL
8631 ceph_assert(in);
8632 Fh *f = new Fh(in, flags, cmode, perms);
7c673cae 8633
11fdf7f2 8634 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
7c673cae
FG
8635
8636 if (in->snapid != CEPH_NOSNAP) {
8637 in->snap_cap_refs++;
8638 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
8639 << ccap_string(in->caps_issued()) << dendl;
8640 }
8641
11fdf7f2 8642 const auto& conf = cct->_conf;
7c673cae
FG
8643 f->readahead.set_trigger_requests(1);
8644 f->readahead.set_min_readahead_size(conf->client_readahead_min);
8645 uint64_t max_readahead = Readahead::NO_LIMIT;
8646 if (conf->client_readahead_max_bytes) {
11fdf7f2 8647 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
7c673cae
FG
8648 }
8649 if (conf->client_readahead_max_periods) {
11fdf7f2 8650 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
7c673cae
FG
8651 }
8652 f->readahead.set_max_readahead_size(max_readahead);
8653 vector<uint64_t> alignments;
8654 alignments.push_back(in->layout.get_period());
8655 alignments.push_back(in->layout.stripe_unit);
8656 f->readahead.set_alignments(alignments);
8657
8658 return f;
8659}
8660
8661int Client::_release_fh(Fh *f)
8662{
8663 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
8664 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
8665 Inode *in = f->inode.get();
11fdf7f2 8666 ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;
7c673cae 8667
b32b8144
FG
8668 in->unset_deleg(f);
8669
7c673cae
FG
8670 if (in->snapid == CEPH_NOSNAP) {
8671 if (in->put_open_ref(f->mode)) {
8672 _flush(in, new C_Client_FlushComplete(this, in));
8673 check_caps(in, 0);
8674 }
8675 } else {
11fdf7f2 8676 ceph_assert(in->snap_cap_refs > 0);
7c673cae
FG
8677 in->snap_cap_refs--;
8678 }
8679
8680 _release_filelocks(f);
8681
8682 // Finally, read any async err (i.e. from flushes)
8683 int err = f->take_async_err();
8684 if (err != 0) {
11fdf7f2 8685 ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
7c673cae
FG
8686 << cpp_strerror(err) << dendl;
8687 } else {
11fdf7f2 8688 ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
7c673cae
FG
8689 }
8690
8691 _put_fh(f);
8692
8693 return err;
8694}
8695
8696void Client::_put_fh(Fh *f)
8697{
8698 int left = f->put();
8699 if (!left) {
8700 delete f;
8701 }
8702}
8703
8704int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
8705 const UserPerm& perms)
8706{
8707 if (in->snapid != CEPH_NOSNAP &&
8708 (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
8709 return -EROFS;
8710 }
8711
8712 // use normalized flags to generate cmode
11fdf7f2
TL
8713 int cflags = ceph_flags_sys2wire(flags);
8714 if (cct->_conf.get_val<bool>("client_force_lazyio"))
8715 cflags |= CEPH_O_LAZY;
8716
8717 int cmode = ceph_flags_to_mode(cflags);
7c673cae
FG
8718 int want = ceph_caps_for_mode(cmode);
8719 int result = 0;
8720
8721 in->get_open_ref(cmode); // make note of pending open, since it effects _wanted_ caps.
8722
b32b8144 8723 if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
7c673cae
FG
8724 // update wanted?
8725 check_caps(in, CHECK_CAPS_NODELAY);
8726 } else {
b32b8144 8727
7c673cae
FG
8728 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
8729 filepath path;
8730 in->make_nosnap_relative_path(path);
8731 req->set_filepath(path);
11fdf7f2 8732 req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
7c673cae
FG
8733 req->head.args.open.mode = mode;
8734 req->head.args.open.pool = -1;
8735 if (cct->_conf->client_debug_getattr_caps)
8736 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
8737 else
8738 req->head.args.open.mask = 0;
8739 req->head.args.open.old_size = in->size; // for O_TRUNC
8740 req->set_inode(in);
8741 result = make_request(req, perms);
b32b8144
FG
8742
8743 /*
8744 * NFS expects that delegations will be broken on a conflicting open,
8745 * not just when there is actual conflicting access to the file. SMB leases
8746 * and oplocks also have similar semantics.
8747 *
8748 * Ensure that clients that have delegations enabled will wait on minimal
8749 * caps during open, just to ensure that other clients holding delegations
8750 * return theirs first.
8751 */
8752 if (deleg_timeout && result == 0) {
8753 int need = 0, have;
8754
8755 if (cmode & CEPH_FILE_MODE_WR)
8756 need |= CEPH_CAP_FILE_WR;
8757 if (cmode & CEPH_FILE_MODE_RD)
8758 need |= CEPH_CAP_FILE_RD;
8759
8760 result = get_caps(in, need, want, &have, -1);
8761 if (result < 0) {
1adf2230 8762 ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
b32b8144
FG
8763 " . Denying open: " <<
8764 cpp_strerror(result) << dendl;
8765 in->put_open_ref(cmode);
8766 } else {
8767 put_cap_ref(in, need);
8768 }
8769 }
7c673cae
FG
8770 }
8771
8772 // success?
8773 if (result >= 0) {
8774 if (fhp)
8775 *fhp = _create_fh(in, flags, cmode, perms);
8776 } else {
8777 in->put_open_ref(cmode);
8778 }
8779
8780 trim_cache();
8781
8782 return result;
8783}
8784
8785int Client::_renew_caps(Inode *in)
8786{
8787 int wanted = in->caps_file_wanted();
8788 if (in->is_any_caps() &&
8789 ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
8790 check_caps(in, CHECK_CAPS_NODELAY);
8791 return 0;
8792 }
8793
8794 int flags = 0;
8795 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
8796 flags = O_RDWR;
8797 else if (wanted & CEPH_CAP_FILE_RD)
8798 flags = O_RDONLY;
8799 else if (wanted & CEPH_CAP_FILE_WR)
8800 flags = O_WRONLY;
8801
8802 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
8803 filepath path;
8804 in->make_nosnap_relative_path(path);
8805 req->set_filepath(path);
8806 req->head.args.open.flags = flags;
8807 req->head.args.open.pool = -1;
8808 if (cct->_conf->client_debug_getattr_caps)
8809 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
8810 else
8811 req->head.args.open.mask = 0;
8812 req->set_inode(in);
8813
8814 // duplicate in case Cap goes away; not sure if that race is a concern?
8815 const UserPerm *pperm = in->get_best_perms();
8816 UserPerm perms;
8817 if (pperm != NULL)
8818 perms = *pperm;
8819 int ret = make_request(req, perms);
8820 return ret;
8821}
8822
8823int Client::close(int fd)
8824{
8825 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
11fdf7f2 8826 std::lock_guard lock(client_lock);
7c673cae
FG
8827 tout(cct) << "close" << std::endl;
8828 tout(cct) << fd << std::endl;
8829
181888fb
FG
8830 if (unmounting)
8831 return -ENOTCONN;
8832
7c673cae
FG
8833 Fh *fh = get_filehandle(fd);
8834 if (!fh)
8835 return -EBADF;
8836 int err = _release_fh(fh);
8837 fd_map.erase(fd);
8838 put_fd(fd);
8839 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
8840 return err;
8841}
8842
8843
8844// ------------
8845// read, write
8846
8847loff_t Client::lseek(int fd, loff_t offset, int whence)
8848{
11fdf7f2 8849 std::lock_guard lock(client_lock);
7c673cae
FG
8850 tout(cct) << "lseek" << std::endl;
8851 tout(cct) << fd << std::endl;
8852 tout(cct) << offset << std::endl;
8853 tout(cct) << whence << std::endl;
8854
181888fb
FG
8855 if (unmounting)
8856 return -ENOTCONN;
8857
7c673cae
FG
8858 Fh *f = get_filehandle(fd);
8859 if (!f)
8860 return -EBADF;
8861#if defined(__linux__) && defined(O_PATH)
8862 if (f->flags & O_PATH)
8863 return -EBADF;
8864#endif
8865 return _lseek(f, offset, whence);
8866}
8867
8868loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
8869{
8870 Inode *in = f->inode.get();
8871 int r;
11fdf7f2 8872 loff_t pos = -1;
7c673cae
FG
8873
8874 switch (whence) {
8875 case SEEK_SET:
11fdf7f2 8876 pos = offset;
7c673cae
FG
8877 break;
8878
8879 case SEEK_CUR:
11fdf7f2 8880 pos += offset;
7c673cae
FG
8881 break;
8882
8883 case SEEK_END:
8884 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
8885 if (r < 0)
8886 return r;
11fdf7f2 8887 pos = in->size + offset;
7c673cae
FG
8888 break;
8889
8890 default:
8891 ceph_abort();
8892 }
8893
11fdf7f2
TL
8894 if (pos < 0) {
8895 return -EINVAL;
8896 } else {
8897 f->pos = pos;
8898 }
8899
1adf2230 8900 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
7c673cae
FG
8901 return f->pos;
8902}
8903
8904
8905void Client::lock_fh_pos(Fh *f)
8906{
11fdf7f2 8907 ldout(cct, 10) << __func__ << " " << f << dendl;
7c673cae
FG
8908
8909 if (f->pos_locked || !f->pos_waiters.empty()) {
8910 Cond cond;
8911 f->pos_waiters.push_back(&cond);
11fdf7f2 8912 ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
7c673cae
FG
8913 while (f->pos_locked || f->pos_waiters.front() != &cond)
8914 cond.Wait(client_lock);
11fdf7f2
TL
8915 ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
8916 ceph_assert(f->pos_waiters.front() == &cond);
7c673cae
FG
8917 f->pos_waiters.pop_front();
8918 }
8919
8920 f->pos_locked = true;
8921}
8922
8923void Client::unlock_fh_pos(Fh *f)
8924{
11fdf7f2 8925 ldout(cct, 10) << __func__ << " " << f << dendl;
7c673cae
FG
8926 f->pos_locked = false;
8927}
8928
// Migrate an inode's inline data out to its first RADOS object so regular
// object I/O can proceed.  onfinish is completed when the write is durable
// (or immediately with 0 if there is no inline data).  Always returns 0;
// the real outcome is delivered through onfinish.
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  // object 0 of the file: "<ino-hex>.00000000"
  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  // Fire-and-forget create (exclusive=false) so the cmpxattr/write mutation
  // below has an object to land on.
  ObjectOperation create_ops;
  create_ops.create(false);

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  encode(in->inline_version, inline_version_bl);

  // Guard the migration with a version compare: only write if our
  // inline_version is newer than what is recorded on the object, so a
  // stale racer cannot clobber a newer migration.
  ObjectOperation uninline_ops;
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
8973
8974//
8975
8976// blocking osd interface
8977
// Public fd-based read entry point (blocking).  Reads up to `size` bytes
// at `offset` (or at the fd's current position if offset < 0, handled by
// _read) into `buf`.  Returns bytes read or a negative errno.
int Client::read(int fd, char *buf, loff_t size, loff_t offset)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "read" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles permit no data I/O, mirroring Linux semantics.
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  bufferlist bl;
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _read(f, offset, size, &bl);
  ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
  if (r >= 0) {
    // copy out of the bufferlist; the actual byte count is bl.length()
    bl.copy(0, bl.length(), buf);
    r = bl.length();
  }
  return r;
}
9007
9008int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9009{
9010 if (iovcnt < 0)
9011 return -EINVAL;
9012 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9013}
9014
// Core read path.  Handles fd-position reads (offset < 0), inline data,
// cached (async) vs. sync reads, and short-read retry after a size
// recheck.  Returns bytes read or a negative errno.
//
// Locking: called with client_lock held; may drop/retake it while waiting
// for the uninline operation to complete.
int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  int want, have = 0;
  bool movepos = false;                 // true if we must advance f->pos at the end
  std::unique_ptr<C_SaferCond> onuninline;
  int64_t r = 0;
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();
  utime_t lat;
  utime_t start = ceph_clock_now();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  // offset < 0 means "read from the fd's current position"; take the
  // per-Fh pos lock so concurrent pos users serialize.
  if (offset < 0) {
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  // inline_version == 0 means we don't yet know the inline state; fetch it.
  if (in->inline_version == 0) {
    r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      goto done;
    }
    ceph_assert(in->inline_version > 0);
  }

retry:
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_CACHE;
  r = get_caps(in, CEPH_CAP_FILE_RD, want, &have, -1);
  if (r < 0) {
    goto done;
  }
  // O_DIRECT bypasses the object cache even if we hold cache caps.
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // Can't serve inline data without the cache cap: kick off migration
      // to a real object and fall through to the normal read paths below;
      // we wait for the migration in the done: block.
      onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // Serve directly from the in-memory inline data, zero-filling the
      // gap between the inline length and EOF.
      uint32_t len = in->inline_data.length();
      uint64_t endoff = offset + size;
      if (endoff > in->size)
        endoff = in->size;

      if (offset < len) {
        if (endoff <= len) {
          bl->substr_of(in->inline_data, offset, endoff - offset);
        } else {
          bl->substr_of(in->inline_data, offset, len - offset);
          bl->append_zero(endoff - len);
        }
        r = endoff - offset;
      } else if ((uint64_t)offset < endoff) {
        bl->append_zero(endoff - offset);
        r = endoff - offset;
      } else {
        r = 0;
      }
      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      conf->client_oc &&
      (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {

    // Cached path.  O_RSYNC forces dirty data in the range to disk first.
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    r = _read_async(f, offset, size, bl);
    if (r < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    r = _read_sync(f, offset, size, bl, &checkeof);
    if (r < 0)
      goto done;
    if (checkeof) {
      // Short read: release caps, re-fetch the size from the MDS, and if
      // the file actually extends past where we stopped, retry the rest.
      offset += r;
      size -= r;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
      if (r < 0)
        goto done;

      // eof? short read.
      if ((uint64_t)offset < in->size)
        goto retry;
    }
  }

success:
  ceph_assert(r >= 0);
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + r;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_read, lat);

done:
  // done!

  if (onuninline) {
    // Wait for the inline-data migration started above; drop client_lock
    // while blocking.  -ECANCELED means someone else already migrated it.
    client_lock.Unlock();
    int ret = onuninline->wait();
    client_lock.Lock();
    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }
  if (have) {
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  }
  if (movepos) {
    unlock_fh_pos(f);
  }
  return r;
}
9154
// Completion context for background readahead.  Pins the Fh (f->get) and
// the readahead pending count for the lifetime of the in-flight read.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}

// Drop the pending count and the Fh reference taken in the constructor.
Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}

// Called when the readahead I/O completes: release the cap refs taken by
// _read_async when it initiated the readahead.
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
}
9170
// Read through the object cache, then (best-effort) kick off background
// readahead for the following extent.  Returns bytes read or a negative
// errno.  Caller must hold client_lock; it is dropped while blocking on a
// cache miss.
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, &onfinish);
  if (r == 0) {
    // file_read returned 0: data not fully cached; hold the CACHE cap ref
    // and wait for the cache to fill, dropping client_lock meanwhile.
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.Unlock();
    r = onfinish.wait();
    client_lock.Lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      // NULL bl: populate the cache only, nobody consumes the bytes here.
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	// In flight: take cap refs released by C_Readahead::finish.
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	// Already fully cached: the context will never be called; free it.
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
9225
9226int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
9227 bool *checkeof)
9228{
9229 Inode *in = f->inode.get();
9230 uint64_t pos = off;
9231 int left = len;
9232 int read = 0;
9233
11fdf7f2 9234 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
7c673cae
FG
9235
9236 Mutex flock("Client::_read_sync flock");
9237 Cond cond;
9238 while (left > 0) {
11fdf7f2 9239 C_SaferCond onfinish("Client::_read_sync flock");
7c673cae
FG
9240 bufferlist tbl;
9241
9242 int wanted = left;
9243 filer->read_trunc(in->ino, &in->layout, in->snapid,
9244 pos, left, &tbl, 0,
9245 in->truncate_size, in->truncate_seq,
11fdf7f2 9246 &onfinish);
7c673cae 9247 client_lock.Unlock();
11fdf7f2 9248 int r = onfinish.wait();
7c673cae
FG
9249 client_lock.Lock();
9250
9251 // if we get ENOENT from OSD, assume 0 bytes returned
9252 if (r == -ENOENT)
9253 r = 0;
9254 if (r < 0)
9255 return r;
9256 if (tbl.length()) {
9257 r = tbl.length();
9258
9259 read += r;
9260 pos += r;
9261 left -= r;
9262 bl->claim_append(tbl);
9263 }
9264 // short read?
9265 if (r >= 0 && r < wanted) {
9266 if (pos < in->size) {
9267 // zero up to known EOF
9268 int64_t some = in->size - pos;
9269 if (some > left)
9270 some = left;
11fdf7f2
TL
9271 auto z = buffer::ptr_node::create(some);
9272 z->zero();
9273 bl->push_back(std::move(z));
7c673cae
FG
9274 read += some;
9275 pos += some;
9276 left -= some;
9277 if (left == 0)
9278 return read;
9279 }
9280
9281 *checkeof = true;
9282 return read;
9283 }
9284 }
9285 return read;
9286}
9287
9288
9289/*
9290 * we keep count of uncommitted sync writes on the inode, so that
9291 * fsync can DDRT.
9292 */
void Client::_sync_write_commit(Inode *in)
{
  // One in-flight synchronous write has committed: drop its accounting
  // and the BUFFER cap ref taken when the write was issued.
  ceph_assert(unsafe_sync_write > 0);
  unsafe_sync_write--;

  put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

  ldout(cct, 15) << __func__ << " unsafe_sync_write = " << unsafe_sync_write << dendl;
  // An unmount may be parked waiting for the last unsafe write to drain.
  if (unsafe_sync_write == 0 && unmounting) {
    ldout(cct, 10) << __func__ << " -- no more unsafe writes, unmount can proceed" << dendl;
    mount_cond.Signal();
  }
}
9306
// Public fd-based write entry point (blocking).  Writes up to `size`
// bytes from `buf` at `offset` (fd position if offset < 0, handled by
// _write).  Returns bytes written or a negative errno.
int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "write" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles permit no data I/O, mirroring Linux semantics.
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _write(fh, offset, size, buf, NULL, false);
  ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
  return r;
}
9331
9332int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9333{
9334 if (iovcnt < 0)
9335 return -EINVAL;
9336 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9337}
9338
11fdf7f2
TL
// Shared scatter/gather read/write implementation.  Caller must already
// hold client_lock and have validated fh.  Sums the iovec lengths,
// optionally clamps the total to INT_MAX (for APIs that return int), then
// dispatches to _write (iov passed through) or _read (read into a single
// bufferlist, then scattered back across the iovecs).
// Returns bytes transferred or a negative errno.
int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
				   unsigned iovcnt, int64_t offset, bool write,
				   bool clamp_to_int)
{
#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }

  /*
   * Some of the API functions take 64-bit size values, but only return
   * 32-bit signed integers. Clamp the I/O sizes in those functions so that
   * we don't do I/Os larger than the values we can return.
   */
  if (clamp_to_int) {
    totallen = std::min(totallen, (loff_t)INT_MAX);
  }
  if (write) {
    int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int64_t r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // Scatter the bufferlist back into the caller's iovecs; a partial read
    // may fill only a prefix of the vector.
    int bufoff = 0;
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist does not have enough data
       * to fill in the iov
       */
      if (resid < iov[j].iov_len) {
	bl.copy(bufoff, resid, (char *)iov[j].iov_base);
	break;
      } else {
	bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
      }
      resid -= iov[j].iov_len;
      bufoff += iov[j].iov_len;
    }
    return r;
  }
}
9389
11fdf7f2
TL
// fd-level wrapper for preadv/pwritev: take the client lock, resolve the
// fd to an Fh, then run the locked scatter/gather path with INT_MAX
// clamping enabled (these APIs return int).
int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
{
  std::lock_guard lock(client_lock);
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -EBADF;
  return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
}
9404
// Core write path.  Takes either a flat buffer (buf) or an iovec array
// (iov/iovcnt), handles O_APPEND/fd-position writes, quota checks, inline
// data, buffered (object cache) vs. synchronous writes, and size/mtime
// metadata updates.  Returns bytes written or a negative errno.
//
// Locking: called with client_lock held; may drop/retake it while waiting
// for sync writes or inline-data migration.
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	                const struct iovec *iov, int iovcnt)
{
  uint64_t fpos = 0;

  // NOTE(review): when offset < 0 (use-fd-position mode) this check runs
  // before the real offset is resolved below, so (uint64_t)(offset+size)
  // wraps to a huge value — verify whether the max_filesize check should
  // be re-done after the fd position is known.
  if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
    return -EFBIG;

  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
  Inode *in = f->inode.get();

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -ENOSPC;
  }

  ceph_assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      int r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
	unlock_fh_pos(f);
	return r;
      }
    }
    offset = f->pos;
    fpos = offset+size;    // new fd position, applied after a successful write
    unlock_fh_pos(f);
  }

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
						   f->actor_perms)) {
    return -EDQUOT;
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  // inline_version == 0 means we don't yet know the inline state; fetch it.
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    ceph_assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
	bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int want, have;
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_BUFFER;
  // AUTH_SHARED is needed only to inspect the mode bits just below.
  int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
  if (r < 0)
    return r;

  /* clear the setuid/setgid bits, if any */
  if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
    struct ceph_statx stx = { 0 };

    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
    r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
    if (r < 0)
      return r;
  } else {
    put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  }

  // O_DIRECT bypasses the object cache even if we hold buffer caps.
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // Write no longer fits inline (or we lack the cap to mutate it):
      // migrate to a real object and do a normal write below; the wait
      // happens in the done: block.
      onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      // Splice the new bytes into the in-memory inline data:
      // keep the tail past endoff, drop [offset, len), zero-fill any gap,
      // then append the new data (and the preserved tail).
      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.copy(endoff, len - endoff, bl);

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc &&
      (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    C_SaferCond onfinish("Client::_write flock");
    unsafe_sync_write++;
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);  // released by onsafe callback

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       &onfinish);
    client_lock.Unlock();
    onfinish.wait();
    client_lock.Lock();
    _sync_write_commit(in);
  }

  // if we get here, write was successful, update client metadata
success:
  // time
  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_wrlat, lat);

  // Advance the fd position only now that the write succeeded.
  if (fpos) {
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int64_t)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  if (nullptr != onuninline) {
    // Wait for the inline-data migration started above; drop client_lock
    // while blocking.  -ECANCELED means someone else already migrated it.
    client_lock.Unlock();
    int uninline_ret = onuninline->wait();
    client_lock.Lock();

    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
9635
9636int Client::_flush(Fh *f)
9637{
9638 Inode *in = f->inode.get();
9639 int err = f->take_async_err();
9640 if (err != 0) {
9641 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9642 << cpp_strerror(err) << dendl;
9643 } else {
9644 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9645 }
9646
9647 return err;
9648}
9649
// Path-based truncate: expressed as a size-only setattrx.  Only stx_size
// is read by setattrx because the mask is CEPH_SETATTR_SIZE.
int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
{
  struct ceph_statx stx;
  stx.stx_size = length;
  return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
}
9656
// fd-based truncate: resolve the fd and apply a size-only _setattr.
int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << length << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles permit no data I/O, mirroring Linux semantics.
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  // Only st_size is read by _setattr because the mask is CEPH_SETATTR_SIZE.
  struct stat attr;
  attr.st_size = length;
  return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
}
9678
// fd-based fsync.  On success, also surfaces any asynchronous write error
// recorded on the handle; on failure, clears that state so the same error
// is not reported twice.
int Client::fsync(int fd, bool syncdataonly)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "fsync" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << syncdataonly << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles permit no data I/O, mirroring Linux semantics.
  if (f->flags & O_PATH)
    return -EBADF;
#endif
  int r = _fsync(f, syncdataonly);
  if (r == 0) {
    // The IOs in this fsync were okay, but maybe something happened
    // in the background that we shoudl be reporting?
    r = f->take_async_err();
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
		  << ") = 0, async_err = " << r << dendl;
  } else {
    // Assume that an error we encountered during fsync, even reported
    // synchronously, would also have applied the error to the Fh, and we
    // should clear it here to avoid returning the same error again on next
    // call.
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
		  << r << dendl;
    f->take_async_err();
  }
  return r;
}
9714
// Inode-level fsync.  Flushes dirty data through the object cache (when
// enabled), flushes dirty caps and waits for them, and waits for any
// unsafe MDS requests to become safe when metadata sync was requested.
// Returns 0 or a negative errno from the data flush.
//
// Locking: called with client_lock held; dropped while waiting for the
// object cacher flush.
int Client::_fsync(Inode *in, bool syncdataonly)
{
  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // Push dirty caps to the MDS synchronously and remember the flush tid
    // so we can wait for its acknowledgement below.
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    flush_mdlog_sync();

    // Requests complete in order, so waiting on the newest unsafe request
    // covers all earlier ones too.
    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.Unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.Lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_fsync, lat);

  return r;
}
9781
9782int Client::_fsync(Fh *f, bool syncdataonly)
9783{
1adf2230 9784 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
7c673cae
FG
9785 return _fsync(f->inode.get(), syncdataonly);
9786}
9787
// fd-based stat: refresh attributes from the MDS per `mask`, then fill the
// caller's struct stat.  Returns 0 or a negative errno.
int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;
  int r = _getattr(f->inode, mask, perms);
  if (r < 0)
    return r;
  fill_stat(f->inode, stbuf, NULL);
  ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
  return r;
}
9807
// fd-based statx.  Skips the MDS round-trip entirely when the locally
// cached caps already cover every field the caller asked for.
int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
		   unsigned int want, unsigned int flags)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
  tout(cct) << fd << std::endl;

  if (unmounting)
    return -ENOTCONN;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -EBADF;

  unsigned mask = statx_to_mask(flags, want);

  int r = 0;
  // Only hit the MDS if some requested field is not covered by caps we
  // already hold.
  if (mask && !f->inode->caps_issued_mask(mask, true)) {
    r = _getattr(f->inode, mask, perms);
    if (r < 0) {
      ldout(cct, 3) << "fstatx exit on error!" << dendl;
      return r;
    }
  }

  fill_statx(f->inode, mask, stx);
  ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
  return r;
}
9837
9838// not written yet, but i want to link!
9839
// Change the client's working directory to relpath and return the new
// absolute cwd in new_cwd.  Returns 0 or a negative errno from path_walk.
int Client::chdir(const char *relpath, std::string &new_cwd,
		  const UserPerm& perms)
{
  std::lock_guard lock(client_lock);
  tout(cct) << "chdir" << std::endl;
  tout(cct) << relpath << std::endl;

  if (unmounting)
    return -ENOTCONN;

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  // swap keeps refcounts balanced: the old cwd ref is released when `in`
  // goes out of scope.
  if (cwd != in)
    cwd.swap(in);
  ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;

  _getcwd(new_cwd, perms);
  return 0;
}
9862
// Build the absolute path of the current working directory by walking
// parent dentries up to the mount root.  If an ancestor's dentry is not
// cached, issue a LOOKUPNAME to the MDS and restart the walk.  On an
// unlinked cwd/ancestor the output string is left untouched.
// Caller must hold client_lock.
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << __func__ << " " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root) {
    ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked

    // A cwd or ancester is unlinked
    if (in->dentries.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over
      path = filepath();
      in = cwd.get();
      continue;
    }
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
9902
b5b8bbf5
FG
9903void Client::getcwd(string& dir, const UserPerm& perms)
9904{
11fdf7f2 9905 std::lock_guard l(client_lock);
181888fb
FG
9906 if (!unmounting)
9907 _getcwd(dir, perms);
b5b8bbf5
FG
9908}
9909
7c673cae
FG
9910int Client::statfs(const char *path, struct statvfs *stbuf,
9911 const UserPerm& perms)
9912{
11fdf7f2
TL
9913 std::lock_guard l(client_lock);
9914 tout(cct) << __func__ << std::endl;
91327a77 9915 unsigned long int total_files_on_fs;
7c673cae 9916
181888fb
FG
9917 if (unmounting)
9918 return -ENOTCONN;
9919
7c673cae
FG
9920 ceph_statfs stats;
9921 C_SaferCond cond;
d2e6a577
FG
9922
9923 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
9924 if (data_pools.size() == 1) {
9925 objecter->get_fs_stats(stats, data_pools[0], &cond);
9926 } else {
9927 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
9928 }
7c673cae
FG
9929
9930 client_lock.Unlock();
9931 int rval = cond.wait();
91327a77
AA
9932 assert(root);
9933 total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;
7c673cae
FG
9934 client_lock.Lock();
9935
9936 if (rval < 0) {
9937 ldout(cct, 1) << "underlying call to statfs returned error: "
9938 << cpp_strerror(rval)
9939 << dendl;
9940 return rval;
9941 }
9942
9943 memset(stbuf, 0, sizeof(*stbuf));
9944
9945 /*
9946 * we're going to set a block size of 4MB so we can represent larger
9947 * FSes without overflowing. Additionally convert the space
9948 * measurements from KB to bytes while making them in terms of
9949 * blocks. We use 4MB only because it is big enough, and because it
9950 * actually *is* the (ceph) default block size.
9951 */
9952 const int CEPH_BLOCK_SHIFT = 22;
9953 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
9954 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
91327a77
AA
9955 stbuf->f_files = total_files_on_fs;
9956 stbuf->f_ffree = 0;
7c673cae
FG
9957 stbuf->f_favail = -1;
9958 stbuf->f_fsid = -1; // ??
9959 stbuf->f_flag = 0; // ??
9960 stbuf->f_namemax = NAME_MAX;
9961
9962 // Usually quota_root will == root_ancestor, but if the mount root has no
9963 // quota but we can see a parent of it that does have a quota, we'll
9964 // respect that one instead.
11fdf7f2 9965 ceph_assert(root != nullptr);
7c673cae
FG
9966 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
9967
9968 // get_quota_root should always give us something
9969 // because client quotas are always enabled
11fdf7f2 9970 ceph_assert(quota_root != nullptr);
7c673cae
FG
9971
9972 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
9973
9974 // Skip the getattr if any sessions are stale, as we don't want to
9975 // block `df` if this client has e.g. been evicted, or if the MDS cluster
9976 // is unhealthy.
9977 if (!_any_stale_sessions()) {
9978 int r = _getattr(quota_root, 0, perms, true);
9979 if (r != 0) {
9980 // Ignore return value: error getting latest inode metadata is not a good
9981 // reason to break "df".
9982 lderr(cct) << "Error in getattr on quota root 0x"
9983 << std::hex << quota_root->ino << std::dec
9984 << " statfs result may be outdated" << dendl;
9985 }
9986 }
9987
9988 // Special case: if there is a size quota set on the Inode acting
9989 // as the root for this client mount, then report the quota status
9990 // as the filesystem statistics.
9991 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
9992 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
31f18b77
FG
9993 // It is possible for a quota to be exceeded: arithmetic here must
9994 // handle case where used > total.
9995 const fsblkcnt_t free = total > used ? total - used : 0;
7c673cae
FG
9996
9997 stbuf->f_blocks = total;
9998 stbuf->f_bfree = free;
9999 stbuf->f_bavail = free;
10000 } else {
d2e6a577 10001 // General case: report the cluster statistics returned from RADOS. Because
7c673cae
FG
10002 // multiple pools may be used without one filesystem namespace via
10003 // layouts, this is the most correct thing we can do.
10004 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
10005 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10006 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
10007 }
10008
10009 return rval;
10010}
10011
10012int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
10013 struct flock *fl, uint64_t owner, bool removing)
10014{
11fdf7f2 10015 ldout(cct, 10) << __func__ << " ino " << in->ino
7c673cae
FG
10016 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
10017 << " type " << fl->l_type << " owner " << owner
10018 << " " << fl->l_start << "~" << fl->l_len << dendl;
10019
10020 int lock_cmd;
10021 if (F_RDLCK == fl->l_type)
10022 lock_cmd = CEPH_LOCK_SHARED;
10023 else if (F_WRLCK == fl->l_type)
10024 lock_cmd = CEPH_LOCK_EXCL;
10025 else if (F_UNLCK == fl->l_type)
10026 lock_cmd = CEPH_LOCK_UNLOCK;
10027 else
10028 return -EIO;
10029
10030 if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
10031 sleep = 0;
10032
10033 /*
10034 * Set the most significant bit, so that MDS knows the 'owner'
10035 * is sufficient to identify the owner of lock. (old code uses
10036 * both 'owner' and 'pid')
10037 */
10038 owner |= (1ULL << 63);
10039
10040 MetaRequest *req = new MetaRequest(op);
10041 filepath path;
10042 in->make_nosnap_relative_path(path);
10043 req->set_filepath(path);
10044 req->set_inode(in);
10045
10046 req->head.args.filelock_change.rule = lock_type;
10047 req->head.args.filelock_change.type = lock_cmd;
10048 req->head.args.filelock_change.owner = owner;
10049 req->head.args.filelock_change.pid = fl->l_pid;
10050 req->head.args.filelock_change.start = fl->l_start;
10051 req->head.args.filelock_change.length = fl->l_len;
10052 req->head.args.filelock_change.wait = sleep;
10053
10054 int ret;
10055 bufferlist bl;
10056
10057 if (sleep && switch_interrupt_cb) {
10058 // enable interrupt
10059 switch_interrupt_cb(callback_handle, req->get());
10060 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
7c673cae
FG
10061 // disable interrupt
10062 switch_interrupt_cb(callback_handle, NULL);
31f18b77
FG
10063 if (ret == 0 && req->aborted()) {
10064 // effect of this lock request has been revoked by the 'lock intr' request
10065 ret = req->get_abort_code();
10066 }
7c673cae
FG
10067 put_request(req);
10068 } else {
10069 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
10070 }
10071
10072 if (ret == 0) {
10073 if (op == CEPH_MDS_OP_GETFILELOCK) {
10074 ceph_filelock filelock;
11fdf7f2
TL
10075 auto p = bl.cbegin();
10076 decode(filelock, p);
7c673cae
FG
10077
10078 if (CEPH_LOCK_SHARED == filelock.type)
10079 fl->l_type = F_RDLCK;
10080 else if (CEPH_LOCK_EXCL == filelock.type)
10081 fl->l_type = F_WRLCK;
10082 else
10083 fl->l_type = F_UNLCK;
10084
10085 fl->l_whence = SEEK_SET;
10086 fl->l_start = filelock.start;
10087 fl->l_len = filelock.length;
10088 fl->l_pid = filelock.pid;
10089 } else if (op == CEPH_MDS_OP_SETFILELOCK) {
10090 ceph_lock_state_t *lock_state;
10091 if (lock_type == CEPH_LOCK_FCNTL) {
10092 if (!in->fcntl_locks)
11fdf7f2
TL
10093 in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10094 lock_state = in->fcntl_locks.get();
7c673cae
FG
10095 } else if (lock_type == CEPH_LOCK_FLOCK) {
10096 if (!in->flock_locks)
11fdf7f2
TL
10097 in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10098 lock_state = in->flock_locks.get();
7c673cae
FG
10099 } else {
10100 ceph_abort();
10101 return -EINVAL;
10102 }
10103 _update_lock_state(fl, owner, lock_state);
10104
10105 if (!removing) {
10106 if (lock_type == CEPH_LOCK_FCNTL) {
10107 if (!fh->fcntl_locks)
11fdf7f2
TL
10108 fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
10109 lock_state = fh->fcntl_locks.get();
7c673cae
FG
10110 } else {
10111 if (!fh->flock_locks)
11fdf7f2
TL
10112 fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
10113 lock_state = fh->flock_locks.get();
7c673cae
FG
10114 }
10115 _update_lock_state(fl, owner, lock_state);
10116 }
10117 } else
10118 ceph_abort();
10119 }
10120 return ret;
10121}
10122
10123int Client::_interrupt_filelock(MetaRequest *req)
10124{
31f18b77
FG
10125 // Set abort code, but do not kick. The abort code prevents the request
10126 // from being re-sent.
10127 req->abort(-EINTR);
10128 if (req->mds < 0)
10129 return 0; // haven't sent the request
10130
7c673cae
FG
10131 Inode *in = req->inode();
10132
10133 int lock_type;
10134 if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
10135 lock_type = CEPH_LOCK_FLOCK_INTR;
10136 else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
10137 lock_type = CEPH_LOCK_FCNTL_INTR;
10138 else {
10139 ceph_abort();
10140 return -EINVAL;
10141 }
10142
10143 MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
10144 filepath path;
10145 in->make_nosnap_relative_path(path);
10146 intr_req->set_filepath(path);
10147 intr_req->set_inode(in);
10148 intr_req->head.args.filelock_change = req->head.args.filelock_change;
10149 intr_req->head.args.filelock_change.rule = lock_type;
10150 intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
10151
10152 UserPerm perms(req->get_uid(), req->get_gid());
10153 return make_request(intr_req, perms, NULL, NULL, -1);
10154}
10155
// Serialize all currently-held fcntl and flock locks on 'in' into 'bl':
// a count followed by the locks, fcntl locks first. Emits nothing at all
// when the inode has never tracked either lock kind.
void Client::_encode_filelocks(Inode *in, bufferlist& bl)
{
  if (!in->fcntl_locks && !in->flock_locks)
    return;

  unsigned nr_fcntl_locks =
    in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
  encode(nr_fcntl_locks, bl);
  if (nr_fcntl_locks) {
    for (const auto &entry : in->fcntl_locks->held_locks)
      encode(entry.second, bl);
  }

  unsigned nr_flock_locks =
    in->flock_locks ? in->flock_locks->held_locks.size() : 0;
  encode(nr_flock_locks, bl);
  if (nr_flock_locks) {
    for (const auto &entry : in->flock_locks->held_locks)
      encode(entry.second, bl);
  }

  ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
		 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
}
10184
10185void Client::_release_filelocks(Fh *fh)
10186{
10187 if (!fh->fcntl_locks && !fh->flock_locks)
10188 return;
10189
10190 Inode *in = fh->inode.get();
11fdf7f2 10191 ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;
7c673cae
FG
10192
10193 list<pair<int, ceph_filelock> > to_release;
10194
10195 if (fh->fcntl_locks) {
11fdf7f2 10196 auto &lock_state = fh->fcntl_locks;
7c673cae
FG
10197 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10198 p != lock_state->held_locks.end();
10199 ++p)
10200 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
11fdf7f2 10201 lock_state.reset();
7c673cae
FG
10202 }
10203 if (fh->flock_locks) {
11fdf7f2 10204 auto &lock_state = fh->flock_locks;
7c673cae
FG
10205 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
10206 p != lock_state->held_locks.end();
10207 ++p)
10208 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
11fdf7f2 10209 lock_state.reset();
7c673cae
FG
10210 }
10211
10212 if (to_release.empty())
10213 return;
10214
11fdf7f2
TL
10215 // mds has already released filelocks if session was closed.
10216 if (in->caps.empty())
10217 return;
10218
7c673cae
FG
10219 struct flock fl;
10220 memset(&fl, 0, sizeof(fl));
10221 fl.l_whence = SEEK_SET;
10222 fl.l_type = F_UNLCK;
10223
10224 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
10225 p != to_release.end();
10226 ++p) {
10227 fl.l_start = p->second.start;
10228 fl.l_len = p->second.length;
10229 fl.l_pid = p->second.pid;
10230 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
10231 p->second.owner, true);
10232 }
10233}
10234
10235void Client::_update_lock_state(struct flock *fl, uint64_t owner,
10236 ceph_lock_state_t *lock_state)
10237{
10238 int lock_cmd;
10239 if (F_RDLCK == fl->l_type)
10240 lock_cmd = CEPH_LOCK_SHARED;
10241 else if (F_WRLCK == fl->l_type)
10242 lock_cmd = CEPH_LOCK_EXCL;
10243 else
10244 lock_cmd = CEPH_LOCK_UNLOCK;;
10245
10246 ceph_filelock filelock;
10247 filelock.start = fl->l_start;
10248 filelock.length = fl->l_len;
10249 filelock.client = 0;
10250 // see comment in _do_filelock()
10251 filelock.owner = owner | (1ULL << 63);
10252 filelock.pid = fl->l_pid;
10253 filelock.type = lock_cmd;
10254
10255 if (filelock.type == CEPH_LOCK_UNLOCK) {
10256 list<ceph_filelock> activated_locks;
10257 lock_state->remove_lock(filelock, activated_locks);
10258 } else {
10259 bool r = lock_state->add_lock(filelock, false, false, NULL);
11fdf7f2 10260 ceph_assert(r);
7c673cae
FG
10261 }
10262}
10263
10264int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
10265{
10266 Inode *in = fh->inode.get();
10267 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
10268 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
10269 return ret;
10270}
10271
10272int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
10273{
10274 Inode *in = fh->inode.get();
10275 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
10276 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
10277 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
10278 return ret;
10279}
10280
10281int Client::_flock(Fh *fh, int cmd, uint64_t owner)
10282{
10283 Inode *in = fh->inode.get();
10284 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
10285
10286 int sleep = !(cmd & LOCK_NB);
10287 cmd &= ~LOCK_NB;
10288
10289 int type;
10290 switch (cmd) {
10291 case LOCK_SH:
10292 type = F_RDLCK;
10293 break;
10294 case LOCK_EX:
10295 type = F_WRLCK;
10296 break;
10297 case LOCK_UN:
10298 type = F_UNLCK;
10299 break;
10300 default:
10301 return -EINVAL;
10302 }
10303
10304 struct flock fl;
10305 memset(&fl, 0, sizeof(fl));
10306 fl.l_type = type;
10307 fl.l_whence = SEEK_SET;
10308
10309 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
10310 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
10311 return ret;
10312}
10313
10314int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
10315{
10316 /* Since the only thing this does is wrap a call to statfs, and
10317 statfs takes a lock, it doesn't seem we have a need to split it
10318 out. */
10319 return statfs(0, stbuf, perms);
10320}
10321
10322void Client::ll_register_callbacks(struct client_callback_args *args)
10323{
10324 if (!args)
10325 return;
11fdf7f2
TL
10326 std::lock_guard l(client_lock);
10327 ldout(cct, 10) << __func__ << " cb " << args->handle
7c673cae
FG
10328 << " invalidate_ino_cb " << args->ino_cb
10329 << " invalidate_dentry_cb " << args->dentry_cb
7c673cae
FG
10330 << " switch_interrupt_cb " << args->switch_intr_cb
10331 << " remount_cb " << args->remount_cb
10332 << dendl;
10333 callback_handle = args->handle;
10334 if (args->ino_cb) {
10335 ino_invalidate_cb = args->ino_cb;
10336 async_ino_invalidator.start();
10337 }
10338 if (args->dentry_cb) {
10339 dentry_invalidate_cb = args->dentry_cb;
10340 async_dentry_invalidator.start();
10341 }
10342 if (args->switch_intr_cb) {
10343 switch_interrupt_cb = args->switch_intr_cb;
10344 interrupt_finisher.start();
10345 }
10346 if (args->remount_cb) {
10347 remount_cb = args->remount_cb;
10348 remount_finisher.start();
10349 }
7c673cae
FG
10350 umask_cb = args->umask_cb;
10351}
10352
10353int Client::test_dentry_handling(bool can_invalidate)
10354{
10355 int r = 0;
10356
10357 can_invalidate_dentries = can_invalidate;
10358
10359 if (can_invalidate_dentries) {
11fdf7f2 10360 ceph_assert(dentry_invalidate_cb);
7c673cae 10361 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
b32b8144 10362 r = 0;
11fdf7f2
TL
10363 } else {
10364 ceph_assert(remount_cb);
7c673cae 10365 ldout(cct, 1) << "using remount_cb" << dendl;
91327a77 10366 r = _do_remount(false);
b32b8144 10367 }
11fdf7f2 10368
7c673cae
FG
10369 return r;
10370}
10371
10372int Client::_sync_fs()
10373{
11fdf7f2 10374 ldout(cct, 10) << __func__ << dendl;
7c673cae
FG
10375
10376 // flush file data
11fdf7f2
TL
10377 std::unique_ptr<C_SaferCond> cond = nullptr;
10378 if (cct->_conf->client_oc) {
10379 cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
10380 objectcacher->flush_all(cond.get());
10381 }
7c673cae
FG
10382
10383 // flush caps
10384 flush_caps_sync();
10385 ceph_tid_t flush_tid = last_flush_tid;
10386
10387 // wait for unsafe mds requests
10388 wait_unsafe_requests();
10389
10390 wait_sync_caps(flush_tid);
10391
11fdf7f2 10392 if (nullptr != cond) {
7c673cae 10393 client_lock.Unlock();
11fdf7f2
TL
10394 ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
10395 cond->wait();
10396 ldout(cct, 15) << __func__ << " flush finished" << dendl;
7c673cae
FG
10397 client_lock.Lock();
10398 }
10399
10400 return 0;
10401}
10402
10403int Client::sync_fs()
10404{
11fdf7f2 10405 std::lock_guard l(client_lock);
181888fb
FG
10406
10407 if (unmounting)
10408 return -ENOTCONN;
10409
7c673cae
FG
10410 return _sync_fs();
10411}
10412
10413int64_t Client::drop_caches()
10414{
11fdf7f2 10415 std::lock_guard l(client_lock);
7c673cae
FG
10416 return objectcacher->release_all();
10417}
10418
11fdf7f2
TL
10419int Client::_lazyio(Fh *fh, int enable)
10420{
10421 Inode *in = fh->inode.get();
10422 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
10423
10424 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
10425 return 0;
10426
10427 int orig_mode = fh->mode;
10428 if (enable) {
10429 fh->mode |= CEPH_FILE_MODE_LAZY;
10430 in->get_open_ref(fh->mode);
10431 in->put_open_ref(orig_mode);
10432 check_caps(in, CHECK_CAPS_NODELAY);
10433 } else {
10434 fh->mode &= ~CEPH_FILE_MODE_LAZY;
10435 in->get_open_ref(fh->mode);
10436 in->put_open_ref(orig_mode);
10437 check_caps(in, 0);
10438 }
10439
10440 return 0;
10441}
10442
10443int Client::lazyio(int fd, int enable)
10444{
10445 std::lock_guard l(client_lock);
10446 Fh *f = get_filehandle(fd);
10447 if (!f)
10448 return -EBADF;
10449
10450 return _lazyio(f, enable);
10451}
10452
10453int Client::ll_lazyio(Fh *fh, int enable)
10454{
10455 std::lock_guard lock(client_lock);
10456 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
10457 tout(cct) << __func__ << std::endl;
10458
10459 return _lazyio(fh, enable);
10460}
7c673cae
FG
10461
10462int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
10463{
11fdf7f2 10464 std::lock_guard l(client_lock);
7c673cae
FG
10465 ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
10466 << ", " << offset << ", " << count << ")" << dendl;
10467
10468 Fh *f = get_filehandle(fd);
10469 if (!f)
10470 return -EBADF;
10471
10472 // for now
10473 _fsync(f, true);
10474
10475 return 0;
10476}
10477
10478int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10479{
11fdf7f2 10480 std::lock_guard l(client_lock);
7c673cae
FG
10481 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10482 << ", " << offset << ", " << count << ")" << dendl;
10483
10484 Fh *f = get_filehandle(fd);
10485 if (!f)
10486 return -EBADF;
10487 Inode *in = f->inode.get();
10488
10489 _fsync(f, true);
10490 if (_release(in))
10491 check_caps(in, 0);
10492 return 0;
10493}
10494
10495
10496// =============================
10497// snaps
10498
10499int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10500{
11fdf7f2 10501 std::lock_guard l(client_lock);
181888fb
FG
10502
10503 if (unmounting)
10504 return -ENOTCONN;
10505
7c673cae
FG
10506 filepath path(relpath);
10507 InodeRef in;
10508 int r = path_walk(path, &in, perm);
10509 if (r < 0)
10510 return r;
10511 if (cct->_conf->client_permissions) {
10512 r = may_create(in.get(), perm);
10513 if (r < 0)
10514 return r;
10515 }
10516 Inode *snapdir = open_snapdir(in.get());
10517 return _mkdir(snapdir, name, 0, perm);
10518}
181888fb 10519
7c673cae
FG
10520int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10521{
11fdf7f2 10522 std::lock_guard l(client_lock);
181888fb
FG
10523
10524 if (unmounting)
10525 return -ENOTCONN;
10526
7c673cae
FG
10527 filepath path(relpath);
10528 InodeRef in;
10529 int r = path_walk(path, &in, perms);
10530 if (r < 0)
10531 return r;
10532 if (cct->_conf->client_permissions) {
10533 r = may_delete(in.get(), NULL, perms);
10534 if (r < 0)
10535 return r;
10536 }
10537 Inode *snapdir = open_snapdir(in.get());
10538 return _rmdir(snapdir, name, perms);
10539}
10540
10541// =============================
10542// expose caps
10543
10544int Client::get_caps_issued(int fd) {
10545
11fdf7f2 10546 std::lock_guard lock(client_lock);
7c673cae 10547
181888fb
FG
10548 if (unmounting)
10549 return -ENOTCONN;
10550
7c673cae
FG
10551 Fh *f = get_filehandle(fd);
10552 if (!f)
10553 return -EBADF;
10554
10555 return f->inode->caps_issued();
10556}
10557
10558int Client::get_caps_issued(const char *path, const UserPerm& perms)
10559{
11fdf7f2 10560 std::lock_guard lock(client_lock);
181888fb
FG
10561
10562 if (unmounting)
10563 return -ENOTCONN;
10564
7c673cae
FG
10565 filepath p(path);
10566 InodeRef in;
10567 int r = path_walk(p, &in, perms, true);
10568 if (r < 0)
10569 return r;
10570 return in->caps_issued();
10571}
10572
10573// =========================================
10574// low level
10575
10576Inode *Client::open_snapdir(Inode *diri)
10577{
10578 Inode *in;
10579 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
10580 if (!inode_map.count(vino)) {
10581 in = new Inode(this, vino, &diri->layout);
10582
10583 in->ino = diri->ino;
10584 in->snapid = CEPH_SNAPDIR;
10585 in->mode = diri->mode;
10586 in->uid = diri->uid;
10587 in->gid = diri->gid;
494da23a 10588 in->nlink = 1;
7c673cae
FG
10589 in->mtime = diri->mtime;
10590 in->ctime = diri->ctime;
10591 in->btime = diri->btime;
10592 in->size = diri->size;
10593 in->change_attr = diri->change_attr;
10594
10595 in->dirfragtree.clear();
10596 in->snapdir_parent = diri;
10597 diri->flags |= I_SNAPDIR_OPEN;
10598 inode_map[vino] = in;
10599 if (use_faked_inos())
10600 _assign_faked_ino(in);
10601 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
10602 } else {
10603 in = inode_map[vino];
10604 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
10605 }
10606 return in;
10607}
10608
10609int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
10610 Inode **out, const UserPerm& perms)
10611{
11fdf7f2 10612 std::lock_guard lock(client_lock);
31f18b77 10613 vinodeno_t vparent = _get_vino(parent);
11fdf7f2
TL
10614 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
10615 tout(cct) << __func__ << std::endl;
7c673cae
FG
10616 tout(cct) << name << std::endl;
10617
181888fb
FG
10618 if (unmounting)
10619 return -ENOTCONN;
10620
7c673cae 10621 int r = 0;
11fdf7f2
TL
10622 auto fuse_default_permissions = cct->_conf.get_val<bool>(
10623 "fuse_default_permissions");
10624 if (!fuse_default_permissions) {
10625 if (strcmp(name, ".") && strcmp(name, "..")) {
10626 r = may_lookup(parent, perms);
10627 if (r < 0)
10628 return r;
10629 }
7c673cae
FG
10630 }
10631
10632 string dname(name);
10633 InodeRef in;
10634
10635 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
10636 if (r < 0) {
10637 attr->st_ino = 0;
10638 goto out;
10639 }
10640
11fdf7f2 10641 ceph_assert(in);
7c673cae
FG
10642 fill_stat(in, attr);
10643 _ll_get(in.get());
10644
10645 out:
11fdf7f2 10646 ldout(cct, 3) << __func__ << " " << vparent << " " << name
7c673cae
FG
10647 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
10648 tout(cct) << attr->st_ino << std::endl;
10649 *out = in.get();
10650 return r;
10651}
10652
1adf2230
AA
10653int Client::ll_lookup_inode(
10654 struct inodeno_t ino,
10655 const UserPerm& perms,
10656 Inode **inode)
10657{
81eedcae 10658 ceph_assert(inode != NULL);
11fdf7f2 10659 std::lock_guard lock(client_lock);
1adf2230
AA
10660 ldout(cct, 3) << "ll_lookup_inode " << ino << dendl;
10661
81eedcae
TL
10662 if (unmounting)
10663 return -ENOTCONN;
10664
1adf2230
AA
10665 // Num1: get inode and *inode
10666 int r = _lookup_ino(ino, perms, inode);
81eedcae 10667 if (r)
1adf2230 10668 return r;
81eedcae 10669
11fdf7f2 10670 ceph_assert(*inode != NULL);
1adf2230 10671
81eedcae
TL
10672 if (!(*inode)->dentries.empty()) {
10673 ldout(cct, 8) << __func__ << " dentry already present" << dendl;
10674 return 0;
10675 }
10676
10677 if ((*inode)->is_root()) {
10678 ldout(cct, 8) << "ino is root, no parent" << dendl;
10679 return 0;
10680 }
10681
1adf2230
AA
10682 // Num2: Request the parent inode, so that we can look up the name
10683 Inode *parent;
10684 r = _lookup_parent(*inode, perms, &parent);
81eedcae 10685 if (r) {
1adf2230
AA
10686 _ll_forget(*inode, 1);
10687 return r;
1adf2230 10688 }
81eedcae 10689
11fdf7f2 10690 ceph_assert(parent != NULL);
1adf2230
AA
10691
10692 // Num3: Finally, get the name (dentry) of the requested inode
10693 r = _lookup_name(*inode, parent, perms);
10694 if (r) {
10695 // Unexpected error
10696 _ll_forget(parent, 1);
10697 _ll_forget(*inode, 1);
10698 return r;
10699 }
10700
10701 _ll_forget(parent, 1);
10702 return 0;
10703}
10704
7c673cae
FG
10705int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
10706 struct ceph_statx *stx, unsigned want, unsigned flags,
10707 const UserPerm& perms)
10708{
11fdf7f2 10709 std::lock_guard lock(client_lock);
31f18b77 10710 vinodeno_t vparent = _get_vino(parent);
11fdf7f2 10711 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
7c673cae
FG
10712 tout(cct) << "ll_lookupx" << std::endl;
10713 tout(cct) << name << std::endl;
10714
181888fb
FG
10715 if (unmounting)
10716 return -ENOTCONN;
10717
7c673cae 10718 int r = 0;
11fdf7f2
TL
10719 auto fuse_default_permissions = cct->_conf.get_val<bool>(
10720 "fuse_default_permissions");
10721 if (!fuse_default_permissions) {
7c673cae
FG
10722 r = may_lookup(parent, perms);
10723 if (r < 0)
10724 return r;
10725 }
10726
10727 string dname(name);
10728 InodeRef in;
10729
10730 unsigned mask = statx_to_mask(flags, want);
10731 r = _lookup(parent, dname, mask, &in, perms);
10732 if (r < 0) {
10733 stx->stx_ino = 0;
10734 stx->stx_mask = 0;
10735 } else {
11fdf7f2 10736 ceph_assert(in);
7c673cae
FG
10737 fill_statx(in, mask, stx);
10738 _ll_get(in.get());
10739 }
10740
11fdf7f2 10741 ldout(cct, 3) << __func__ << " " << vparent << " " << name
7c673cae
FG
10742 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
10743 tout(cct) << stx->stx_ino << std::endl;
10744 *out = in.get();
10745 return r;
10746}
10747
10748int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10749 unsigned int want, unsigned int flags, const UserPerm& perms)
10750{
11fdf7f2 10751 std::lock_guard lock(client_lock);
181888fb
FG
10752
10753 if (unmounting)
10754 return -ENOTCONN;
10755
7c673cae
FG
10756 filepath fp(name, 0);
10757 InodeRef in;
10758 int rc;
10759 unsigned mask = statx_to_mask(flags, want);
10760
11fdf7f2
TL
10761 ldout(cct, 3) << __func__ << " " << name << dendl;
10762 tout(cct) << __func__ << std::endl;
7c673cae
FG
10763 tout(cct) << name << std::endl;
10764
10765 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
10766 if (rc < 0) {
10767 /* zero out mask, just in case... */
10768 stx->stx_mask = 0;
10769 stx->stx_ino = 0;
10770 *out = NULL;
10771 return rc;
10772 } else {
11fdf7f2 10773 ceph_assert(in);
7c673cae
FG
10774 fill_statx(in, mask, stx);
10775 _ll_get(in.get());
10776 *out = in.get();
10777 return 0;
10778 }
10779}
10780
10781void Client::_ll_get(Inode *in)
10782{
10783 if (in->ll_ref == 0) {
10784 in->get();
11fdf7f2
TL
10785 if (in->is_dir() && !in->dentries.empty()) {
10786 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
7c673cae
FG
10787 in->get_first_parent()->get(); // pin dentry
10788 }
11fdf7f2
TL
10789 if (in->snapid != CEPH_NOSNAP)
10790 ll_snap_ref[in->snapid]++;
7c673cae
FG
10791 }
10792 in->ll_get();
11fdf7f2 10793 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
7c673cae
FG
10794}
10795
494da23a 10796int Client::_ll_put(Inode *in, uint64_t num)
7c673cae
FG
10797{
10798 in->ll_put(num);
11fdf7f2 10799 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
7c673cae 10800 if (in->ll_ref == 0) {
11fdf7f2
TL
10801 if (in->is_dir() && !in->dentries.empty()) {
10802 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
7c673cae
FG
10803 in->get_first_parent()->put(); // unpin dentry
10804 }
11fdf7f2
TL
10805 if (in->snapid != CEPH_NOSNAP) {
10806 auto p = ll_snap_ref.find(in->snapid);
10807 ceph_assert(p != ll_snap_ref.end());
10808 ceph_assert(p->second > 0);
10809 if (--p->second == 0)
10810 ll_snap_ref.erase(p);
10811 }
7c673cae
FG
10812 put_inode(in);
10813 return 0;
10814 } else {
10815 return in->ll_ref;
10816 }
10817}
10818
10819void Client::_ll_drop_pins()
10820{
11fdf7f2 10821 ldout(cct, 10) << __func__ << dendl;
1adf2230 10822 std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
7c673cae
FG
10823 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
10824 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
10825 it != inode_map.end();
10826 it = next) {
10827 Inode *in = it->second;
10828 next = it;
10829 ++next;
1adf2230
AA
10830 if (in->ll_ref){
10831 to_be_put.insert(in);
7c673cae 10832 _ll_put(in, in->ll_ref);
1adf2230 10833 }
7c673cae
FG
10834 }
10835}
10836
494da23a 10837bool Client::_ll_forget(Inode *in, uint64_t count)
7c673cae 10838{
11fdf7f2 10839 inodeno_t ino = in->ino;
7c673cae 10840
11fdf7f2
TL
10841 ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
10842 tout(cct) << __func__ << std::endl;
7c673cae
FG
10843 tout(cct) << ino.val << std::endl;
10844 tout(cct) << count << std::endl;
10845
181888fb
FG
10846 // Ignore forget if we're no longer mounted
10847 if (unmounting)
10848 return true;
10849
7c673cae
FG
10850 if (ino == 1) return true; // ignore forget on root.
10851
10852 bool last = false;
10853 if (in->ll_ref < count) {
10854 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
10855 << ", which only has ll_ref=" << in->ll_ref << dendl;
10856 _ll_put(in, in->ll_ref);
10857 last = true;
10858 } else {
10859 if (_ll_put(in, count) == 0)
10860 last = true;
10861 }
10862
10863 return last;
10864}
10865
494da23a 10866bool Client::ll_forget(Inode *in, uint64_t count)
1adf2230 10867{
11fdf7f2 10868 std::lock_guard lock(client_lock);
1adf2230
AA
10869 return _ll_forget(in, count);
10870}
10871
7c673cae
FG
10872bool Client::ll_put(Inode *in)
10873{
10874 /* ll_forget already takes the lock */
10875 return ll_forget(in, 1);
10876}
10877
11fdf7f2
TL
10878int Client::ll_get_snap_ref(snapid_t snap)
10879{
10880 std::lock_guard lock(client_lock);
10881 auto p = ll_snap_ref.find(snap);
10882 if (p != ll_snap_ref.end())
10883 return p->second;
10884 return 0;
10885}
10886
7c673cae
FG
10887snapid_t Client::ll_get_snapid(Inode *in)
10888{
11fdf7f2 10889 std::lock_guard lock(client_lock);
7c673cae
FG
10890 return in->snapid;
10891}
10892
10893Inode *Client::ll_get_inode(ino_t ino)
10894{
11fdf7f2 10895 std::lock_guard lock(client_lock);
181888fb
FG
10896
10897 if (unmounting)
10898 return NULL;
10899
7c673cae
FG
10900 vinodeno_t vino = _map_faked_ino(ino);
10901 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10902 if (p == inode_map.end())
10903 return NULL;
10904 Inode *in = p->second;
10905 _ll_get(in);
10906 return in;
10907}
10908
10909Inode *Client::ll_get_inode(vinodeno_t vino)
10910{
11fdf7f2 10911 std::lock_guard lock(client_lock);
181888fb
FG
10912
10913 if (unmounting)
10914 return NULL;
10915
7c673cae
FG
10916 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10917 if (p == inode_map.end())
10918 return NULL;
10919 Inode *in = p->second;
10920 _ll_get(in);
10921 return in;
10922}
10923
10924int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
10925{
10926 vinodeno_t vino = _get_vino(in);
10927
11fdf7f2
TL
10928 ldout(cct, 8) << __func__ << " " << vino << dendl;
10929 tout(cct) << __func__ << std::endl;
7c673cae
FG
10930 tout(cct) << vino.ino.val << std::endl;
10931
10932 if (vino.snapid < CEPH_NOSNAP)
10933 return 0;
10934 else
10935 return _getattr(in, caps, perms);
10936}
10937
10938int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
10939{
11fdf7f2 10940 std::lock_guard lock(client_lock);
7c673cae 10941
181888fb
FG
10942 if (unmounting)
10943 return -ENOTCONN;
10944
7c673cae
FG
10945 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
10946
10947 if (res == 0)
10948 fill_stat(in, attr);
11fdf7f2 10949 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
10950 return res;
10951}
10952
10953int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
10954 unsigned int flags, const UserPerm& perms)
10955{
11fdf7f2 10956 std::lock_guard lock(client_lock);
7c673cae 10957
181888fb
FG
10958 if (unmounting)
10959 return -ENOTCONN;
10960
7c673cae
FG
10961 int res = 0;
10962 unsigned mask = statx_to_mask(flags, want);
10963
94b18763 10964 if (mask && !in->caps_issued_mask(mask, true))
7c673cae
FG
10965 res = _ll_getattr(in, mask, perms);
10966
10967 if (res == 0)
10968 fill_statx(in, mask, stx);
11fdf7f2 10969 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
10970 return res;
10971}
10972
// Shared implementation behind ll_setattr/ll_setattrx: permission check,
// trace logging, then the actual setattr.  On success *inp refers to the
// (possibly replaced) inode.  Returns 0 or a negative errno.
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
		<< dendl;
  // Record the call in the client trace log; the order of these fields is
  // part of the trace format, so keep it stable.
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  // When FUSE does not do permission checks for us, enforce them here.
  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // *_NOW flags were only meaningful for the permission check above;
  // __setattrx expects them stripped.
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
11003
11004int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
11005 const UserPerm& perms)
11006{
11fdf7f2 11007 std::lock_guard lock(client_lock);
181888fb
FG
11008
11009 if (unmounting)
11010 return -ENOTCONN;
11011
7c673cae
FG
11012 InodeRef target(in);
11013 int res = _ll_setattrx(in, stx, mask, perms, &target);
11014 if (res == 0) {
11fdf7f2 11015 ceph_assert(in == target.get());
7c673cae
FG
11016 fill_statx(in, in->caps_issued(), stx);
11017 }
11018
11fdf7f2 11019 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11020 return res;
11021}
11022
11023int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
11024 const UserPerm& perms)
11025{
11026 struct ceph_statx stx;
11027 stat_to_statx(attr, &stx);
11028
11fdf7f2 11029 std::lock_guard lock(client_lock);
181888fb
FG
11030
11031 if (unmounting)
11032 return -ENOTCONN;
11033
7c673cae
FG
11034 InodeRef target(in);
11035 int res = _ll_setattrx(in, &stx, mask, perms, &target);
11036 if (res == 0) {
11fdf7f2 11037 ceph_assert(in == target.get());
7c673cae
FG
11038 fill_stat(in, attr);
11039 }
11040
11fdf7f2 11041 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
11042 return res;
11043}
11044
11045
11046// ----------
11047// xattrs
11048
11049int Client::getxattr(const char *path, const char *name, void *value, size_t size,
11050 const UserPerm& perms)
11051{
11fdf7f2 11052 std::lock_guard lock(client_lock);
181888fb
FG
11053
11054 if (unmounting)
11055 return -ENOTCONN;
11056
7c673cae
FG
11057 InodeRef in;
11058 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11059 if (r < 0)
11060 return r;
11061 return _getxattr(in, name, value, size, perms);
11062}
11063
11064int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
11065 const UserPerm& perms)
11066{
11fdf7f2 11067 std::lock_guard lock(client_lock);
181888fb
FG
11068
11069 if (unmounting)
11070 return -ENOTCONN;
11071
7c673cae
FG
11072 InodeRef in;
11073 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11074 if (r < 0)
11075 return r;
11076 return _getxattr(in, name, value, size, perms);
11077}
11078
11079int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
11080 const UserPerm& perms)
11081{
11fdf7f2 11082 std::lock_guard lock(client_lock);
181888fb
FG
11083
11084 if (unmounting)
11085 return -ENOTCONN;
11086
7c673cae
FG
11087 Fh *f = get_filehandle(fd);
11088 if (!f)
11089 return -EBADF;
11090 return _getxattr(f->inode, name, value, size, perms);
11091}
11092
11093int Client::listxattr(const char *path, char *list, size_t size,
11094 const UserPerm& perms)
11095{
11fdf7f2 11096 std::lock_guard lock(client_lock);
181888fb
FG
11097
11098 if (unmounting)
11099 return -ENOTCONN;
11100
7c673cae
FG
11101 InodeRef in;
11102 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
11103 if (r < 0)
11104 return r;
11105 return Client::_listxattr(in.get(), list, size, perms);
11106}
11107
11108int Client::llistxattr(const char *path, char *list, size_t size,
11109 const UserPerm& perms)
11110{
11fdf7f2 11111 std::lock_guard lock(client_lock);
181888fb
FG
11112
11113 if (unmounting)
11114 return -ENOTCONN;
11115
7c673cae
FG
11116 InodeRef in;
11117 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
11118 if (r < 0)
11119 return r;
11120 return Client::_listxattr(in.get(), list, size, perms);
11121}
11122
11123int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
11124{
11fdf7f2 11125 std::lock_guard lock(client_lock);
181888fb
FG
11126
11127 if (unmounting)
11128 return -ENOTCONN;
11129
7c673cae
FG
11130 Fh *f = get_filehandle(fd);
11131 if (!f)
11132 return -EBADF;
11133 return Client::_listxattr(f->inode.get(), list, size, perms);
11134}
11135
11136int Client::removexattr(const char *path, const char *name,
11137 const UserPerm& perms)
11138{
11fdf7f2 11139 std::lock_guard lock(client_lock);
181888fb
FG
11140
11141 if (unmounting)
11142 return -ENOTCONN;
11143
7c673cae
FG
11144 InodeRef in;
11145 int r = Client::path_walk(path, &in, perms, true);
11146 if (r < 0)
11147 return r;
11148 return _removexattr(in, name, perms);
11149}
11150
11151int Client::lremovexattr(const char *path, const char *name,
11152 const UserPerm& perms)
11153{
11fdf7f2 11154 std::lock_guard lock(client_lock);
181888fb
FG
11155
11156 if (unmounting)
11157 return -ENOTCONN;
11158
7c673cae
FG
11159 InodeRef in;
11160 int r = Client::path_walk(path, &in, perms, false);
11161 if (r < 0)
11162 return r;
11163 return _removexattr(in, name, perms);
11164}
11165
11166int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
11167{
11fdf7f2 11168 std::lock_guard lock(client_lock);
181888fb
FG
11169
11170 if (unmounting)
11171 return -ENOTCONN;
11172
7c673cae
FG
11173 Fh *f = get_filehandle(fd);
11174 if (!f)
11175 return -EBADF;
11176 return _removexattr(f->inode, name, perms);
11177}
11178
11179int Client::setxattr(const char *path, const char *name, const void *value,
11180 size_t size, int flags, const UserPerm& perms)
11181{
11182 _setxattr_maybe_wait_for_osdmap(name, value, size);
11183
11fdf7f2 11184 std::lock_guard lock(client_lock);
181888fb
FG
11185
11186 if (unmounting)
11187 return -ENOTCONN;
11188
7c673cae
FG
11189 InodeRef in;
11190 int r = Client::path_walk(path, &in, perms, true);
11191 if (r < 0)
11192 return r;
11193 return _setxattr(in, name, value, size, flags, perms);
11194}
11195
11196int Client::lsetxattr(const char *path, const char *name, const void *value,
11197 size_t size, int flags, const UserPerm& perms)
11198{
11199 _setxattr_maybe_wait_for_osdmap(name, value, size);
11200
11fdf7f2 11201 std::lock_guard lock(client_lock);
181888fb
FG
11202
11203 if (unmounting)
11204 return -ENOTCONN;
11205
7c673cae
FG
11206 InodeRef in;
11207 int r = Client::path_walk(path, &in, perms, false);
11208 if (r < 0)
11209 return r;
11210 return _setxattr(in, name, value, size, flags, perms);
11211}
11212
11213int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
11214 int flags, const UserPerm& perms)
11215{
11216 _setxattr_maybe_wait_for_osdmap(name, value, size);
11217
11fdf7f2 11218 std::lock_guard lock(client_lock);
181888fb
FG
11219
11220 if (unmounting)
11221 return -ENOTCONN;
11222
7c673cae
FG
11223 Fh *f = get_filehandle(fd);
11224 if (!f)
11225 return -EBADF;
11226 return _setxattr(f->inode, name, value, size, flags, perms);
11227}
11228
11229int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
11230 const UserPerm& perms)
11231{
11232 int r;
11233
11234 const VXattr *vxattr = _match_vxattr(in, name);
11235 if (vxattr) {
11236 r = -ENODATA;
11237
11238 // Do a force getattr to get the latest quota before returning
11239 // a value to userspace.
28e407b8
AA
11240 int flags = 0;
11241 if (vxattr->flags & VXATTR_RSTAT) {
11242 flags |= CEPH_STAT_RSTAT;
11243 }
11244 r = _getattr(in, flags, perms, true);
7c673cae
FG
11245 if (r != 0) {
11246 // Error from getattr!
11247 return r;
11248 }
11249
11250 // call pointer-to-member function
11251 char buf[256];
11252 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
11253 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
11254 } else {
11255 r = -ENODATA;
11256 }
11257
11258 if (size != 0) {
11259 if (r > (int)size) {
11260 r = -ERANGE;
11261 } else if (r > 0) {
11262 memcpy(value, buf, r);
11263 }
11264 }
11265 goto out;
11266 }
11267
11268 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
11269 r = -EOPNOTSUPP;
11270 goto out;
11271 }
11272
11273 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
11274 if (r == 0) {
11275 string n(name);
11276 r = -ENODATA;
11277 if (in->xattrs.count(n)) {
11278 r = in->xattrs[n].length();
11279 if (r > 0 && size != 0) {
11280 if (size >= (unsigned)r)
11281 memcpy(value, in->xattrs[n].c_str(), r);
11282 else
11283 r = -ERANGE;
11284 }
11285 }
11286 }
11287 out:
1adf2230 11288 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
7c673cae
FG
11289 return r;
11290}
11291
11292int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
11293 const UserPerm& perms)
11294{
11295 if (cct->_conf->client_permissions) {
11296 int r = xattr_permission(in.get(), name, MAY_READ, perms);
11297 if (r < 0)
11298 return r;
11299 }
11300 return _getxattr(in.get(), name, value, size, perms);
11301}
11302
11303int Client::ll_getxattr(Inode *in, const char *name, void *value,
11304 size_t size, const UserPerm& perms)
11305{
11fdf7f2 11306 std::lock_guard lock(client_lock);
7c673cae 11307
181888fb
FG
11308 if (unmounting)
11309 return -ENOTCONN;
11310
7c673cae
FG
11311 vinodeno_t vino = _get_vino(in);
11312
11fdf7f2
TL
11313 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11314 tout(cct) << __func__ << std::endl;
7c673cae
FG
11315 tout(cct) << vino.ino.val << std::endl;
11316 tout(cct) << name << std::endl;
11317
11fdf7f2
TL
11318 auto fuse_default_permissions = cct->_conf.get_val<bool>(
11319 "fuse_default_permissions");
11320 if (!fuse_default_permissions) {
7c673cae
FG
11321 int r = xattr_permission(in, name, MAY_READ, perms);
11322 if (r < 0)
11323 return r;
11324 }
11325
11326 return _getxattr(in, name, value, size, perms);
11327}
11328
// Core listxattr: copies every real xattr name, then every non-hidden
// virtual ("ceph.*") xattr name, NUL-separated, into `name`.  When the
// caller passes size == 0 (len_only) nothing is copied and the total
// required length is returned instead.  Returns the length used/needed,
// -ERANGE if the buffer is too small, or the _getattr error.
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  bool len_only = (size == 0);
  // Refresh the cached xattr map; force the fetch if never loaded.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r != 0) {
    goto out;
  }

  // Real xattrs first.  `r` accumulates the total length; `name`/`size`
  // track the remaining output window.
  r = 0;
  for (const auto& p : in->xattrs) {
    size_t this_len = p.first.length() + 1;  // +1 for the NUL separator
    r += this_len;
    if (len_only)
      continue;

    if (this_len > size) {
      r = -ERANGE;
      goto out;
    }

    memcpy(name, p.first.c_str(), this_len);
    name += this_len;
    size -= this_len;
  }

  // Then the virtual xattrs exposed for this inode type.
  const VXattr *vxattr;
  for (vxattr = _get_vxattrs(in); vxattr && !vxattr->name.empty(); vxattr++) {
    if (vxattr->hidden)
      continue;
    // call pointer-to-member function: skip vxattrs that don't currently
    // exist on this inode
    if (vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))
      continue;

    size_t this_len = vxattr->name.length() + 1;
    r += this_len;
    if (len_only)
      continue;

    if (this_len > size) {
      r = -ERANGE;
      goto out;
    }

    memcpy(name, vxattr->name.c_str(), this_len);
    name += this_len;
    size -= this_len;
  }
out:
  ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
11381
11382int Client::ll_listxattr(Inode *in, char *names, size_t size,
11383 const UserPerm& perms)
11384{
11fdf7f2 11385 std::lock_guard lock(client_lock);
7c673cae 11386
181888fb
FG
11387 if (unmounting)
11388 return -ENOTCONN;
11389
7c673cae
FG
11390 vinodeno_t vino = _get_vino(in);
11391
11fdf7f2
TL
11392 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
11393 tout(cct) << __func__ << std::endl;
7c673cae
FG
11394 tout(cct) << vino.ino.val << std::endl;
11395 tout(cct) << size << std::endl;
11396
11397 return _listxattr(in, names, size, perms);
11398}
11399
11400int Client::_do_setxattr(Inode *in, const char *name, const void *value,
11401 size_t size, int flags, const UserPerm& perms)
11402{
11403
11404 int xattr_flags = 0;
11405 if (!value)
11406 xattr_flags |= CEPH_XATTR_REMOVE;
11407 if (flags & XATTR_CREATE)
11408 xattr_flags |= CEPH_XATTR_CREATE;
11409 if (flags & XATTR_REPLACE)
11410 xattr_flags |= CEPH_XATTR_REPLACE;
11411
11412 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
11413 filepath path;
11414 in->make_nosnap_relative_path(path);
11415 req->set_filepath(path);
11416 req->set_string2(name);
11417 req->set_inode(in);
11418 req->head.args.setxattr.flags = xattr_flags;
11419
11420 bufferlist bl;
11fdf7f2 11421 assert (value || size == 0);
7c673cae
FG
11422 bl.append((const char*)value, size);
11423 req->set_data(bl);
11424
11425 int res = make_request(req, perms);
11426
11427 trim_cache();
11fdf7f2 11428 ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
7c673cae
FG
11429 res << dendl;
11430 return res;
11431}
11432
// Core setxattr: validates the namespace, handles POSIX ACL xattrs
// specially (mode rewrite / default-ACL checks), rejects read-only virtual
// xattrs, then sends the request via _do_setxattr.  For quota xattrs it
// additionally verifies the MDS created a snaprealm for the quota inode.
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  // Snapshots are read-only.
  if (in->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  // "system.*" is only meaningful when POSIX ACLs are enabled.
  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  // Same xattr namespaces the kernel client accepts.
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -EOPNOTSUPP;

  bool check_realm = false;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	// If the ACL is equivalent to a plain mode, drop the xattr
	// (value=NULL => remove) and apply the mode via setattr instead.
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  // NOTE: only stx_mode is initialized here; _do_setattr reads just
	  // the fields selected by CEPH_SETATTR_MODE.
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// Default ACLs only make sense on directories.
	if (!S_ISDIR(in->mode))
	  return -EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -EINVAL;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -EOPNOTSUPP;
    }
  } else {
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr) {
      if (vxattr->readonly)
	return -EOPNOTSUPP;
      // Setting a quota requires the MDS to set up a snaprealm; verify below.
      if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
	check_realm = true;
    }
  }

  int ret = _do_setxattr(in, name, value, size, flags, perms);
  if (ret >= 0 && check_realm) {
    // check if snaprealm was created for quota inode
    if (in->quota.is_enable() &&
	!(in->snaprealm && in->snaprealm->ino == in->ino))
      ret = -EOPNOTSUPP;
  }

  return ret;
}
11507
11508int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11509 size_t size, int flags, const UserPerm& perms)
11510{
11511 if (cct->_conf->client_permissions) {
11512 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11513 if (r < 0)
11514 return r;
11515 }
11516 return _setxattr(in.get(), name, value, size, flags, perms);
11517}
11518
11519int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
11520{
11521 string tmp;
11522 if (name == "layout") {
11523 string::iterator begin = value.begin();
11524 string::iterator end = value.end();
11525 keys_and_values<string::iterator> p; // create instance of parser
11526 std::map<string, string> m; // map to receive results
11527 if (!qi::parse(begin, end, p, m)) { // returns true if successful
11528 return -EINVAL;
11529 }
11530 if (begin != end)
11531 return -EINVAL;
11532 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
11533 if (q->first == "pool") {
11534 tmp = q->second;
11535 break;
11536 }
11537 }
11538 } else if (name == "layout.pool") {
11539 tmp = value;
11540 }
11541
11542 if (tmp.length()) {
11543 int64_t pool;
11544 try {
11545 pool = boost::lexical_cast<unsigned>(tmp);
11546 if (!osdmap->have_pg_pool(pool))
11547 return -ENOENT;
11548 } catch (boost::bad_lexical_cast const&) {
11549 pool = osdmap->lookup_pg_pool_name(tmp);
11550 if (pool < 0) {
11551 return -ENOENT;
11552 }
11553 }
11554 }
11555
11556 return 0;
11557}
11558
11559void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11560{
11561 // For setting pool of layout, MetaRequest need osdmap epoch.
11562 // There is a race which create a new data pool but client and mds both don't have.
11563 // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
11564 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11565 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11566 string rest(strstr(name, "layout"));
11567 string v((const char*)value, size);
11568 int r = objecter->with_osdmap([&](const OSDMap& o) {
11569 return _setxattr_check_data_pool(rest, v, &o);
11570 });
11571
11572 if (r == -ENOENT) {
11573 C_SaferCond ctx;
11574 objecter->wait_for_latest_osdmap(&ctx);
11575 ctx.wait();
11576 }
11577 }
11578}
11579
11580int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11581 size_t size, int flags, const UserPerm& perms)
11582{
11583 _setxattr_maybe_wait_for_osdmap(name, value, size);
11584
11fdf7f2 11585 std::lock_guard lock(client_lock);
7c673cae 11586
181888fb
FG
11587 if (unmounting)
11588 return -ENOTCONN;
11589
7c673cae
FG
11590 vinodeno_t vino = _get_vino(in);
11591
11fdf7f2
TL
11592 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
11593 tout(cct) << __func__ << std::endl;
7c673cae
FG
11594 tout(cct) << vino.ino.val << std::endl;
11595 tout(cct) << name << std::endl;
11596
11fdf7f2
TL
11597 auto fuse_default_permissions = cct->_conf.get_val<bool>(
11598 "fuse_default_permissions");
11599 if (!fuse_default_permissions) {
7c673cae
FG
11600 int r = xattr_permission(in, name, MAY_WRITE, perms);
11601 if (r < 0)
11602 return r;
11603 }
11604 return _setxattr(in, name, value, size, flags, perms);
11605}
11606
11607int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
11608{
11609 if (in->snapid != CEPH_NOSNAP) {
11610 return -EROFS;
11611 }
11612
11613 // same xattrs supported by kernel client
11614 if (strncmp(name, "user.", 5) &&
11615 strncmp(name, "system.", 7) &&
11616 strncmp(name, "security.", 9) &&
11617 strncmp(name, "trusted.", 8) &&
11618 strncmp(name, "ceph.", 5))
11619 return -EOPNOTSUPP;
11620
11621 const VXattr *vxattr = _match_vxattr(in, name);
11622 if (vxattr && vxattr->readonly)
11623 return -EOPNOTSUPP;
11624
11625 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
11626 filepath path;
11627 in->make_nosnap_relative_path(path);
11628 req->set_filepath(path);
11629 req->set_filepath2(name);
11630 req->set_inode(in);
11631
11632 int res = make_request(req, perms);
11633
11634 trim_cache();
1adf2230 11635 ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
7c673cae
FG
11636 return res;
11637}
11638
11639int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11640{
11641 if (cct->_conf->client_permissions) {
11642 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11643 if (r < 0)
11644 return r;
11645 }
11646 return _removexattr(in.get(), name, perms);
11647}
11648
11649int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11650{
11fdf7f2 11651 std::lock_guard lock(client_lock);
7c673cae 11652
181888fb
FG
11653 if (unmounting)
11654 return -ENOTCONN;
11655
7c673cae
FG
11656 vinodeno_t vino = _get_vino(in);
11657
11658 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11659 tout(cct) << "ll_removexattr" << std::endl;
11660 tout(cct) << vino.ino.val << std::endl;
11661 tout(cct) << name << std::endl;
11662
11fdf7f2
TL
11663 auto fuse_default_permissions = cct->_conf.get_val<bool>(
11664 "fuse_default_permissions");
11665 if (!fuse_default_permissions) {
7c673cae
FG
11666 int r = xattr_permission(in, name, MAY_WRITE, perms);
11667 if (r < 0)
11668 return r;
11669 }
11670
11671 return _removexattr(in, name, perms);
11672}
11673
// "ceph.quota" exists only when a quota is set AND the inode is the root of
// its own snaprealm (the MDS creates one when a quota is established).
bool Client::_vxattrcb_quota_exists(Inode *in)
{
  return in->quota.is_enable() &&
	 in->snaprealm && in->snaprealm->ino == in->ino;
}
// "ceph.quota": both limits in one string; returns the formatted length.
size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
{
  return snprintf(val, size,
                  "max_bytes=%lld max_files=%lld",
                  (long long int)in->quota.max_bytes,
                  (long long int)in->quota.max_files);
}
// "ceph.quota.max_bytes"
size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
}
// "ceph.quota.max_files"
size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
}
11694
// Layout vxattrs exist only when the inode carries a non-default layout.
bool Client::_vxattrcb_layout_exists(Inode *in)
{
  return in->layout != file_layout_t();
}
// "ceph.{file,dir}.layout": all layout fields in one string; the pool is
// rendered by name when this client's osdmap knows it, by id otherwise.
// NOTE(review): if the fixed fields alone exceed `size`, snprintf's return
// (chars that *would* have been written) makes `size - r` wrap as size_t —
// callers pass a 256-byte buffer so this doesn't trigger in practice, but
// worth confirming upstream.
size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
{
  int r = snprintf(val, size,
      "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
      (unsigned long long)in->layout.stripe_unit,
      (unsigned long long)in->layout.stripe_count,
      (unsigned long long)in->layout.object_size);
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r += snprintf(val + r, size - r, "%s",
		      o.get_pool_name(in->layout.pool_id).c_str());
      else
	r += snprintf(val + r, size - r, "%" PRIu64,
		      (uint64_t)in->layout.pool_id);
    });
  if (in->layout.pool_ns.length())
    r += snprintf(val + r, size - r, " pool_namespace=%s",
		  in->layout.pool_ns.c_str());
  return r;
}
// "ceph.*.layout.stripe_unit"
size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
}
// "ceph.*.layout.stripe_count"
size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
}
// "ceph.*.layout.object_size"
size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
}
// "ceph.*.layout.pool": name when known to our osdmap, numeric id otherwise.
size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
{
  size_t r;
  objecter->with_osdmap([&](const OSDMap& o) {
      if (o.have_pg_pool(in->layout.pool_id))
	r = snprintf(val, size, "%s", o.get_pool_name(
		       in->layout.pool_id).c_str());
      else
	r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
    });
  return r;
}
// "ceph.*.layout.pool_namespace"
size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
}
// "ceph.dir.entries": direct children (files + subdirs).
size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
}
// "ceph.dir.files": direct file children.
size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
}
// "ceph.dir.subdirs": direct subdirectory children.
size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
}
// "ceph.dir.rentries": recursive entry count (files + subdirs).
size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
}
// "ceph.dir.rfiles": recursive file count.
size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
}
// "ceph.dir.rsubdirs": recursive subdirectory count.
size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
}
// "ceph.dir.rbytes": recursive byte total.
size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
}
// "ceph.dir.rctime": recursive ctime as "sec.nsec" (nsec zero-padded to 9).
size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
		  (long)in->rstat.rctime.nsec());
}
11fdf7f2
TL
// "ceph.dir.pin" exists when the directory has an export pin set
// (-ENODATA is the sentinel for "no pin").
bool Client::_vxattrcb_dir_pin_exists(Inode *in)
{
  return in->dir_pin != -ENODATA;
}
// "ceph.dir.pin": the MDS rank this subtree is pinned to.
size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%ld", (long)in->dir_pin);
}

// "ceph.snap.btime" exists only on snapshot inodes with a birth time.
bool Client::_vxattrcb_snap_btime_exists(Inode *in)
{
  return !in->snap_btime.is_zero();
}

// "ceph.snap.btime": snapshot creation time as "sec.nsec".
size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu.%09lu",
      (long long unsigned)in->snap_btime.sec(),
      (long unsigned)in->snap_btime.nsec());
}
11800
7c673cae
FG
// Helpers for building the virtual-xattr tables below.
// CEPH_XATTR_NAME(type, name)        -> "ceph.<type>.<name>"
// CEPH_XATTR_NAME2(type, name, name2)-> "ceph.<type>.<name>.<name2>"
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// A read-only, listable vxattr that always exists (no exists_cb).
#define XATTR_NAME_CEPH(_type, _name)					\
{									\
  name: CEPH_XATTR_NAME(_type, _name),					\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,		\
  readonly: true,							\
  hidden: false,							\
  exists_cb: NULL,							\
  flags: 0,								\
}
// Same as XATTR_NAME_CEPH but with explicit flags (e.g. VXATTR_RSTAT to
// force a recursive-stat refresh on read).
#define XATTR_NAME_CEPH2(_type, _name, _flags)				\
{									\
  name: CEPH_XATTR_NAME(_type, _name),					\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,		\
  readonly: true,							\
  hidden: false,							\
  exists_cb: NULL,							\
  flags: _flags,							\
}
// A writable, hidden per-field layout vxattr; exists only when the inode
// has a non-default layout.
#define XATTR_LAYOUT_FIELD(_type, _name, _field)			\
{									\
  name: CEPH_XATTR_NAME2(_type, _name, _field),				\
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,		\
  readonly: false,							\
  hidden: true,								\
  exists_cb: &Client::_vxattrcb_layout_exists,				\
  flags: 0,								\
}
// A writable, hidden per-field quota vxattr; exists only when a quota is set.
#define XATTR_QUOTA_FIELD(_type, _name)					\
{									\
  name: CEPH_XATTR_NAME(_type, _name),					\
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,		\
  readonly: false,							\
  hidden: true,								\
  exists_cb: &Client::_vxattrcb_quota_exists,				\
  flags: 0,								\
}
11840
// Virtual xattrs exposed on directories.  Scanned linearly by
// _match_vxattr/_listxattr; the empty-name entry terminates the table.
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  XATTR_NAME_CEPH(dir, entries),
  XATTR_NAME_CEPH(dir, files),
  XATTR_NAME_CEPH(dir, subdirs),
  // Recursive stats need a fresh rstat from the MDS (VXATTR_RSTAT).
  XATTR_NAME_CEPH2(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH2(dir, rctime, VXATTR_RSTAT),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_quota_exists,
    flags: 0,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  {
    name: "ceph.dir.pin",
    getxattr_cb: &Client::_vxattrcb_dir_pin,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_dir_pin_exists,
    flags: 0,
  },
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    hidden: false,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
11891
// Virtual xattrs exposed on regular files.  Same terminator convention as
// _dir_vxattrs.
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    hidden: true,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    hidden: false,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
11916
11917const Client::VXattr *Client::_get_vxattrs(Inode *in)
11918{
11919 if (in->is_dir())
11920 return _dir_vxattrs;
11921 else if (in->is_file())
11922 return _file_vxattrs;
11923 return NULL;
11924}
11925
11926const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11927{
11928 if (strncmp(name, "ceph.", 5) == 0) {
11929 const VXattr *vxattr = _get_vxattrs(in);
11930 if (vxattr) {
11931 while (!vxattr->name.empty()) {
11932 if (vxattr->name == name)
11933 return vxattr;
11934 vxattr++;
11935 }
11936 }
11937 }
11938 return NULL;
11939}
11940
7c673cae
FG
11941int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
11942{
11fdf7f2 11943 std::lock_guard lock(client_lock);
7c673cae 11944
181888fb
FG
11945 if (unmounting)
11946 return -ENOTCONN;
11947
7c673cae
FG
11948 vinodeno_t vino = _get_vino(in);
11949
11950 ldout(cct, 3) << "ll_readlink " << vino << dendl;
11951 tout(cct) << "ll_readlink" << std::endl;
11952 tout(cct) << vino.ino.val << std::endl;
11953
11fdf7f2
TL
11954 for (auto dn : in->dentries) {
11955 touch_dn(dn);
7c673cae
FG
11956 }
11957
11958 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
11959 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
11960 return r;
11961}
11962
// Core mknod: create a device/special file named @p name under @p dir.
// Builds a CEPH_MDS_OP_MKNOD request (with any default-ACL xattrs attached)
// and, on success, returns the new inode via *inp.  Returns 0 or -errno.
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  // Invalidate/guard the parent's dentry caps around the create.
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Inherit the parent's default ACL (may also adjust `mode`); the resulting
  // ACL xattrs ride along as request data.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

  // On early failure the request was never submitted; drop our ref here
  // (make_request consumes it on the success path).
 fail:
  put_request(req);
  return res;
}
12016
12017int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
12018 dev_t rdev, struct stat *attr, Inode **out,
12019 const UserPerm& perms)
12020{
11fdf7f2 12021 std::lock_guard lock(client_lock);
7c673cae 12022
181888fb
FG
12023 if (unmounting)
12024 return -ENOTCONN;
12025
7c673cae
FG
12026 vinodeno_t vparent = _get_vino(parent);
12027
12028 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
12029 tout(cct) << "ll_mknod" << std::endl;
12030 tout(cct) << vparent.ino.val << std::endl;
12031 tout(cct) << name << std::endl;
12032 tout(cct) << mode << std::endl;
12033 tout(cct) << rdev << std::endl;
12034
11fdf7f2
TL
12035 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12036 "fuse_default_permissions");
12037 if (!fuse_default_permissions) {
7c673cae
FG
12038 int r = may_create(parent, perms);
12039 if (r < 0)
12040 return r;
12041 }
12042
12043 InodeRef in;
12044 int r = _mknod(parent, name, mode, rdev, perms, &in);
12045 if (r == 0) {
12046 fill_stat(in, attr);
12047 _ll_get(in.get());
12048 }
12049 tout(cct) << attr->st_ino << std::endl;
12050 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
12051 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12052 *out = in.get();
12053 return r;
12054}
12055
12056int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
12057 dev_t rdev, Inode **out,
12058 struct ceph_statx *stx, unsigned want, unsigned flags,
12059 const UserPerm& perms)
12060{
12061 unsigned caps = statx_to_mask(flags, want);
11fdf7f2 12062 std::lock_guard lock(client_lock);
7c673cae 12063
181888fb
FG
12064 if (unmounting)
12065 return -ENOTCONN;
12066
7c673cae
FG
12067 vinodeno_t vparent = _get_vino(parent);
12068
12069 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
12070 tout(cct) << "ll_mknodx" << std::endl;
12071 tout(cct) << vparent.ino.val << std::endl;
12072 tout(cct) << name << std::endl;
12073 tout(cct) << mode << std::endl;
12074 tout(cct) << rdev << std::endl;
12075
11fdf7f2
TL
12076 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12077 "fuse_default_permissions");
12078 if (!fuse_default_permissions) {
7c673cae
FG
12079 int r = may_create(parent, perms);
12080 if (r < 0)
12081 return r;
12082 }
12083
12084 InodeRef in;
12085 int r = _mknod(parent, name, mode, rdev, perms, &in);
12086 if (r == 0) {
12087 fill_statx(in, caps, stx);
12088 _ll_get(in.get());
12089 }
12090 tout(cct) << stx->stx_ino << std::endl;
12091 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
12092 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12093 *out = in.get();
12094 return r;
12095}
12096
// Internal create: issue CEPH_MDS_OP_CREATE (open with O_CREAT) for
// `name` under `dir`, with optional file layout (stripe_unit/count,
// object_size, data_pool). On success *inp references the new inode,
// *created reports whether the MDS actually created it, and if fhp is
// non-NULL an open file handle is also produced.
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms)
{
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;
  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  // use normalized (wire-format) flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);

  // Resolve the optional data pool name to an id; the wire field is
  // 32-bit, hence the -ERANGE check.
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -EINVAL;
    if (pool_id > 0xffffffffll)
      return -ERANGE;  // bummer!
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.open.flags = cflags | CEPH_O_CREAT;

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // _posix_acl_create may adjust `mode` and produce ACL xattrs to send
  // along with the request.
  mode |= S_IFREG;
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

  // Shared exit for both the success path and a failed make_request (the
  // request has already been submitted by then, so no put_request here).
 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

  // Early-exit path: request never submitted, drop our reference.
 fail:
  put_request(req);
  return res;
}
12191
12192
// Internal mkdir: issues CEPH_MDS_OP_MKDIR, or CEPH_MDS_OP_MKSNAP when
// `dir` is the special snapdir (creating a snapshot). On success *inp
// references the new directory inode.
int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
		   InodeRef *inp)
{
  ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", uid " << perm.uid()
		<< ", gid " << perm.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Writable targets are live directories and the snapdir (for mksnap);
  // anything inside a snapshot is read-only.
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }
  MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
				     CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // _posix_acl_create may adjust `mode` and produce ACL xattrs to send
  // along with the request.
  mode |= S_IFDIR;
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
  if (res < 0)
    goto fail;
  req->head.args.mkdir.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  ldout(cct, 10) << "_mkdir: making request" << dendl;
  res = make_request(req, perm, inp);
  ldout(cct, 10) << "_mkdir result is " << res << dendl;

  trim_cache();

  ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

  // Early-exit path: request never submitted, drop our reference.
 fail:
  put_request(req);
  return res;
}
12248
12249int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
12250 struct stat *attr, Inode **out, const UserPerm& perm)
12251{
11fdf7f2 12252 std::lock_guard lock(client_lock);
7c673cae 12253
181888fb
FG
12254 if (unmounting)
12255 return -ENOTCONN;
12256
7c673cae
FG
12257 vinodeno_t vparent = _get_vino(parent);
12258
12259 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
12260 tout(cct) << "ll_mkdir" << std::endl;
12261 tout(cct) << vparent.ino.val << std::endl;
12262 tout(cct) << name << std::endl;
12263 tout(cct) << mode << std::endl;
12264
11fdf7f2
TL
12265 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12266 "fuse_default_permissions");
12267 if (!fuse_default_permissions) {
7c673cae
FG
12268 int r = may_create(parent, perm);
12269 if (r < 0)
12270 return r;
12271 }
12272
12273 InodeRef in;
12274 int r = _mkdir(parent, name, mode, perm, &in);
12275 if (r == 0) {
12276 fill_stat(in, attr);
12277 _ll_get(in.get());
12278 }
12279 tout(cct) << attr->st_ino << std::endl;
12280 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
12281 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12282 *out = in.get();
12283 return r;
12284}
12285
12286int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
12287 struct ceph_statx *stx, unsigned want, unsigned flags,
12288 const UserPerm& perms)
12289{
11fdf7f2 12290 std::lock_guard lock(client_lock);
7c673cae 12291
181888fb
FG
12292 if (unmounting)
12293 return -ENOTCONN;
12294
7c673cae
FG
12295 vinodeno_t vparent = _get_vino(parent);
12296
12297 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
12298 tout(cct) << "ll_mkdirx" << std::endl;
12299 tout(cct) << vparent.ino.val << std::endl;
12300 tout(cct) << name << std::endl;
12301 tout(cct) << mode << std::endl;
12302
11fdf7f2
TL
12303 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12304 "fuse_default_permissions");
12305 if (!fuse_default_permissions) {
7c673cae
FG
12306 int r = may_create(parent, perms);
12307 if (r < 0)
12308 return r;
12309 }
12310
12311 InodeRef in;
12312 int r = _mkdir(parent, name, mode, perms, &in);
12313 if (r == 0) {
12314 fill_statx(in, statx_to_mask(flags, want), stx);
12315 _ll_get(in.get());
12316 } else {
12317 stx->stx_ino = 0;
12318 stx->stx_mask = 0;
12319 }
12320 tout(cct) << stx->stx_ino << std::endl;
12321 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
12322 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12323 *out = in.get();
12324 return r;
12325}
12326
// Internal symlink: issue CEPH_MDS_OP_SYMLINK creating `name` under `dir`
// pointing at `target`. On success *inp references the new symlink inode.
int Client::_symlink(Inode *dir, const char *name, const char *target,
		     const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
		<< ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
		<< dendl;

  if (strlen(name) > NAME_MAX)
    return -ENAMETOOLONG;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -EDQUOT;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->set_string2(target);  // link target travels in the string2 field
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();
  ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
    res << dendl;
  return res;

  // Early-exit path: request never submitted, drop our reference.
 fail:
  put_request(req);
  return res;
}
12372
12373int Client::ll_symlink(Inode *parent, const char *name, const char *value,
12374 struct stat *attr, Inode **out, const UserPerm& perms)
12375{
11fdf7f2 12376 std::lock_guard lock(client_lock);
7c673cae 12377
181888fb
FG
12378 if (unmounting)
12379 return -ENOTCONN;
12380
7c673cae
FG
12381 vinodeno_t vparent = _get_vino(parent);
12382
12383 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
12384 << dendl;
12385 tout(cct) << "ll_symlink" << std::endl;
12386 tout(cct) << vparent.ino.val << std::endl;
12387 tout(cct) << name << std::endl;
12388 tout(cct) << value << std::endl;
12389
11fdf7f2
TL
12390 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12391 "fuse_default_permissions");
12392 if (!fuse_default_permissions) {
7c673cae
FG
12393 int r = may_create(parent, perms);
12394 if (r < 0)
12395 return r;
12396 }
12397
12398 InodeRef in;
12399 int r = _symlink(parent, name, value, perms, &in);
12400 if (r == 0) {
12401 fill_stat(in, attr);
12402 _ll_get(in.get());
12403 }
12404 tout(cct) << attr->st_ino << std::endl;
12405 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
12406 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
12407 *out = in.get();
12408 return r;
12409}
12410
12411int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
12412 Inode **out, struct ceph_statx *stx, unsigned want,
12413 unsigned flags, const UserPerm& perms)
12414{
11fdf7f2 12415 std::lock_guard lock(client_lock);
7c673cae 12416
181888fb
FG
12417 if (unmounting)
12418 return -ENOTCONN;
12419
7c673cae
FG
12420 vinodeno_t vparent = _get_vino(parent);
12421
12422 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
12423 << dendl;
12424 tout(cct) << "ll_symlinkx" << std::endl;
12425 tout(cct) << vparent.ino.val << std::endl;
12426 tout(cct) << name << std::endl;
12427 tout(cct) << value << std::endl;
12428
11fdf7f2
TL
12429 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12430 "fuse_default_permissions");
12431 if (!fuse_default_permissions) {
7c673cae
FG
12432 int r = may_create(parent, perms);
12433 if (r < 0)
12434 return r;
12435 }
12436
12437 InodeRef in;
12438 int r = _symlink(parent, name, value, perms, &in);
12439 if (r == 0) {
12440 fill_statx(in, statx_to_mask(flags, want), stx);
12441 _ll_get(in.get());
12442 }
12443 tout(cct) << stx->stx_ino << std::endl;
12444 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
12445 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
12446 *out = in.get();
12447 return r;
12448}
12449
// Internal unlink: issue CEPH_MDS_OP_UNLINK for `name` under `dir`.
// Looks up the victim inode first so its delegations can be broken and
// its LINK caps dropped along with the request.
int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
{
  ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
		<< " uid " << perm.uid() << " gid " << perm.gid()
		<< ")" << dendl;

  // Snapshots are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);

  // Declared up front because the gotos below may not skip initializers.
  InodeRef otherin;
  Inode *in;
  Dentry *de;

  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Resolve the inode being unlinked; it rides in the request as the
  // "other" inode.
  res = _lookup(dir, name, 0, &otherin, perm);
  if (res < 0)
    goto fail;

  in = otherin.get();
  req->set_other_inode(in);
  in->break_all_delegs();
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  req->set_inode(dir);

  res = make_request(req, perm);

  trim_cache();
  ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
  return res;

  // Early-exit path: request never submitted, drop our reference.
 fail:
  put_request(req);
  return res;
}
12499
12500int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
12501{
11fdf7f2 12502 std::lock_guard lock(client_lock);
7c673cae 12503
181888fb
FG
12504 if (unmounting)
12505 return -ENOTCONN;
12506
7c673cae
FG
12507 vinodeno_t vino = _get_vino(in);
12508
12509 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
12510 tout(cct) << "ll_unlink" << std::endl;
12511 tout(cct) << vino.ino.val << std::endl;
12512 tout(cct) << name << std::endl;
12513
11fdf7f2
TL
12514 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12515 "fuse_default_permissions");
12516 if (!fuse_default_permissions) {
7c673cae
FG
12517 int r = may_delete(in, name, perm);
12518 if (r < 0)
12519 return r;
12520 }
12521 return _unlink(in, name, perm);
12522}
12523
// Internal rmdir: issues CEPH_MDS_OP_RMDIR, or CEPH_MDS_OP_RMSNAP when
// `dir` is the snapdir (removing a snapshot).
int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
{
  ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
		<< perms.uid() << " gid " << perms.gid() << ")" << dendl;

  // Writable targets are live directories and the snapdir (for rmsnap).
  if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
    return -EROFS;
  }

  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);

  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;
  req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;

  InodeRef in;

  Dentry *de;
  int res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  // For RMDIR the dentry is attached to the request; for RMSNAP we take
  // an explicit ref so we can unlink it from the cache ourselves below.
  if (op == CEPH_MDS_OP_RMDIR)
    req->set_dentry(de);
  else
    de->get();

  res = _lookup(dir, name, 0, &in, perms);
  if (res < 0)
    goto fail;

  // rmsnap replies carry no trace dentry, so drop the cached dentry
  // manually (balancing the de->get() above).
  if (op == CEPH_MDS_OP_RMSNAP) {
    unlink(de, true, true);
    de->put();
  }
  req->set_other_inode(in.get());

  res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
  return res;

  // Early-exit path: request never submitted, drop our reference.
 fail:
  put_request(req);
  return res;
}
12576
12577int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12578{
11fdf7f2 12579 std::lock_guard lock(client_lock);
7c673cae 12580
181888fb
FG
12581 if (unmounting)
12582 return -ENOTCONN;
12583
7c673cae
FG
12584 vinodeno_t vino = _get_vino(in);
12585
12586 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12587 tout(cct) << "ll_rmdir" << std::endl;
12588 tout(cct) << vino.ino.val << std::endl;
12589 tout(cct) << name << std::endl;
12590
11fdf7f2
TL
12591 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12592 "fuse_default_permissions");
12593 if (!fuse_default_permissions) {
7c673cae
FG
12594 int r = may_delete(in, name, perms);
12595 if (r < 0)
12596 return r;
12597 }
12598
12599 return _rmdir(in, name, perms);
12600}
12601
// Internal rename: issues CEPH_MDS_OP_RENAME, or CEPH_MDS_OP_RENAMESNAP
// when renaming a snapshot within the same snapdir. Cross-snapshot and
// cross-quota-root renames are rejected with -EXDEV.
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
{
  ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
		<< todir->ino << " " << toname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")"
		<< dendl;

  if (fromdir->snapid != todir->snapid)
    return -EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    // Only snapshot renames inside one snapdir are allowed; everything
    // else under a snapshot is read-only.
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -EROFS;
  }
  // Renaming across quota roots would make usage accounting ambiguous.
  if (fromdir != todir) {
    Inode *fromdir_root =
      fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
    Inode *todir_root =
      todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
    if (fromdir_root != todir_root) {
      return -EXDEV;
    }
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);

  Dentry *oldde;
  int res = get_or_create(fromdir, fromname, &oldde);
  if (res < 0)
    goto fail;
  Dentry *de;
  res = get_or_create(todir, toname, &de);
  if (res < 0)
    goto fail;

  if (op == CEPH_MDS_OP_RENAME) {
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    // Resolve the source inode so its delegations can be broken before
    // the MDS moves it.
    InodeRef oldin, otherin;
    res = _lookup(fromdir, fromname, 0, &oldin, perm);
    if (res < 0)
      goto fail;

    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    // The destination may or may not exist; -ENOENT is fine, anything
    // else is a real error.
    res = _lookup(todir, toname, 0, &otherin, perm);
    switch (res) {
    case 0:
      {
	Inode *in = otherin.get();
	req->set_other_inode(in);
	in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);

    req->set_inode(todir);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

  // Early-exit path: request never submitted, drop our reference.
 fail:
  put_request(req);
  return res;
}
12708
12709int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12710 const char *newname, const UserPerm& perm)
12711{
11fdf7f2 12712 std::lock_guard lock(client_lock);
7c673cae 12713
181888fb
FG
12714 if (unmounting)
12715 return -ENOTCONN;
12716
7c673cae
FG
12717 vinodeno_t vparent = _get_vino(parent);
12718 vinodeno_t vnewparent = _get_vino(newparent);
12719
12720 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12721 << vnewparent << " " << newname << dendl;
12722 tout(cct) << "ll_rename" << std::endl;
12723 tout(cct) << vparent.ino.val << std::endl;
12724 tout(cct) << name << std::endl;
12725 tout(cct) << vnewparent.ino.val << std::endl;
12726 tout(cct) << newname << std::endl;
12727
11fdf7f2
TL
12728 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12729 "fuse_default_permissions");
12730 if (!fuse_default_permissions) {
7c673cae
FG
12731 int r = may_delete(parent, name, perm);
12732 if (r < 0)
12733 return r;
12734 r = may_delete(newparent, newname, perm);
12735 if (r < 0 && r != -ENOENT)
12736 return r;
12737 }
12738
12739 return _rename(parent, name, newparent, newname, perm);
12740}
12741
// Internal hard link: issue CEPH_MDS_OP_LINK creating `newname` in `dir`
// pointing at existing inode `in`. Breaks delegations on the linked
// inode first. On success *inp references the (now multi-linked) inode.
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
{
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -ENAMETOOLONG;

  // Both the source inode and the target directory must be live.
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -EDQUOT;
  }

  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  // filepath = new name in dir, filepath2 = the existing inode.
  filepath path(newname, dir->ino);
  req->set_filepath(path);
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de;
  int res = get_or_create(dir, newname, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;

  // Early-exit path: request never submitted, drop our reference.
 fail:
  put_request(req);
  return res;
}
12786
12787int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
12788 const UserPerm& perm)
12789{
11fdf7f2 12790 std::lock_guard lock(client_lock);
7c673cae 12791
181888fb
FG
12792 if (unmounting)
12793 return -ENOTCONN;
12794
7c673cae
FG
12795 vinodeno_t vino = _get_vino(in);
12796 vinodeno_t vnewparent = _get_vino(newparent);
12797
31f18b77 12798 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
7c673cae
FG
12799 newname << dendl;
12800 tout(cct) << "ll_link" << std::endl;
12801 tout(cct) << vino.ino.val << std::endl;
12802 tout(cct) << vnewparent << std::endl;
12803 tout(cct) << newname << std::endl;
12804
7c673cae
FG
12805 InodeRef target;
12806
11fdf7f2
TL
12807 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12808 "fuse_default_permissions");
12809 if (!fuse_default_permissions) {
7c673cae
FG
12810 if (S_ISDIR(in->mode))
12811 return -EPERM;
12812
11fdf7f2 12813 int r = may_hardlink(in, perm);
7c673cae
FG
12814 if (r < 0)
12815 return r;
12816
12817 r = may_create(newparent, perm);
12818 if (r < 0)
12819 return r;
12820 }
12821
12822 return _link(in, newparent, newname, perm, &target);
12823}
12824
12825int Client::ll_num_osds(void)
12826{
11fdf7f2 12827 std::lock_guard lock(client_lock);
7c673cae
FG
12828 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
12829}
12830
12831int Client::ll_osdaddr(int osd, uint32_t *addr)
12832{
11fdf7f2 12833 std::lock_guard lock(client_lock);
181888fb 12834
7c673cae
FG
12835 entity_addr_t g;
12836 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12837 if (!o.exists(osd))
12838 return false;
11fdf7f2 12839 g = o.get_addrs(osd).front();
7c673cae
FG
12840 return true;
12841 });
12842 if (!exists)
12843 return -1;
12844 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12845 *addr = ntohl(nb_addr);
12846 return 0;
12847}
181888fb 12848
7c673cae
FG
12849uint32_t Client::ll_stripe_unit(Inode *in)
12850{
11fdf7f2 12851 std::lock_guard lock(client_lock);
7c673cae
FG
12852 return in->layout.stripe_unit;
12853}
12854
12855uint64_t Client::ll_snap_seq(Inode *in)
12856{
11fdf7f2 12857 std::lock_guard lock(client_lock);
7c673cae
FG
12858 return in->snaprealm->seq;
12859}
12860
12861int Client::ll_file_layout(Inode *in, file_layout_t *layout)
12862{
11fdf7f2 12863 std::lock_guard lock(client_lock);
7c673cae
FG
12864 *layout = in->layout;
12865 return 0;
12866}
12867
// Convenience overload: fetch the layout of the inode behind an open
// file handle.
int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
{
  return ll_file_layout(fh->inode.get(), layout);
}
12872
12873/* Currently we cannot take advantage of redundancy in reads, since we
12874 would have to go through all possible placement groups (a
12875 potentially quite large number determined by a hash), and use CRUSH
12876 to calculate the appropriate set of OSDs for each placement group,
12877 then index into that. An array with one entry per OSD is much more
12878 tractable and works for demonstration purposes. */
12879
12880int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
12881 file_layout_t* layout)
12882{
11fdf7f2 12883 std::lock_guard lock(client_lock);
181888fb 12884
28e407b8 12885 inodeno_t ino = in->ino;
7c673cae
FG
12886 uint32_t object_size = layout->object_size;
12887 uint32_t su = layout->stripe_unit;
12888 uint32_t stripe_count = layout->stripe_count;
12889 uint64_t stripes_per_object = object_size / su;
11fdf7f2 12890 uint64_t stripeno = 0, stripepos = 0;
7c673cae 12891
11fdf7f2
TL
12892 if(stripe_count) {
12893 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
12894 stripepos = blockno % stripe_count; // which object in the object set (X)
12895 }
7c673cae
FG
12896 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
12897 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
12898
12899 object_t oid = file_object_t(ino, objectno);
12900 return objecter->with_osdmap([&](const OSDMap& o) {
12901 ceph_object_layout olayout =
12902 o.file_to_object_layout(oid, *layout);
12903 pg_t pg = (pg_t)olayout.ol_pgid;
12904 vector<int> osds;
12905 int primary;
12906 o.pg_to_acting_osds(pg, &osds, &primary);
12907 return primary;
12908 });
12909}
12910
12911/* Return the offset of the block, internal to the object */
12912
12913uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
12914{
11fdf7f2 12915 std::lock_guard lock(client_lock);
7c673cae
FG
12916 file_layout_t *layout=&(in->layout);
12917 uint32_t object_size = layout->object_size;
12918 uint32_t su = layout->stripe_unit;
12919 uint64_t stripes_per_object = object_size / su;
12920
12921 return (blockno % stripes_per_object) * su;
12922}
12923
12924int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
12925 const UserPerm& perms)
12926{
11fdf7f2 12927 std::lock_guard lock(client_lock);
7c673cae 12928
181888fb
FG
12929 if (unmounting)
12930 return -ENOTCONN;
12931
7c673cae
FG
12932 vinodeno_t vino = _get_vino(in);
12933
12934 ldout(cct, 3) << "ll_opendir " << vino << dendl;
12935 tout(cct) << "ll_opendir" << std::endl;
12936 tout(cct) << vino.ino.val << std::endl;
12937
11fdf7f2
TL
12938 auto fuse_default_permissions = cct->_conf.get_val<bool>(
12939 "fuse_default_permissions");
12940 if (!fuse_default_permissions) {
7c673cae
FG
12941 int r = may_open(in, flags, perms);
12942 if (r < 0)
12943 return r;
12944 }
12945
12946 int r = _opendir(in, dirpp, perms);
12947 tout(cct) << (unsigned long)*dirpp << std::endl;
12948
12949 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
12950 << dendl;
12951 return r;
12952}
12953
12954int Client::ll_releasedir(dir_result_t *dirp)
12955{
11fdf7f2 12956 std::lock_guard lock(client_lock);
7c673cae
FG
12957 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
12958 tout(cct) << "ll_releasedir" << std::endl;
12959 tout(cct) << (unsigned long)dirp << std::endl;
181888fb
FG
12960
12961 if (unmounting)
12962 return -ENOTCONN;
12963
7c673cae
FG
12964 _closedir(dirp);
12965 return 0;
12966}
12967
12968int Client::ll_fsyncdir(dir_result_t *dirp)
12969{
11fdf7f2 12970 std::lock_guard lock(client_lock);
7c673cae
FG
12971 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
12972 tout(cct) << "ll_fsyncdir" << std::endl;
12973 tout(cct) << (unsigned long)dirp << std::endl;
12974
181888fb
FG
12975 if (unmounting)
12976 return -ENOTCONN;
12977
7c673cae
FG
12978 return _fsync(dirp->inode.get(), false);
12979}
12980
// Low-level open of an existing inode (creation must go through
// ll_create/_ll_create — O_CREAT is asserted away here). On success the
// handle is returned via *fhp (which may be NULL if the caller does not
// want one) and tracked in ll_unclosed_fh_set.
int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
{
  ceph_assert(!(flags & O_CREAT));

  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
  tout(cct) << "ll_open" << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  int r;
  // Only enforce permissions here when FUSE is not doing it for us.
  auto fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");
  if (!fuse_default_permissions) {
    r = may_open(in, flags, perms);
    if (r < 0)
      goto out;
  }

  r = _open(in, flags, 0, fhp /* may be NULL */, perms);

  // Shared exit: record any produced handle and trace the result even on
  // the permission-failure path.
 out:
  Fh *fhptr = fhp ? *fhp : NULL;
  if (fhptr) {
    ll_unclosed_fh_set.insert(fhptr);
  }
  tout(cct) << (unsigned long)fhptr << std::endl;
  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
    " = " << r << " (" << fhptr << ")" << dendl;
  return r;
}
13018
/*
 * Shared implementation behind ll_create/ll_createx: look the name up,
 * create it if needed (honoring O_CREAT/O_EXCL), and open it.
 *
 * On success *in holds the target inode and *fhp (if the open path ran)
 * the open file handle.  Caller must already hold client_lock.
 */
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		      int flags, InodeRef *in, int caps, Fh **fhp,
		      const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // Name already exists and the caller demanded exclusive creation.
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -EEXIST;

  if (r == -ENOENT && (flags & O_CREAT)) {
    // Only enforce client-side permission checks when the FUSE default
    // permission handling is disabled (kernel would do it otherwise).
    auto fuse_default_permissions = cct->_conf.get_val<bool>(
      "fuse_default_permissions");
    if (!fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    // _create may also open the file and fill *fhp.
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms);
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  ceph_assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // Pre-existing file: still need an open-permission check and an _open,
    // since _create did not run.
    auto fuse_default_permissions = cct->_conf.get_val<bool>(
      "fuse_default_permissions");
    if (!fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  ceph_assert(release_r == 0);  // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

out:
  // Track any handle we hand out so unmount can spot unclosed ll handles.
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  ino_t ino = 0;
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (unsigned long)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
13104
13105int Client::ll_create(Inode *parent, const char *name, mode_t mode,
13106 int flags, struct stat *attr, Inode **outp, Fh **fhp,
13107 const UserPerm& perms)
13108{
11fdf7f2 13109 std::lock_guard lock(client_lock);
7c673cae
FG
13110 InodeRef in;
13111
181888fb
FG
13112 if (unmounting)
13113 return -ENOTCONN;
13114
7c673cae
FG
13115 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
13116 fhp, perms);
13117 if (r >= 0) {
11fdf7f2 13118 ceph_assert(in);
7c673cae
FG
13119
13120 // passing an Inode in outp requires an additional ref
13121 if (outp) {
13122 _ll_get(in.get());
13123 *outp = in.get();
13124 }
13125 fill_stat(in, attr);
13126 } else {
13127 attr->st_ino = 0;
13128 }
13129
13130 return r;
13131}
13132
13133int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
13134 int oflags, Inode **outp, Fh **fhp,
13135 struct ceph_statx *stx, unsigned want, unsigned lflags,
13136 const UserPerm& perms)
13137{
13138 unsigned caps = statx_to_mask(lflags, want);
11fdf7f2 13139 std::lock_guard lock(client_lock);
7c673cae
FG
13140 InodeRef in;
13141
181888fb
FG
13142 if (unmounting)
13143 return -ENOTCONN;
7c673cae
FG
13144
13145 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
13146 if (r >= 0) {
11fdf7f2 13147 ceph_assert(in);
7c673cae
FG
13148
13149 // passing an Inode in outp requires an additional ref
13150 if (outp) {
13151 _ll_get(in.get());
13152 *outp = in.get();
13153 }
13154 fill_statx(in, caps, stx);
13155 } else {
13156 stx->stx_ino = 0;
13157 stx->stx_mask = 0;
13158 }
13159
13160 return r;
13161}
13162
13163loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
13164{
11fdf7f2 13165 std::lock_guard lock(client_lock);
7c673cae
FG
13166 tout(cct) << "ll_lseek" << std::endl;
13167 tout(cct) << offset << std::endl;
13168 tout(cct) << whence << std::endl;
13169
181888fb
FG
13170 if (unmounting)
13171 return -ENOTCONN;
13172
7c673cae
FG
13173 return _lseek(fh, offset, whence);
13174}
13175
13176int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
13177{
11fdf7f2 13178 std::lock_guard lock(client_lock);
7c673cae
FG
13179 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
13180 tout(cct) << "ll_read" << std::endl;
13181 tout(cct) << (unsigned long)fh << std::endl;
13182 tout(cct) << off << std::endl;
13183 tout(cct) << len << std::endl;
13184
181888fb
FG
13185 if (unmounting)
13186 return -ENOTCONN;
13187
11fdf7f2
TL
13188 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13189 len = std::min(len, (loff_t)INT_MAX);
7c673cae
FG
13190 return _read(fh, off, len, bl);
13191}
13192
/*
 * Read a raw object block directly from RADOS, bypassing the file cache.
 * The object name is derived from the inode number plus blockid.
 * Returns the number of bytes read on success, negative errno on failure.
 */
int Client::ll_read_block(Inode *in, uint64_t blockid,
			  char *buf,
			  uint64_t offset,
			  uint64_t length,
			  file_layout_t* layout)
{
  std::lock_guard lock(client_lock);

  if (unmounting)
    return -ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
		 object_locator_t(layout->pool_id),
		 offset,
		 length,
		 vino.snapid,
		 &bl,
		 CEPH_OSD_FLAG_READ,
		 &onfinish);

  // Drop the (lock_guard-held) client_lock while waiting on the OSD round
  // trip, then re-take it so the guard's unlock at scope exit stays balanced.
  client_lock.Unlock();
  int r = onfinish.wait();
  client_lock.Lock();

  if (r >= 0) {
    // Copy the returned data out to the caller's flat buffer.
    bl.copy(0, bl.length(), buf);
    r = bl.length();
  }

  return r;
}
13229
/* It appears that the OSD doesn't return success unless the entire
   buffer was written, return the write length on success. */

/*
 * Write a raw object block directly to RADOS, bypassing the file cache.
 * On success returns `length`; on failure a negative errno.
 * `snapseq` seeds the fake SnapContext used for the write.
 */
int Client::ll_write_block(Inode *in, uint64_t blockid,
			   char* buf, uint64_t offset,
			   uint64_t length, file_layout_t* layout,
			   uint64_t snapseq, uint32_t sync)
{
  vinodeno_t vino = ll_get_vino(in);
  int r = 0;
  std::unique_ptr<C_SaferCond> onsafe = nullptr;

  if (length == 0) {
    return -EINVAL;
  }
  // NOTE: deliberately always true — writes are currently always treated as
  // stable regardless of `sync`, so the epilogue always waits on the flock.
  if (true || sync) {
    /* if write is stable, the epilogue is waiting on
     * flock */
    onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
  }
  object_t oid = file_object_t(vino.ino, blockid);
  SnapContext fakesnap;
  ceph::bufferlist bl;
  if (length > 0) {
    bl.push_back(buffer::copy(buf, length));
  }

  ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
		<< dendl;

  fakesnap.seq = snapseq;

  /* lock just in time */
  client_lock.Lock();
  if (unmounting) {
    client_lock.Unlock();
    return -ENOTCONN;
  }

  objecter->write(oid,
		  object_locator_t(layout->pool_id),
		  offset,
		  length,
		  fakesnap,
		  bl,
		  ceph::real_clock::now(),
		  0,
		  onsafe.get());

  // Wait for the commit without holding client_lock.
  client_lock.Unlock();
  if (nullptr != onsafe) {
    r = onsafe->wait();
  }

  if (r < 0) {
    return r;
  } else {
    // See note above: OSD only succeeds on a full write.
    return length;
  }
}
13290
13291int Client::ll_commit_blocks(Inode *in,
13292 uint64_t offset,
13293 uint64_t length)
13294{
11fdf7f2 13295 std::lock_guard lock(client_lock);
7c673cae
FG
13296 /*
13297 BarrierContext *bctx;
b32b8144 13298 vinodeno_t vino = _get_vino(in);
7c673cae
FG
13299 uint64_t ino = vino.ino;
13300
13301 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
13302 << offset << " to " << length << dendl;
13303
13304 if (length == 0) {
13305 return -EINVAL;
13306 }
13307
13308 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
13309 if (p != barriers.end()) {
13310 barrier_interval civ(offset, offset + length);
13311 p->second->commit_barrier(civ);
13312 }
13313 */
13314 return 0;
13315}
13316
13317int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
13318{
11fdf7f2 13319 std::lock_guard lock(client_lock);
7c673cae
FG
13320 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
13321 "~" << len << dendl;
13322 tout(cct) << "ll_write" << std::endl;
13323 tout(cct) << (unsigned long)fh << std::endl;
13324 tout(cct) << off << std::endl;
13325 tout(cct) << len << std::endl;
13326
181888fb
FG
13327 if (unmounting)
13328 return -ENOTCONN;
13329
11fdf7f2
TL
13330 /* We can't return bytes written larger than INT_MAX, clamp len to that */
13331 len = std::min(len, (loff_t)INT_MAX);
7c673cae
FG
13332 int r = _write(fh, off, len, data, NULL, 0);
13333 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
13334 << dendl;
13335 return r;
13336}
13337
11fdf7f2
TL
13338int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13339{
13340 std::lock_guard lock(client_lock);
13341 if (unmounting)
13342 return -ENOTCONN;
13343 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
13344}
13345
13346int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
13347{
13348 std::lock_guard lock(client_lock);
13349 if (unmounting)
13350 return -ENOTCONN;
13351 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
13352}
13353
7c673cae
FG
13354int Client::ll_flush(Fh *fh)
13355{
11fdf7f2 13356 std::lock_guard lock(client_lock);
7c673cae
FG
13357 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
13358 tout(cct) << "ll_flush" << std::endl;
13359 tout(cct) << (unsigned long)fh << std::endl;
13360
181888fb
FG
13361 if (unmounting)
13362 return -ENOTCONN;
13363
7c673cae
FG
13364 return _flush(fh);
13365}
13366
13367int Client::ll_fsync(Fh *fh, bool syncdataonly)
13368{
11fdf7f2 13369 std::lock_guard lock(client_lock);
7c673cae
FG
13370 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
13371 tout(cct) << "ll_fsync" << std::endl;
13372 tout(cct) << (unsigned long)fh << std::endl;
13373
181888fb
FG
13374 if (unmounting)
13375 return -ENOTCONN;
13376
7c673cae
FG
13377 int r = _fsync(fh, syncdataonly);
13378 if (r) {
13379 // If we're returning an error, clear it from the FH
13380 fh->take_async_err();
13381 }
13382 return r;
13383}
13384
28e407b8
AA
13385int Client::ll_sync_inode(Inode *in, bool syncdataonly)
13386{
11fdf7f2 13387 std::lock_guard lock(client_lock);
28e407b8
AA
13388 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
13389 tout(cct) << "ll_sync_inode" << std::endl;
13390 tout(cct) << (unsigned long)in << std::endl;
13391
13392 if (unmounting)
13393 return -ENOTCONN;
13394
13395 return _fsync(in, syncdataonly);
13396}
13397
7c673cae
FG
#ifdef FALLOC_FL_PUNCH_HOLE

/*
 * Core fallocate implementation: supports plain preallocation,
 * FALLOC_FL_KEEP_SIZE, and FALLOC_FL_PUNCH_HOLE (which must be combined
 * with KEEP_SIZE).  Caller must hold client_lock; the function may drop
 * and re-take it while waiting on OSD operations.
 */
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  if (offset < 0 || length <= 0)
    return -EINVAL;

  // Only KEEP_SIZE and PUNCH_HOLE are understood.
  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -EOPNOTSUPP;

  // Punching a hole must not change the file size.
  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // Allocation on a full pool would block forever; hole punching frees space
  // so it is still allowed.
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -EBADF;

  // Quota check only applies when the call may grow the file.
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -EDQUOT;
  }

  int have;
  int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;
  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // Inline data and buffer cap held: punch the hole locally by splicing
      // zeroes into the inline blob.
      bufferlist bl;
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          in->inline_data.copy(0, offset, bl);
        int size = length;
        if (offset + size > len)
          size = len - offset;
        if (size > 0)
          bl.append_zero(size);
        if (offset + size < len)
          in->inline_data.copy(offset + size, len - offset - size, bl);
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      // Otherwise uninline first (if needed), then zero the range on the OSDs.
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
        uninline_data(in, onuninline.get());
      }

      C_SaferCond onfinish("Client::_punch_hole flock");

      unsafe_sync_write++;
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
		  in->snaprealm->get_snap_context(),
		  offset, length,
		  ceph::real_clock::now(),
		  0, true, &onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      // Wait for the zero op without holding client_lock.
      client_lock.Unlock();
      onfinish.wait();
      client_lock.Lock();
      _sync_write_commit(in);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // Plain preallocation past EOF extends the file size.
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
        check_caps(in, 0);
      }
    }
  }

  if (nullptr != onuninline) {
    // Reap the uninline operation started above.
    client_lock.Unlock();
    int ret = onuninline->wait();
    client_lock.Lock();

    // ECANCELED means someone else already uninlined it — also fine.
    if (ret >= 0 || ret == -ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
#else

/* Platform without FALLOC_FL_PUNCH_HOLE: fallocate is unsupported. */
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  return -EOPNOTSUPP;
}

#endif
13525
13526
11fdf7f2 13527int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
7c673cae 13528{
11fdf7f2
TL
13529 std::lock_guard lock(client_lock);
13530 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
13531 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
7c673cae
FG
13532 tout(cct) << (unsigned long)fh << std::endl;
13533
181888fb
FG
13534 if (unmounting)
13535 return -ENOTCONN;
13536
7c673cae
FG
13537 return _fallocate(fh, mode, offset, length);
13538}
13539
13540int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
13541{
11fdf7f2
TL
13542 std::lock_guard lock(client_lock);
13543 tout(cct) << __func__ << " " << " " << fd << mode << " " << offset << " " << length << std::endl;
7c673cae 13544
181888fb
FG
13545 if (unmounting)
13546 return -ENOTCONN;
13547
7c673cae
FG
13548 Fh *fh = get_filehandle(fd);
13549 if (!fh)
13550 return -EBADF;
13551#if defined(__linux__) && defined(O_PATH)
13552 if (fh->flags & O_PATH)
13553 return -EBADF;
13554#endif
13555 return _fallocate(fh, mode, offset, length);
13556}
13557
13558int Client::ll_release(Fh *fh)
13559{
11fdf7f2 13560 std::lock_guard lock(client_lock);
91327a77
AA
13561
13562 if (unmounting)
13563 return -ENOTCONN;
13564
11fdf7f2 13565 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
7c673cae 13566 dendl;
11fdf7f2 13567 tout(cct) << __func__ << " (fh)" << std::endl;
7c673cae
FG
13568 tout(cct) << (unsigned long)fh << std::endl;
13569
13570 if (ll_unclosed_fh_set.count(fh))
13571 ll_unclosed_fh_set.erase(fh);
13572 return _release_fh(fh);
13573}
13574
13575int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
13576{
11fdf7f2 13577 std::lock_guard lock(client_lock);
7c673cae
FG
13578
13579 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
13580 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
13581
181888fb
FG
13582 if (unmounting)
13583 return -ENOTCONN;
13584
7c673cae
FG
13585 return _getlk(fh, fl, owner);
13586}
13587
13588int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
13589{
11fdf7f2 13590 std::lock_guard lock(client_lock);
7c673cae 13591
11fdf7f2
TL
13592 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13593 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
7c673cae 13594
181888fb
FG
13595 if (unmounting)
13596 return -ENOTCONN;
13597
7c673cae
FG
13598 return _setlk(fh, fl, owner, sleep);
13599}
13600
13601int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13602{
11fdf7f2 13603 std::lock_guard lock(client_lock);
7c673cae 13604
11fdf7f2
TL
13605 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
13606 tout(cct) << __func__ << " (fh)" << (unsigned long)fh << std::endl;
7c673cae 13607
181888fb
FG
13608 if (unmounting)
13609 return -ENOTCONN;
13610
7c673cae
FG
13611 return _flock(fh, cmd, owner);
13612}
13613
b32b8144
FG
13614int Client::set_deleg_timeout(uint32_t timeout)
13615{
11fdf7f2 13616 std::lock_guard lock(client_lock);
b32b8144
FG
13617
13618 /*
13619 * The whole point is to prevent blacklisting so we must time out the
13620 * delegation before the session autoclose timeout kicks in.
13621 */
13622 if (timeout >= mdsmap->get_session_autoclose())
13623 return -EINVAL;
13624
13625 deleg_timeout = timeout;
13626 return 0;
13627}
13628
13629int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
13630{
13631 int ret = -EINVAL;
13632
11fdf7f2 13633 std::lock_guard lock(client_lock);
b32b8144
FG
13634
13635 if (!mounted)
13636 return -ENOTCONN;
13637
13638 Inode *inode = fh->inode.get();
13639
13640 switch(cmd) {
13641 case CEPH_DELEGATION_NONE:
13642 inode->unset_deleg(fh);
13643 ret = 0;
13644 break;
13645 default:
13646 try {
13647 ret = inode->set_deleg(fh, cmd, cb, priv);
11fdf7f2 13648 } catch (std::bad_alloc&) {
b32b8144
FG
13649 ret = -ENOMEM;
13650 }
13651 break;
13652 }
13653 return ret;
13654}
13655
7c673cae
FG
13656class C_Client_RequestInterrupt : public Context {
13657private:
13658 Client *client;
13659 MetaRequest *req;
13660public:
13661 C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
13662 req->get();
13663 }
13664 void finish(int r) override {
11fdf7f2
TL
13665 std::lock_guard l(client->client_lock);
13666 ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
7c673cae
FG
13667 client->_interrupt_filelock(req);
13668 client->put_request(req);
13669 }
13670};
13671
13672void Client::ll_interrupt(void *d)
13673{
13674 MetaRequest *req = static_cast<MetaRequest*>(d);
11fdf7f2
TL
13675 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
13676 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
7c673cae
FG
13677 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
13678}
13679
13680// =========================================
13681// layout
13682
13683// expose file layouts
13684
13685int Client::describe_layout(const char *relpath, file_layout_t *lp,
13686 const UserPerm& perms)
13687{
11fdf7f2 13688 std::lock_guard lock(client_lock);
7c673cae 13689
181888fb
FG
13690 if (unmounting)
13691 return -ENOTCONN;
13692
7c673cae
FG
13693 filepath path(relpath);
13694 InodeRef in;
13695 int r = path_walk(path, &in, perms);
13696 if (r < 0)
13697 return r;
13698
13699 *lp = in->layout;
13700
11fdf7f2 13701 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
7c673cae
FG
13702 return 0;
13703}
13704
13705int Client::fdescribe_layout(int fd, file_layout_t *lp)
13706{
11fdf7f2 13707 std::lock_guard lock(client_lock);
7c673cae 13708
181888fb
FG
13709 if (unmounting)
13710 return -ENOTCONN;
13711
7c673cae
FG
13712 Fh *f = get_filehandle(fd);
13713 if (!f)
13714 return -EBADF;
13715 Inode *in = f->inode.get();
13716
13717 *lp = in->layout;
13718
11fdf7f2 13719 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
7c673cae
FG
13720 return 0;
13721}
13722
d2e6a577
FG
13723int64_t Client::get_default_pool_id()
13724{
11fdf7f2 13725 std::lock_guard lock(client_lock);
181888fb
FG
13726
13727 if (unmounting)
13728 return -ENOTCONN;
13729
d2e6a577
FG
13730 /* first data pool is the default */
13731 return mdsmap->get_first_data_pool();
13732}
7c673cae
FG
13733
13734// expose osdmap
13735
13736int64_t Client::get_pool_id(const char *pool_name)
13737{
11fdf7f2 13738 std::lock_guard lock(client_lock);
181888fb
FG
13739
13740 if (unmounting)
13741 return -ENOTCONN;
13742
7c673cae
FG
13743 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
13744 pool_name);
13745}
13746
13747string Client::get_pool_name(int64_t pool)
13748{
11fdf7f2 13749 std::lock_guard lock(client_lock);
181888fb
FG
13750
13751 if (unmounting)
13752 return string();
13753
7c673cae
FG
13754 return objecter->with_osdmap([pool](const OSDMap& o) {
13755 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13756 });
13757}
13758
13759int Client::get_pool_replication(int64_t pool)
13760{
11fdf7f2 13761 std::lock_guard lock(client_lock);
181888fb
FG
13762
13763 if (unmounting)
13764 return -ENOTCONN;
13765
7c673cae
FG
13766 return objecter->with_osdmap([pool](const OSDMap& o) {
13767 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13768 });
13769}
13770
13771int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
13772{
11fdf7f2 13773 std::lock_guard lock(client_lock);
7c673cae 13774
181888fb
FG
13775 if (unmounting)
13776 return -ENOTCONN;
13777
7c673cae
FG
13778 Fh *f = get_filehandle(fd);
13779 if (!f)
13780 return -EBADF;
13781 Inode *in = f->inode.get();
13782
13783 vector<ObjectExtent> extents;
13784 Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
11fdf7f2 13785 ceph_assert(extents.size() == 1);
7c673cae
FG
13786
13787 objecter->with_osdmap([&](const OSDMap& o) {
13788 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13789 o.pg_to_acting_osds(pg, osds);
13790 });
13791
13792 if (osds.empty())
13793 return -EINVAL;
13794
13795 /*
13796 * Return the remainder of the extent (stripe unit)
13797 *
13798 * If length = 1 is passed to Striper::file_to_extents we get a single
13799 * extent back, but its length is one so we still need to compute the length
13800 * to the end of the stripe unit.
13801 *
13802 * If length = su then we may get 1 or 2 objects back in the extents vector
13803 * which would have to be examined. Even then, the offsets are local to the
13804 * object, so matching up to the file offset is extra work.
13805 *
13806 * It seems simpler to stick with length = 1 and manually compute the
13807 * remainder.
13808 */
13809 if (len) {
13810 uint64_t su = in->layout.stripe_unit;
13811 *len = su - (off % su);
13812 }
13813
13814 return 0;
13815}
13816
13817int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
13818{
11fdf7f2 13819 std::lock_guard lock(client_lock);
181888fb
FG
13820
13821 if (unmounting)
13822 return -ENOTCONN;
13823
7c673cae
FG
13824 if (id < 0)
13825 return -EINVAL;
13826 return objecter->with_osdmap([&](const OSDMap& o) {
13827 return o.crush->get_full_location_ordered(id, path);
13828 });
13829}
13830
13831int Client::get_file_stripe_address(int fd, loff_t offset,
13832 vector<entity_addr_t>& address)
13833{
11fdf7f2 13834 std::lock_guard lock(client_lock);
7c673cae 13835
181888fb
FG
13836 if (unmounting)
13837 return -ENOTCONN;
13838
7c673cae
FG
13839 Fh *f = get_filehandle(fd);
13840 if (!f)
13841 return -EBADF;
13842 Inode *in = f->inode.get();
13843
13844 // which object?
13845 vector<ObjectExtent> extents;
13846 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
13847 in->truncate_size, extents);
11fdf7f2 13848 ceph_assert(extents.size() == 1);
7c673cae
FG
13849
13850 // now we have the object and its 'layout'
13851 return objecter->with_osdmap([&](const OSDMap& o) {
13852 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13853 vector<int> osds;
13854 o.pg_to_acting_osds(pg, osds);
13855 if (osds.empty())
13856 return -EINVAL;
13857 for (unsigned i = 0; i < osds.size(); i++) {
11fdf7f2 13858 entity_addr_t addr = o.get_addrs(osds[i]).front();
7c673cae
FG
13859 address.push_back(addr);
13860 }
13861 return 0;
13862 });
13863}
13864
13865int Client::get_osd_addr(int osd, entity_addr_t& addr)
13866{
11fdf7f2 13867 std::lock_guard lock(client_lock);
181888fb
FG
13868
13869 if (unmounting)
13870 return -ENOTCONN;
13871
7c673cae
FG
13872 return objecter->with_osdmap([&](const OSDMap& o) {
13873 if (!o.exists(osd))
13874 return -ENOENT;
13875
11fdf7f2 13876 addr = o.get_addrs(osd).front();
7c673cae
FG
13877 return 0;
13878 });
13879}
13880
13881int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
13882 loff_t length, loff_t offset)
13883{
11fdf7f2 13884 std::lock_guard lock(client_lock);
7c673cae 13885
181888fb
FG
13886 if (unmounting)
13887 return -ENOTCONN;
13888
7c673cae
FG
13889 Fh *f = get_filehandle(fd);
13890 if (!f)
13891 return -EBADF;
13892 Inode *in = f->inode.get();
13893
13894 // map to a list of extents
13895 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
13896
11fdf7f2 13897 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
7c673cae
FG
13898 return 0;
13899}
13900
13901
b32b8144 13902/* find an osd with the same ip. -ENXIO if none. */
7c673cae
FG
13903int Client::get_local_osd()
13904{
11fdf7f2 13905 std::lock_guard lock(client_lock);
181888fb
FG
13906
13907 if (unmounting)
13908 return -ENOTCONN;
13909
7c673cae
FG
13910 objecter->with_osdmap([this](const OSDMap& o) {
13911 if (o.get_epoch() != local_osd_epoch) {
11fdf7f2 13912 local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
7c673cae
FG
13913 local_osd_epoch = o.get_epoch();
13914 }
13915 });
13916 return local_osd;
13917}
13918
13919
13920
13921
13922
13923
13924// ===============================
13925
13926void Client::ms_handle_connect(Connection *con)
13927{
11fdf7f2 13928 ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
13929}
13930
13931bool Client::ms_handle_reset(Connection *con)
13932{
11fdf7f2 13933 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
13934 return false;
13935}
13936
/*
 * Messenger callback: the peer reset the session from its side.
 * For MDS connections, advance the matching MetaSession's state machine;
 * other peer types are ignored.
 */
void Client::ms_handle_remote_reset(Connection *con)
{
  ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
  std::lock_guard l(client_lock);
  switch (con->get_peer_type()) {
  case CEPH_ENTITY_TYPE_MDS:
    {
      // kludge to figure out which mds this is; fixme with a Connection* state
      mds_rank_t mds = MDS_RANK_NONE;
      MetaSession *s = NULL;
      for (auto &p : mds_sessions) {
	if (mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
	  mds = p.first;
	  s = &p.second;
	}
      }
      // mds stays MDS_RANK_NONE (<0) when no session matched the peer addr.
      if (mds >= 0) {
	assert (s != NULL);
	switch (s->state) {
	case MetaSession::STATE_CLOSING:
	  // We were closing anyway; treat the remote reset as completion.
	  ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
	  _closed_mds_session(s);
	  break;

	case MetaSession::STATE_OPENING:
	  {
	    // Restart the open: carry the waiters over to a fresh session.
	    ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
	    list<Context*> waiters;
	    waiters.swap(s->waiting_for_open);
	    _closed_mds_session(s);
	    MetaSession *news = _get_or_open_mds_session(mds);
	    news->waiting_for_open.swap(waiters);
	  }
	  break;

	case MetaSession::STATE_OPEN:
	  {
	    objecter->maybe_request_map(); /* to check if we are blacklisted */
	    const auto& conf = cct->_conf;
	    if (conf->client_reconnect_stale) {
	      // Close now so a later request triggers a full reconnect.
	      ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
	      _closed_mds_session(s);
	    } else {
	      // Mark stale and let the normal stale-session handling recover.
	      ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
	      s->state = MetaSession::STATE_STALE;
	    }
	  }
	  break;

	case MetaSession::STATE_NEW:
	case MetaSession::STATE_CLOSED:
	default:
	  // Nothing to do for sessions that never fully opened.
	  break;
	}
      }
    }
    break;
  }
}
13996
13997bool Client::ms_handle_refused(Connection *con)
13998{
11fdf7f2 13999 ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
14000 return false;
14001}
14002
11fdf7f2 14003bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer)
7c673cae
FG
14004{
14005 if (dest_type == CEPH_ENTITY_TYPE_MON)
14006 return true;
14007 *authorizer = monclient->build_authorizer(dest_type);
14008 return true;
14009}
14010
14011Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
14012{
11fdf7f2
TL
14013 Inode *quota_in = root_ancestor;
14014 SnapRealm *realm = in->snaprealm;
14015 while (realm) {
14016 ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
14017 if (realm->ino != in->ino) {
14018 auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
14019 if (p == inode_map.end())
14020 break;
7c673cae 14021
11fdf7f2
TL
14022 if (p->second->quota.is_enable()) {
14023 quota_in = p->second;
14024 break;
7c673cae 14025 }
7c673cae 14026 }
11fdf7f2 14027 realm = realm->pparent;
7c673cae 14028 }
11fdf7f2
TL
14029 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
14030 return quota_in;
7c673cae
FG
14031}
14032
14033/**
14034 * Traverse quota ancestors of the Inode, return true
14035 * if any of them passes the passed function
14036 */
14037bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
14038 std::function<bool (const Inode &in)> test)
14039{
14040 while (true) {
11fdf7f2 14041 ceph_assert(in != NULL);
7c673cae
FG
14042 if (test(*in)) {
14043 return true;
14044 }
14045
14046 if (in == root_ancestor) {
14047 // We're done traversing, drop out
14048 return false;
14049 } else {
14050 // Continue up the tree
14051 in = get_quota_root(in, perms);
14052 }
14053 }
14054
14055 return false;
14056}
14057
14058bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
14059{
14060 return check_quota_condition(in, perms,
14061 [](const Inode &in) {
14062 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
14063 });
14064}
14065
14066bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
11fdf7f2 14067 const UserPerm& perms)
7c673cae
FG
14068{
14069 return check_quota_condition(in, perms,
11fdf7f2 14070 [&new_bytes](const Inode &in) {
7c673cae
FG
14071 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
14072 > in.quota.max_bytes;
14073 });
14074}
14075
11fdf7f2 14076bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
7c673cae 14077{
11fdf7f2
TL
14078 return check_quota_condition(in, perms,
14079 [](const Inode &in) {
14080 if (in.quota.max_bytes) {
14081 if (in.rstat.rbytes >= in.quota.max_bytes) {
14082 return true;
14083 }
14084
14085 ceph_assert(in.size >= in.reported_size);
14086 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
14087 const uint64_t size = in.size - in.reported_size;
14088 return (space >> 4) < size;
14089 } else {
14090 return false;
14091 }
14092 });
7c673cae
FG
14093}
14094
// States/bits for the per-(pool, namespace) permission cache
// (pool_perms), maintained by check_pool_perm() below.
enum {
  POOL_CHECKED = 1,   // a probe completed; the READ/WRITE bits are valid
  POOL_CHECKING = 2,  // a probe is in flight; other callers must wait
  POOL_READ = 4,      // client may read from the pool
  POOL_WRITE = 8,     // client may write to the pool
};
14101
/*
 * Verify that this client has the access described by @need (a cap mask,
 * CEPH_CAP_FILE_RD/WR) on the data pool backing @in.  If the pool has not
 * been probed yet, issue a dummy stat (read probe) and exclusive create
 * (write probe) on the inode's first object and cache the outcome per
 * (pool id, namespace) in pool_perms.
 *
 * Returns 0 if allowed, -EPERM if the pool denies the needed access, or
 * -EIO if the probe failed for an unexpected reason.  May drop and
 * re-take client_lock while waiting for the probe ops to complete.
 */
int Client::check_pool_perm(Inode *in, int need)
{
  if (!cct->_conf->client_check_pool_perm)
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have alread been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // mark the probe as in progress so concurrent callers wait on
    // waiting_for_pool_perm instead of issuing duplicate ops
    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // read probe: a plain stat on the first object
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // write probe: an exclusive create (-EEXIST still proves writability)
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // drop the client lock while both probes are in flight
    client_lock.Unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.Lock();

    bool errored = false;

    if (rd_ret == 0 || rd_ret == -ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -EIO;
    }

    // cache the result and wake anyone who was waiting on the probe
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -EPERM;
  }

  return 0;
}
14204
14205int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
14206{
14207 if (acl_type == POSIX_ACL) {
14208 if (in->xattrs.count(ACL_EA_ACCESS)) {
14209 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
14210
14211 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
14212 }
14213 }
14214 return -EAGAIN;
14215}
14216
14217int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
14218{
14219 if (acl_type == NO_ACL)
14220 return 0;
14221
14222 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
14223 if (r < 0)
14224 goto out;
14225
14226 if (acl_type == POSIX_ACL) {
14227 if (in->xattrs.count(ACL_EA_ACCESS)) {
14228 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
14229 bufferptr acl(access_acl.c_str(), access_acl.length());
14230 r = posix_acl_access_chmod(acl, mode);
14231 if (r < 0)
14232 goto out;
14233 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
14234 } else {
14235 r = 0;
14236 }
14237 }
14238out:
14239 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
14240 return r;
14241}
14242
/*
 * Compute the mode and initial ACL xattrs for a new inode being created
 * under @dir.  On success returns the number of xattrs encoded into
 * @xattrs_bl (0 if none); *mode may be adjusted by the inherited default
 * ACL or by the umask callback.  Negative return is an error.
 */
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // symlinks carry no ACLs and are not umask-filtered
  if (S_ISLNK(*mode))
    return 0;

  // refresh the parent's xattrs so its default ACL (if any) is current
  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      // work on a private copy of the parent's default ACL
      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// inherited ACL is non-trivial; if it cannot be fully expressed
	// by mode bits, it becomes the child's access ACL
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // directories also propagate the default ACL to their children
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	encode(xattrs, xattrs_bl);
    } else {
      // no default ACL on the parent: apply the process umask instead
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
14290
// Enable an objecter-wide op flag.  Only CEPH_OSD_FLAG_LOCALIZE_READS
// (or 0, which is a no-op) is permitted.
void Client::set_filer_flags(int flags)
{
  std::lock_guard l(client_lock);
  ceph_assert(flags == 0 ||
	      flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->add_global_op_flags(flags);
}
14298
// Disable an objecter-wide op flag previously enabled via
// set_filer_flags().  Only CEPH_OSD_FLAG_LOCALIZE_READS is permitted.
void Client::clear_filer_flags(int flags)
{
  std::lock_guard l(client_lock);
  ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
  objecter->clear_global_op_flag(flags);
}
14305
11fdf7f2
TL
14306// called before mount
14307void Client::set_uuid(const std::string& uuid)
14308{
14309 std::lock_guard l(client_lock);
14310 assert(initialized);
14311 assert(!uuid.empty());
14312
14313 metadata["uuid"] = uuid;
14314 _close_sessions();
14315}
14316
14317// called before mount. 0 means infinite
14318void Client::set_session_timeout(unsigned timeout)
14319{
14320 std::lock_guard l(client_lock);
14321 assert(initialized);
14322
14323 metadata["timeout"] = stringify(timeout);
14324}
14325
/*
 * Called before mount.  Reclaim the state of a dead client instance
 * identified by @uuid (e.g. after an NFS-Ganesha failover): walk every
 * in-map MDS rank, ask each to reclaim the old session, then verify the
 * old instance is really gone before taking over its uuid.
 *
 * Returns 0 on success; -ENOTCONN/-EINVAL on bad state or arguments,
 * -EPERM if an MDS rejected this client, -EOPNOTSUPP if an MDS lacks
 * reclaim support, -ENOENT/-ENOTRECOVERABLE when the target session
 * cannot be found or cannot be safely reclaimed.
 */
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  std::lock_guard l(client_lock);
  if (!initialized)
    return -ENOTCONN;

  if (uuid.empty())
    return -EINVAL;

  {
    // refuse to reclaim our own uuid
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  // need a populated mdsmap before iterating ranks
  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  reclaim_errno = 0;
  // note: the loop only advances to the next rank once this rank's
  // reclaim has completed (see the final else branch)
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSession *session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state != MetaSession::STATE_OPENING) {
	// umounting?
	return -EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      if (rejected_by_mds.count(mds))
	return -EPERM;
      continue;
    }

    session = &mds_sessions.at(mds);
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = MClientReclaim::create(uuid, flags);
      session->con->send_message2(std::move(m));
      // handle_client_reclaim_reply() updates reclaim_state and wakes us
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      return reclaim_errno ? : -ENOTRECOVERABLE;
    } else {
      mds++;
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -ENOENT;
    return -ENOTRECOVERABLE;
  }

  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blacklist to check if target session was killed
  // (config option mds_session_blacklist_on_evict needs to be true)
  C_SaferCond cond;
  if (!objecter->wait_for_map(reclaim_osd_epoch, &cond)) {
    ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
    // drop the client lock while waiting for the OSD map to catch up
    client_lock.Unlock();
    cond.wait();
    client_lock.Lock();
  }

  bool blacklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blacklisted(reclaim_target_addrs);
      });
  if (blacklisted)
    return -ENOTRECOVERABLE;

  // remember the uuid being reclaimed; finish_reclaim() adopts it
  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
14424
14425void Client::finish_reclaim()
14426{
14427 auto it = metadata.find("reclaiming_uuid");
14428 if (it == metadata.end()) {
14429 for (auto &p : mds_sessions)
14430 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
14431 return;
14432 }
14433
14434 for (auto &p : mds_sessions) {
14435 p.second.reclaim_state = MetaSession::RECLAIM_NULL;
14436 auto m = MClientReclaim::create("", MClientReclaim::FLAG_FINISH);
14437 p.second.con->send_message2(std::move(m));
14438 }
14439
14440 metadata["uuid"] = it->second;
14441 metadata.erase(it);
14442}
14443
14444void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
14445{
14446 mds_rank_t from = mds_rank_t(reply->get_source().num());
14447 ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;
14448
14449 MetaSession *session = _get_mds_session(from, reply->get_connection().get());
14450 if (!session) {
14451 ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
14452 return;
14453 }
14454
14455 if (reply->get_result() >= 0) {
14456 session->reclaim_state = MetaSession::RECLAIM_OK;
14457 if (reply->get_epoch() > reclaim_osd_epoch)
14458 reclaim_osd_epoch = reply->get_epoch();
14459 if (!reply->get_addrs().empty())
14460 reclaim_target_addrs = reply->get_addrs();
14461 } else {
14462 session->reclaim_state = MetaSession::RECLAIM_FAIL;
14463 reclaim_errno = reply->get_result();
14464 }
14465
14466 signal_cond_list(waiting_for_reclaim);
14467}
14468
7c673cae
FG
/**
 * This is included in cap release messages, to cause
 * the MDS to wait until this OSD map epoch. It is necessary
 * in corner cases where we cancel RADOS ops, so that
 * nobody else tries to do IO to the same objects in
 * the same epoch as the cancelled ops.
 */
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  cap_epoch_barrier = e;
}
14481
// md_config_obs_t interface: the config keys we want change
// notifications for (NULL-terminated).
// NOTE(review): only client_cache_mid and client_acl_type are acted on
// in handle_conf_change() below; the remaining keys are presumably read
// directly from the config elsewhere — confirm before pruning the list.
const char** Client::get_tracked_conf_keys() const
{
  static const char* keys[] = {
    "client_cache_size",
    "client_cache_mid",
    "client_acl_type",
    "client_deleg_timeout",
    "client_deleg_break_on_open",
    NULL
  };
  return keys;
}
14494
11fdf7f2 14495void Client::handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
14496 const std::set <std::string> &changed)
14497{
11fdf7f2 14498 std::lock_guard lock(client_lock);
7c673cae 14499
181888fb 14500 if (changed.count("client_cache_mid")) {
7c673cae
FG
14501 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
14502 }
14503 if (changed.count("client_acl_type")) {
14504 acl_type = NO_ACL;
14505 if (cct->_conf->client_acl_type == "posix_acl")
14506 acl_type = POSIX_ACL;
14507 }
14508}
14509
7c673cae
FG
// boost::intrusive_ptr support: take a reference on an Inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->get();
}
14514
// boost::intrusive_ptr support: drop a reference on an Inode via the
// owning Client's put_inode().
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
14519
14520mds_rank_t Client::_get_random_up_mds() const
14521{
11fdf7f2 14522 ceph_assert(client_lock.is_locked_by_me());
7c673cae
FG
14523
14524 std::set<mds_rank_t> up;
14525 mdsmap->get_up_mds_set(up);
14526
14527 if (up.empty())
14528 return MDS_RANK_NONE;
14529 std::set<mds_rank_t>::const_iterator p = up.begin();
14530 for (int n = rand() % up.size(); n; n--)
14531 ++p;
14532 return *p;
14533}
14534
14535
// A Client that owns its own Objecter (rather than sharing one supplied
// by the embedding application).
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
14542
StandaloneClient::~StandaloneClient()
{
  // the Objecter was allocated by our constructor, so delete it here
  delete objecter;
  objecter = nullptr;
}
14548
/*
 * Bring up a standalone client: start the timer, object cacher and
 * objecter, wire the dispatchers, and initialize the monitor client.
 * Returns 0 on success or the negative error from MonClient::init(),
 * after undoing the partial initialization.
 */
int StandaloneClient::init()
{
  timer.init();
  objectcacher->start();
  objecter->init();

  client_lock.Lock();
  ceph_assert(!is_initialized());

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    timer.shutdown();
    client_lock.Unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.Unlock();
  _finish_init();

  return 0;
}
14579
void StandaloneClient::shutdown()
{
  // shut down the generic client first, then the services we own
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}