]> git.proxmox.com Git - ceph.git/blame - ceph/src/client/Client.cc
import ceph quincy 17.2.1
[ceph.git] / ceph / src / client / Client.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16// unix-ey fs stuff
17#include <unistd.h>
18#include <sys/types.h>
19#include <time.h>
20#include <utime.h>
11fdf7f2 21#include <string.h>
7c673cae
FG
22#include <sys/stat.h>
23#include <sys/param.h>
24#include <fcntl.h>
25#include <sys/file.h>
f67539c2 26#ifndef _WIN32
7c673cae 27#include <sys/utsname.h>
f67539c2 28#endif
7c673cae
FG
29#include <sys/uio.h>
30
31#include <boost/lexical_cast.hpp>
32#include <boost/fusion/include/std_pair.hpp>
33
f67539c2
TL
34#include "common/async/waiter.h"
35
36#if defined(__FreeBSD__) || defined(_WIN32)
7c673cae
FG
37#define XATTR_CREATE 0x1
38#define XATTR_REPLACE 0x2
39#else
40#include <sys/xattr.h>
41#endif
42
43#if defined(__linux__)
44#include <linux/falloc.h>
45#endif
46
47#include <sys/statvfs.h>
48
49#include "common/config.h"
50#include "common/version.h"
f67539c2 51#include "common/async/blocked_completion.h"
7c673cae 52
11fdf7f2
TL
53#include "mon/MonClient.h"
54
55#include "messages/MClientCaps.h"
56#include "messages/MClientLease.h"
57#include "messages/MClientQuota.h"
58#include "messages/MClientReclaim.h"
59#include "messages/MClientReclaimReply.h"
7c673cae 60#include "messages/MClientReconnect.h"
11fdf7f2 61#include "messages/MClientReply.h"
7c673cae
FG
62#include "messages/MClientRequest.h"
63#include "messages/MClientRequestForward.h"
11fdf7f2 64#include "messages/MClientSession.h"
7c673cae 65#include "messages/MClientSnap.h"
f67539c2 66#include "messages/MClientMetrics.h"
7c673cae 67#include "messages/MCommandReply.h"
7c673cae
FG
68#include "messages/MFSMap.h"
69#include "messages/MFSMapUser.h"
11fdf7f2
TL
70#include "messages/MMDSMap.h"
71#include "messages/MOSDMap.h"
7c673cae
FG
72
73#include "mds/flock.h"
11fdf7f2 74#include "mds/cephfs_features.h"
7c673cae
FG
75#include "osd/OSDMap.h"
76#include "osdc/Filer.h"
77
78#include "common/Cond.h"
7c673cae
FG
79#include "common/perf_counters.h"
80#include "common/admin_socket.h"
81#include "common/errno.h"
82#include "include/str_list.h"
83
84#define dout_subsys ceph_subsys_client
85
86#include "include/lru.h"
87#include "include/compat.h"
88#include "include/stringify.h"
f67539c2 89#include "include/random.h"
7c673cae
FG
90
91#include "Client.h"
92#include "Inode.h"
93#include "Dentry.h"
b32b8144 94#include "Delegation.h"
7c673cae
FG
95#include "Dir.h"
96#include "ClientSnapRealm.h"
97#include "Fh.h"
98#include "MetaSession.h"
99#include "MetaRequest.h"
100#include "ObjecterWriteback.h"
101#include "posix_acl.h"
102
11fdf7f2 103#include "include/ceph_assert.h"
7c673cae
FG
104#include "include/stat.h"
105
e306af50 106#include "include/cephfs/ceph_ll_client.h"
7c673cae
FG
107
108#if HAVE_GETGROUPLIST
109#include <grp.h>
110#include <pwd.h>
111#include <unistd.h>
112#endif
113
114#undef dout_prefix
115#define dout_prefix *_dout << "client." << whoami << " "
116
117#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
118
119// FreeBSD fails to define this
120#ifndef O_DSYNC
121#define O_DSYNC 0x0
122#endif
123// Darwin fails to define this
124#ifndef O_RSYNC
125#define O_RSYNC 0x0
126#endif
127
128#ifndef O_DIRECT
129#define O_DIRECT 0x0
130#endif
131
f67539c2
TL
132// Windows doesn't define those values. While the Posix compatibilty layer
133// doesn't support those values, the Windows native functions do provide
134// similar flags. Special care should be taken if we're going to use those
135// flags in ceph-dokan. The current values are no-ops, while propagating
136// them to the rest of the code might cause the Windows functions to reject
137// them as invalid.
138#ifndef O_NOFOLLOW
139#define O_NOFOLLOW 0x0
140#endif
141
142#ifndef O_SYNC
143#define O_SYNC 0x0
144#endif
145
7c673cae
FG
146#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
147
b3b6e05e
TL
148#ifndef S_IXUGO
149#define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH)
150#endif
151
20effc67
TL
152using std::dec;
153using std::hex;
154using std::list;
155using std::oct;
156using std::pair;
157using std::string;
158using std::vector;
159
adb31ebb
TL
160using namespace TOPNSPC::common;
161
f67539c2
TL
162namespace bs = boost::system;
163namespace ca = ceph::async;
164
7c673cae
FG
165void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
166{
167 Client *client = static_cast<Client*>(p);
168 client->flush_set_callback(oset);
169}
170
b3b6e05e
TL
171bool Client::is_reserved_vino(vinodeno_t &vino) {
172 if (MDS_IS_PRIVATE_INO(vino.ino)) {
173 ldout(cct, -1) << __func__ << " attempt to access reserved inode number " << vino << dendl;
174 return true;
175 }
176 return false;
177}
178
7c673cae
FG
179
180// -------------
181
// Admin-socket hook: remembers the Client whose state it will dump.
Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}
186
9f95a23c
TL
/*
 * Admin-socket entry point: dispatch a registered command to the matching
 * dump/kick helper.  All handlers run under client_lock, and output is
 * written into a single "result" object section of the Formatter.
 * Always returns 0; an unknown command is a registration bug and aborts.
 */
int Client::CommandHook::call(
  std::string_view command,
  const cmdmap_t& cmdmap,
  Formatter *f,
  std::ostream& errss,
  bufferlist& out)
{
  f->open_object_section("result");
  {
    std::scoped_lock l{m_client->client_lock};
    if (command == "mds_requests")
      m_client->dump_mds_requests(f);
    else if (command == "mds_sessions") {
      // optional flag: also dump each session's capabilities
      bool cap_dump = false;
      cmd_getval(cmdmap, "cap_dump", cap_dump);
      m_client->dump_mds_sessions(f, cap_dump);
    } else if (command == "dump_cache")
      m_client->dump_cache(f);
    else if (command == "kick_stale_sessions")
      m_client->_kick_stale_sessions();
    else if (command == "status")
      m_client->dump_status(f);
    else
      // only commands registered in _finish_init() can reach this hook
      ceph_abort_msg("bad command registered");
  }
  f->close_section();
  return 0;
}
215
216
217// -------------
218
b3b6e05e
TL
219int Client::get_fd_inode(int fd, InodeRef *in) {
220 int r = 0;
221 if (fd == CEPHFS_AT_FDCWD) {
222 *in = cwd;
223 } else {
224 Fh *f = get_filehandle(fd);
225 if (!f) {
226 r = -CEPHFS_EBADF;
227 } else {
228 *in = f->inode;
229 }
230 }
231 return r;
232}
233
7c673cae
FG
// Open-directory stream state: snapshots the caller's credentials and starts
// at the beginning of the directory ('.' and '..' occupy offsets 0 and 1,
// hence next_offset begins at 2).
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
  { }
239
/*
 * Reset the fake-inode allocator: mark the whole range [1024, 2^32) free
 * and decide whether fake inos are used at all on this platform.
 */
void Client::_reset_faked_inos()
{
  ino_t start = 1024;
  free_faked_inos.clear();
  free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
  last_used_faked_ino = 0;
  last_used_faked_root = 0;
  #ifdef _WIN32
  // On Windows, sizeof(ino_t) is just 2. Despite that, most "native"
  // Windows structures, including Dokan ones, are using 64B identifiers.
  _use_faked_inos = false;
  #else
  // Fake inos are needed when the platform ino_t is too narrow to hold a
  // real ceph ino, or when explicitly requested via config.
  _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
  #endif
}
255
/*
 * Allocate the next free fake inode number for 'in'.
 *
 * Numbers come from the 'free_faked_inos' interval set, scanning upward from
 * the last number handed out.  Ids 1024..2047 are reserved for
 * _assign_faked_root(), so allocation starts above 2048 and wraps back to
 * 2048 when the top of the range is exhausted.
 */
void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // wrapped around: restart the scan just above the reserved root range
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // jump to the start of the next free interval
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // still inside the current free interval; take the next id
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
277
11fdf7f2
TL
/*
 * In the faked mode, if you export multiple subdirectories,
 * you will see that the inode numbers of the exported subdirectories
 * are the same. so we distinguish the mount point by reserving
 * the "fake ids" between "1024~2048" and combining the last
 * 10bits(0x3ff) of the "root inodes".
*/
void Client::_assign_faked_root(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  if (it == free_faked_inos.end() && last_used_faked_root > 0) {
    // wrapped: restart from the bottom of the reserved range
    last_used_faked_root = 0;
    it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  vinodeno_t inode_info = in->vino();
  uint64_t inode_num = (uint64_t)inode_info.ino;
  ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl;
  last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
  ceph_assert(it.get_start() + it.get_len() > last_used_faked_root);

  in->faked_ino = last_used_faked_root;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
303
7c673cae
FG
// Return an inode's fake id to the free pool and drop its reverse mapping.
void Client::_release_faked_ino(Inode *in)
{
  free_faked_inos.insert(in->faked_ino);
  faked_ino_map.erase(in->faked_ino);
}
309
310vinodeno_t Client::_map_faked_ino(ino_t ino)
311{
312 vinodeno_t vino;
313 if (ino == 1)
314 vino = root->vino();
315 else if (faked_ino_map.count(ino))
316 vino = faked_ino_map[ino];
317 else
318 vino = vinodeno_t(0, CEPH_NOSNAP);
11fdf7f2 319 ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
7c673cae
FG
320 return vino;
321}
322
// Locked public wrapper around _map_faked_ino().
vinodeno_t Client::map_faked_ino(ino_t ino)
{
  std::scoped_lock lock(client_lock);
  return _map_faked_ino(ino);
}
328
329// cons/des
330
/*
 * Construct a Client bound to the given messenger, monitor client and
 * objecter.  Sets up config-driven state (ids, ACL mode, cache sizing),
 * the fd allocator, and the writeback/object-cache machinery; no network
 * activity happens until init()/mount().
 */
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct->get()),
    timer(m->cct, timer_lock, false),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    mount_state(CLIENT_UNMOUNTED, "Client::mountstate_lock"),
    initialize_state(CLIENT_NEW, "Client::initstate_lock"),
    cct_deleter{m->cct, [](CephContext *p) {p->put();}},
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    async_ino_releasor(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;
  fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");

  _collect_and_send_global_metrics = cct->_conf.get_val<bool>(
    "client_collect_and_send_global_metrics");

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles: fds below 10 are never handed out
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
						&client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				      client_flush_set_callback,    // all commit callback
				      (void*)this,
				      cct->_conf->client_oc_size,
				      cct->_conf->client_oc_max_objects,
				      cct->_conf->client_oc_max_dirty,
				      cct->_conf->client_oc_target_dirty,
				      cct->_conf->client_oc_max_dirty_age,
				      true));
}
383
384
Client::~Client()
{
  ceph_assert(ceph_mutex_is_not_locked(client_lock));

  // If the task crashed or aborted it may never have run umount/shutdown,
  // so stop the upkeep (tick) thread here before tearing anything down.
  {
    std::scoped_lock l{client_lock};
    tick_thread_stopped = true;
    upkeep_cond.notify_one();
  }

  if (upkeeper.joinable())
    upkeeper.join();

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  std::scoped_lock l{client_lock};
  tear_down_cache();
}
406
/*
 * Forcibly drop all cached state: open file handles, open directory
 * streams, the dentry LRU, and finally the root inode.  Caller must hold
 * client_lock (inode destruction calls into ObjectCacher).
 */
void Client::tear_down_cache()
{
  // fd's
  for (auto &[fd, fh] : fd_map) {
    ldout(cct, 1) << __func__ << " forcing close of fh " << fd << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru
  trim_cache();
  ceph_assert(lru.lru_get_size() == 0);

  // close root ino: by now it should be the only inode left (plus any
  // root_parents recorded for subtree mounts)
  ceph_assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    root.reset();
  }

  ceph_assert(inode_map.empty());
}
437
438inodeno_t Client::get_root_ino()
439{
f67539c2 440 std::scoped_lock l(client_lock);
7c673cae
FG
441 if (use_faked_inos())
442 return root->faked_ino;
443 else
444 return root->ino;
445}
446
447Inode *Client::get_root()
448{
f67539c2 449 std::scoped_lock l(client_lock);
7c673cae 450 root->ll_get();
b3b6e05e 451 return root.get();
7c673cae
FG
452}
453
454
455// debug crapola
456
/*
 * Recursively dump one inode (and, via its Dir, every reachable dentry and
 * child inode) to the log and, when 'f' is non-null, to the Formatter.
 * 'did' records inodes already visited so dump_cache() can find
 * disconnected ones afterwards.
 */
void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
{
  filepath path;
  in->make_long_path(path);
  ldout(cct, 1) << "dump_inode: "
		<< (disconnected ? "DISCONNECTED ":"")
		<< "inode " << in->ino
		<< " " << path
		<< " ref " << in->get_nref()
		<< " " << *in << dendl;

  if (f) {
    f->open_object_section("inode");
    f->dump_stream("path") << path;
    if (disconnected)
      f->dump_int("disconnected", 1);
    in->dump(f);
    f->close_section();
  }

  did.insert(in);
  if (in->dir) {
    ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
    for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
	 it != in->dir->dentries.end();
	 ++it) {
      ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
      if (f) {
	f->open_object_section("dentry");
	it->second->dump(f);
	f->close_section();
      }
      // recurse into linked child inodes (dentries may be null)
      if (it->second->inode)
	dump_inode(f, it->second->inode.get(), did, false);
    }
  }
}
494
495void Client::dump_cache(Formatter *f)
496{
497 set<Inode*> did;
498
11fdf7f2 499 ldout(cct, 1) << __func__ << dendl;
7c673cae
FG
500
501 if (f)
502 f->open_array_section("cache");
503
504 if (root)
b3b6e05e 505 dump_inode(f, root.get(), did, true);
7c673cae
FG
506
507 // make a second pass to catch anything disconnected
508 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
509 it != inode_map.end();
510 ++it) {
511 if (did.count(it->second))
512 continue;
513 dump_inode(f, it->second, did, true);
514 }
515
516 if (f)
517 f->close_section();
518}
519
/*
 * Emit a summary of client identity and cache/epoch state to the Formatter.
 * Must be called with client_lock held (admin-socket handler guarantees it).
 */
void Client::dump_status(Formatter *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blocklisted", blocklisted);
    f->dump_string("fs_name", mdsmap->get_fs_name());
  }
}
551
/*
 * Start the subsystems init() and standalone MDS-command paths both need:
 * the timer, the objecter finisher (which Filer requires), and the object
 * cacher's flusher thread.  Order matters: Filer is built on the running
 * finisher.
 */
void Client::_pre_init()
{
  timer.init();

  objecter_finisher.start();
  filer.reset(new Filer(objecter, &objecter_finisher));

  objectcacher->start();
}
561
/*
 * One-time initialization: transition initialize_state NEW -> INITIALIZING
 * -> INITIALIZED, start core subsystems, and hook this Client into the
 * messenger's dispatch chain.  Always returns 0.
 */
int Client::init()
{
  RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
  ceph_assert(iref_writer.is_first_writer());

  _pre_init();
  {
    std::scoped_lock l{client_lock};
    messenger->add_dispatcher_tail(this);
  }
  _finish_init();
  iref_writer.update_state(CLIENT_INITIALIZED);
  return 0;
}
576
577void Client::_finish_init()
578{
9f95a23c 579 {
f67539c2 580 std::scoped_lock l{client_lock};
9f95a23c
TL
581 // logger
582 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
583 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
584 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
585 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
586 plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
587 plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
588 logger.reset(plb.create_perf_counters());
589 cct->get_perfcounters_collection()->add(logger.get());
590 }
7c673cae 591
11fdf7f2 592 cct->_conf.add_observer(this);
7c673cae
FG
593
594 AdminSocket* admin_socket = cct->get_admin_socket();
595 int ret = admin_socket->register_command("mds_requests",
7c673cae
FG
596 &m_command_hook,
597 "show in-progress mds requests");
598 if (ret < 0) {
599 lderr(cct) << "error registering admin socket command: "
600 << cpp_strerror(-ret) << dendl;
601 }
adb31ebb
TL
602 ret = admin_socket->register_command("mds_sessions "
603 "name=cap_dump,type=CephBool,req=false",
7c673cae
FG
604 &m_command_hook,
605 "show mds session state");
606 if (ret < 0) {
607 lderr(cct) << "error registering admin socket command: "
608 << cpp_strerror(-ret) << dendl;
609 }
610 ret = admin_socket->register_command("dump_cache",
7c673cae
FG
611 &m_command_hook,
612 "show in-memory metadata cache contents");
613 if (ret < 0) {
614 lderr(cct) << "error registering admin socket command: "
615 << cpp_strerror(-ret) << dendl;
616 }
617 ret = admin_socket->register_command("kick_stale_sessions",
7c673cae
FG
618 &m_command_hook,
619 "kick sessions that were remote reset");
620 if (ret < 0) {
621 lderr(cct) << "error registering admin socket command: "
622 << cpp_strerror(-ret) << dendl;
623 }
624 ret = admin_socket->register_command("status",
7c673cae
FG
625 &m_command_hook,
626 "show overall client status");
627 if (ret < 0) {
628 lderr(cct) << "error registering admin socket command: "
629 << cpp_strerror(-ret) << dendl;
630 }
7c673cae
FG
631}
632
/*
 * Orderly teardown of a Client that may or may not have been mounted:
 * stop the tick thread, close MDS sessions, unhook config/admin-socket,
 * drain and stop every callback finisher, stop the object cacher, flip
 * initialize_state back to NEW (failing new readers and waiting for
 * in-flight ones), and finally shut down the timer, objecter finisher
 * and perf counters.
 */
void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  {
    std::scoped_lock l{client_lock};

    // Make sure the tick thread is stopped before destructing the Client,
    // e.g. when _mount() failed and never got a chance to stop it.
    tick_thread_stopped = true;
    upkeep_cond.notify_one();

    _close_sessions();
  }
  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  if (ino_release_cb) {
    ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl;
    async_ino_releasor.wait_for_empty();
    async_ino_releasor.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.

  /*
   * We are shutting down the client.
   *
   * Just declare the state to CLIENT_NEW to block and fail any
   * new incoming "reader" and then try to wait all the in-flight
   * "readers" to finish.
   */
  RWRef_t iref_writer(initialize_state, CLIENT_NEW, false);
  if (!iref_writer.is_first_writer())
    return;
  iref_writer.wait_readers_done();

  {
    std::scoped_lock l(timer_lock);
    timer.shutdown();
  }

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
712
713
714// ===================
715// metadata cache stuff
716
/*
 * Shrink the dentry LRU down to client_cache_size (or to zero while
 * unmounting).  Optionally ask the kernel to drop its dcache too, and
 * release the root inode once nothing else references it.
 */
void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  // loop until the LRU stops shrinking (trim_dentry may pin entries)
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    // while unmounting we trim unconditionally; otherwise stop at 'max'
    if (!is_unmounting() && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_nref() == 1 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    root.reset();
  }
}
744
745void Client::trim_cache_for_reconnect(MetaSession *s)
746{
747 mds_rank_t mds = s->mds_num;
11fdf7f2 748 ldout(cct, 20) << __func__ << " mds." << mds << dendl;
7c673cae
FG
749
750 int trimmed = 0;
751 list<Dentry*> skipped;
752 while (lru.lru_get_size() > 0) {
753 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
754 if (!dn)
755 break;
756
757 if ((dn->inode && dn->inode->caps.count(mds)) ||
758 dn->dir->parent_inode->caps.count(mds)) {
759 trim_dentry(dn);
760 trimmed++;
761 } else
762 skipped.push_back(dn);
763 }
764
765 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
766 lru.lru_insert_mid(*p);
767
11fdf7f2 768 ldout(cct, 20) << __func__ << " mds." << mds
7c673cae
FG
769 << " trimmed " << trimmed << " dentries" << dendl;
770
771 if (s->caps.size() > 0)
772 _invalidate_kernel_dcache();
773}
774
775void Client::trim_dentry(Dentry *dn)
776{
777 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
11fdf7f2
TL
778 << " in dir "
779 << std::hex << dn->dir->parent_inode->ino << std::dec
7c673cae
FG
780 << dendl;
781 if (dn->inode) {
782 Inode *diri = dn->dir->parent_inode;
7c673cae
FG
783 clear_dir_complete_and_ordered(diri, true);
784 }
785 unlink(dn, false, false); // drop dir, drop dentry
786}
787
788
1adf2230
AA
/*
 * Merge size/truncation state from an MDS stat into the local inode.
 *
 * A newer truncate_seq (or same seq with a larger size) wins; on a seq
 * bump we also drop now-stale cached file data and trim cached inline
 * data.  truncate_size is taken whenever the MDS seq is at least ours.
 */
void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
				    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      // NOTE(review): the invalidated range starts at truncate_size but its
      // length is prior_size - truncate_size computed from the old in->size;
      // presumably truncate_size <= prior_size here — confirm against MDS
      // truncation protocol.
      if (prior_size > size) {
	_invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
	in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
830
/*
 * Merge ctime/mtime/atime from an MDS stat into the local inode.
 *
 * time_warp_seq is bumped by the MDS when it makes a timestamp change that
 * must override the client's view.  When we hold caps that let us dirty
 * times locally, keep our values and only adopt MDS times on a strictly
 * newer warp seq (taking element-wise max on an equal seq); without such
 * caps the MDS values win whenever its seq is at least ours.  A lower MDS
 * seq is unexpected and logged as a warning.
 */
void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
				    utime_t ctime, utime_t mtime, utime_t atime)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
		 << " ctime " << ctime << " mtime " << mtime << dendl;

  if (time_warp_seq > in->time_warp_seq)
    ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
		   << " is higher than local time_warp_seq "
		   << in->time_warp_seq << dendl;

  int warn = false;
  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
		CEPH_CAP_FILE_WR|
		CEPH_CAP_FILE_BUFFER|
		CEPH_CAP_AUTH_EXCL|
		CEPH_CAP_XATTR_EXCL)) {
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      // the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      // take max times
      if (mtime > in->mtime)
	in->mtime = mtime;
      if (atime > in->atime)
	in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      // ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
		  << time_warp_seq << " is lower than local time_warp_seq "
		  << in->time_warp_seq
		  << dendl;
  }
}
882
883void Client::_fragmap_remove_non_leaves(Inode *in)
884{
885 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
886 if (!in->dirfragtree.is_leaf(p->first))
887 in->fragmap.erase(p++);
888 else
889 ++p;
890}
891
892void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
893{
894 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
895 if (p->second == mds)
896 in->fragmap.erase(p++);
897 else
898 ++p;
899}
900
/*
 * Create (or refresh) the in-core Inode for an MDS InodeStat.
 *
 * Fields are merged according to the caps we already hold: stat data is
 * only taken when the MDS view is strictly newer ('new_version') or when
 * it newly grants the corresponding SHARED cap, and never when we hold the
 * matching EXCL cap (our local, possibly dirty values win).  Finally the
 * cap carried in the stat is added/updated, and an empty directory with
 * FILE_SHARED may be marked I_COMPLETE|I_DIR_ORDERED.
 */
Inode * Client::add_update_inode(InodeStat *st, utime_t from,
				 MetaSession *session,
				 const UserPerm& request_perms)
{
  Inode *in;
  bool was_new = false;
  if (inode_map.count(st->vino)) {
    in = inode_map[st->vino];
    ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
  } else {
    in = new Inode(this, st->vino, &st->layout);
    inode_map[st->vino] = in;

    if (use_faked_inos())
      _assign_faked_ino(in);

    if (!root) {
      // first inode ever seen becomes the root / cwd
      root = in;
      if (use_faked_inos())
	_assign_faked_root(root.get());
      root_ancestor = in;
      cwd = root;
    } else if (is_mounting()) {
      // while mounting a subtree, remember the chain of parents above root
      root_parents[root_ancestor] = in;
      root_ancestor = in;
    }

    // immutable bits
    in->ino = st->vino.ino;
    in->snapid = st->vino.snapid;
    in->mode = st->mode & S_IFMT;
    was_new = true;
  }

  in->rdev = st->rdev;
  if (in->is_symlink())
    in->symlink = st->symlink;

  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
  bool new_version = false;
  if (in->version == 0 ||
      ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
       (in->version & ~1) < st->version))
    new_version = true;

  // caps we hold (or have dirtied) — these gate which fields we accept
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  int new_issued = ~issued & (int)st->cap.caps;

  if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = st->mode;
    in->uid = st->uid;
    in->gid = st->gid;
    in->btime = st->btime;
    in->snap_btime = st->snap_btime;
    in->snap_metadata = st->snap_metadata;
  }

  if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = st->nlink;
  }

  if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
    update_inode_file_time(in, issued, st->time_warp_seq,
			   st->ctime, st->mtime, st->atime);
  }

  if (new_version ||
      (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
    in->layout = st->layout;
    update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
  }

  if (in->is_dir()) {
    if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
      in->dirstat = st->dirstat;
    }
    // dir_layout/rstat/quota are not tracked by capability, update them only if
    // the inode stat is from auth mds
    if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
      in->dir_layout = st->dir_layout;
      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
      in->rstat = st->rstat;
      in->quota = st->quota;
      in->dir_pin = st->dir_pin;
    }
    // move me if/when version reflects fragtree changes.
    if (in->dirfragtree != st->dirfragtree) {
      in->dirfragtree = st->dirfragtree;
      _fragmap_remove_non_leaves(in);
    }
  }

  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
      st->xattrbl.length() &&
      st->xattr_version > in->xattr_version) {
    auto p = st->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = st->xattr_version;
  }

  if (st->inline_version > in->inline_version) {
    in->inline_data = st->inline_data;
    in->inline_version = st->inline_version;
  }

  /* always take a newer change attr */
  if (st->change_attr > in->change_attr)
    in->change_attr = st->change_attr;

  if (st->version > in->version)
    in->version = st->version;

  if (was_new)
    ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;

  if (!st->cap.caps)
    return in; // as with readdir returning indoes in different snaprealms (no caps!)

  if (in->snapid == CEPH_NOSNAP) {
    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
		   st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
		   st->cap.flags, request_perms);
    if (in->auth_cap && in->auth_cap->session == session) {
      // only the auth MDS's view of max_size/rstat is authoritative
      in->max_size = st->max_size;
      in->rstat = st->rstat;
    }

    // setting I_COMPLETE needs to happen after adding the cap
    if (in->is_dir() &&
	(st->cap.caps & CEPH_CAP_FILE_SHARED) &&
	(issued & CEPH_CAP_FILE_EXCL) == 0 &&
	in->dirstat.nfiles == 0 &&
	in->dirstat.nsubdirs == 0) {
      ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
      in->flags |= I_COMPLETE | I_DIR_ORDERED;
      if (in->dir) {
	ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
		       << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
	in->dir->readdir_cache.clear();
	for (const auto& p : in->dir->dentries) {
	  unlink(p.second, true, true);  // keep dir, keep dentry
	}
	if (in->dir->dentries.empty())
	  close_dir(in->dir);
      }
    }
  } else {
    // snapshot inodes don't get real caps; just accumulate the snap mask
    in->snap_caps |= st->cap.caps;
  }

  in->fscrypt = st->fscrypt;
  return in;
}
1058
1059
/*
 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
 *
 * If a dentry named 'dname' already exists in 'dir' and points at the same
 * inode, it is only touched (LRU refresh).  If it points at a different
 * inode, it is unlinked first.  The rename source 'old_dentry' (if any) is
 * unlinked too, and the affected directories lose their
 * I_COMPLETE/I_DIR_ORDERED state.  Finally the dentry lease in 'dlease' is
 * applied to the (possibly new) dentry.
 *
 * Returns the linked dentry.
 */
Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
				    Inode *in, utime_t from, MetaSession *session,
				    Dentry *old_dentry)
{
  // look up any existing dentry with this name in the target dir
  Dentry *dn = NULL;
  if (dir->dentries.count(dname))
    dn = dir->dentries[dname];

  ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
		 << " in dir " << dir->parent_inode->vino() << " dn " << dn
		 << dendl;

  if (dn && dn->inode) {
    if (dn->inode->vino() == in->vino()) {
      // same inode: keep the dentry, just refresh its LRU position
      touch_dn(dn);
      ldout(cct, 12) << " had dentry " << dname
	       << " with correct vino " << dn->inode->vino()
	       << dendl;
    } else {
      // name now maps to a different inode: sever the stale link
      ldout(cct, 12) << " had dentry " << dname
	       << " with WRONG vino " << dn->inode->vino()
	       << dendl;
      unlink(dn, true, true);  // keep dir, keep dentry
    }
  }

  if (!dn || !dn->inode) {
    // tmp_ref presumably pins 'in' so it can't be trimmed while we
    // unlink/relink below — TODO confirm InodeRef semantics
    InodeRef tmp_ref(in);
    if (old_dentry) {
      if (old_dentry->dir != dir) {
	// rename across directories also invalidates the source dir's order
	Inode *old_diri = old_dentry->dir->parent_inode;
	clear_dir_complete_and_ordered(old_diri, false);
      }
      unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if its the same dir
    }
    // a new/changed entry invalidates the destination dir's ordering too
    Inode *diri = dir->parent_inode;
    clear_dir_complete_and_ordered(diri, false);
    dn = link(dir, dname, in, dn);
  }

  // apply the lease carried in the MDS reply
  update_dentry_lease(dn, dlease, from, session);
  return dn;
}
1106
1107void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
1108{
1109 utime_t dttl = from;
1110 dttl += (float)dlease->duration_ms / 1000.0;
f67539c2
TL
1111
1112 ldout(cct, 15) << __func__ << " " << *dn << " " << *dlease << " from " << from << dendl;
7c673cae 1113
11fdf7f2 1114 ceph_assert(dn);
7c673cae 1115
9f95a23c 1116 if (dlease->mask & CEPH_LEASE_VALID) {
7c673cae
FG
1117 if (dttl > dn->lease_ttl) {
1118 ldout(cct, 10) << "got dentry lease on " << dn->name
1119 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
1120 dn->lease_ttl = dttl;
1121 dn->lease_mds = session->mds_num;
1122 dn->lease_seq = dlease->seq;
1123 dn->lease_gen = session->cap_gen;
1124 }
1125 }
1126 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
f91f0fd5
TL
1127 if (dlease->mask & CEPH_LEASE_PRIMARY_LINK)
1128 dn->mark_primary();
f67539c2 1129 dn->alternate_name = std::move(dlease->alternate_name);
7c673cae
FG
1130}
1131
1132
1133/*
1134 * update MDS location cache for a single inode
1135 */
522d829b 1136void Client::update_dir_dist(Inode *in, DirStat *dst, mds_rank_t from)
7c673cae
FG
1137{
1138 // auth
1139 ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
1140 if (dst->auth >= 0) {
1141 in->fragmap[dst->frag] = dst->auth;
1142 } else {
1143 in->fragmap.erase(dst->frag);
1144 }
1145 if (!in->dirfragtree.is_leaf(dst->frag)) {
1146 in->dirfragtree.force_to_leaf(cct, dst->frag);
1147 _fragmap_remove_non_leaves(in);
1148 }
1149
522d829b
TL
1150 // replicated, only update from auth mds reply
1151 if (from == dst->auth) {
1152 in->dir_replicated = !dst->dist.empty();
1153 if (!dst->dist.empty())
1154 in->frag_repmap[dst->frag].assign(dst->dist.begin(), dst->dist.end()) ;
1155 else
1156 in->frag_repmap.erase(dst->frag);
1157 }
7c673cae
FG
1158}
1159
1160void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1161{
f91f0fd5
TL
1162 if (complete)
1163 diri->dir_release_count++;
1164 else
1165 diri->dir_ordered_count++;
7c673cae
FG
1166 if (diri->flags & I_COMPLETE) {
1167 if (complete) {
1168 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1169 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1170 } else {
1171 if (diri->flags & I_DIR_ORDERED) {
1172 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1173 diri->flags &= ~I_DIR_ORDERED;
1174 }
1175 }
1176 if (diri->dir)
1177 diri->dir->readdir_cache.clear();
1178 }
1179}
1180
1181/*
1182 * insert results from readdir or lssnap into the metadata cache.
1183 */
/*
 * insert_readdir_results - decode the extra payload of a readdir/lssnap
 * reply and fold each (dentry, lease, inode) triple into the metadata
 * cache and into the dir_result_t buffer of the issuing readdir.
 *
 * Decode order is fixed by the wire format: DirStat, numdn, flags, then
 * numdn * (dname, LeaseStat, InodeStat).  Do not reorder.
 */
void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {

  auto& reply = request->reply;
  ConnectionRef con = request->reply->get_connection();
  // newer MDSs use a feature-independent encoding; otherwise fall back to
  // the connection's negotiated feature bits
  uint64_t features;
  if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }

  dir_result_t *dirp = request->dirp;
  ceph_assert(dirp);

  // the extra buffer list is only set for readdir and lssnap replies
  auto p = reply->get_extra_bl().cbegin();
  if (!p.end()) {
    // snapdir?  lssnap results live under the .snap pseudo-directory
    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
      ceph_assert(diri);
      diri = open_snapdir(diri);
    }

    // only open dir if we're actually adding stuff to it!
    Dir *dir = diri->open_dir();
    ceph_assert(dir);

    // dirstat
    DirStat dst(p, features);
    __u32 numdn;
    __u16 flags;
    decode(numdn, p);
    decode(flags, p);

    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);

    // where this chunk starts within the directory
    frag_t fg = (unsigned)request->head.args.readdir.frag;
    unsigned readdir_offset = dirp->next_offset;
    string readdir_start = dirp->last_name;
    ceph_assert(!readdir_start.empty() || readdir_offset == 2);

    unsigned last_hash = 0;
    if (hash_order) {
      if (!readdir_start.empty()) {
	last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
	/* mds understands offset_hash */
	last_hash = (unsigned)request->head.args.readdir.offset_hash;
      }
    }

    if (fg != dst.frag) {
      // MDS answered for a different (refragmented) frag than we asked for
      // (log message says insert_trace for historical reasons)
      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
      fg = dst.frag;
      if (!hash_order) {
	readdir_offset = 2;
	readdir_start.clear();
	dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
      }
    }

    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
		   << ", hash_order=" << hash_order
		   << ", readdir_start " << readdir_start
		   << ", last_hash " << last_hash
		   << ", next_offset " << readdir_offset << dendl;

    // a scan starting at the very beginning may populate the readdir cache
    if (diri->snapid != CEPH_SNAPDIR &&
	fg.is_leftmost() && readdir_offset == 2 &&
	!(hash_order && last_hash)) {
      dirp->release_count = diri->dir_release_count;
      dirp->ordered_count = diri->dir_ordered_count;
      dirp->start_shared_gen = diri->shared_gen;
      dirp->cache_index = 0;
    }

    dirp->buffer_frag = fg;

    _readdir_drop_dirp_buffer(dirp);
    dirp->buffer.reserve(numdn);

    string dname;
    LeaseStat dlease;
    for (unsigned i=0; i<numdn; i++) {
      // wire order per entry: name, lease, inode stat
      decode(dname, p);
      dlease.decode(p, features);
      InodeStat ist(p, features);

      ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;

      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
				   request->perms);
      Dentry *dn;
      if (diri->dir->dentries.count(dname)) {
	Dentry *olddn = diri->dir->dentries[dname];
	if (olddn->inode != in) {
	  // replace incorrect dentry
	  unlink(olddn, true, true);  // keep dir, dentry
	  dn = link(dir, dname, in, olddn);
	  ceph_assert(dn == olddn);
	} else {
	  // keep existing dn
	  dn = olddn;
	  touch_dn(dn);
	}
      } else {
	// new dn
	dn = link(dir, dname, in, NULL);
      }
      dn->alternate_name = std::move(dlease.alternate_name);

      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      if (hash_order) {
	// offsets restart at 2 each time the name hash changes
	unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
	if (hash != last_hash)
	  readdir_offset = 2;
	last_hash = hash;
	dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
      } else {
	dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
      }
      // add to readdir cache, but only if nothing invalidated the dir
      // since this readdir started (generation counters still match)
      if (dirp->release_count == diri->dir_release_count &&
	  dirp->ordered_count == diri->dir_ordered_count &&
	  dirp->start_shared_gen == diri->shared_gen) {
	if (dirp->cache_index == dir->readdir_cache.size()) {
	  if (i == 0) {
	    ceph_assert(!dirp->inode->is_complete_and_ordered());
	    dir->readdir_cache.reserve(dirp->cache_index + numdn);
	  }
	  dir->readdir_cache.push_back(dn);
	} else if (dirp->cache_index < dir->readdir_cache.size()) {
	  if (dirp->inode->is_complete_and_ordered())
	    ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
	  else
	    dir->readdir_cache[dirp->cache_index] = dn;
	} else {
	  ceph_abort_msg("unexpected readdir buffer idx");
	}
	dirp->cache_index++;
      }
      // add to cached result list
      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, dn->alternate_name, in));
      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
    }

    // remember where the next chunk of this readdir should resume
    if (numdn > 0)
      dirp->last_name = dname;
    if (end)
      dirp->next_offset = 2;
    else
      dirp->next_offset = readdir_offset;

    if (dir->is_empty())
      close_dir(dir);
  }
}
1343
1344/** insert_trace
1345 *
1346 * insert a trace from a MDS reply into the cache.
1347 */
Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
{
  // insert a trace from an MDS reply into the cache: snap realms, the
  // parent dir inode + dentry (if is_dentry), and the target inode (if
  // is_target).  Returns the target inode, or NULL for traceless replies.
  auto& reply = request->reply;
  int op = request->get_op();

  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  auto p = reply->get_trace_bl().cbegin();
  if (request->got_unsafe) {
    // the unsafe reply already carried the trace; the safe one must be empty
    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
    ceph_assert(p.end());
    return NULL;
  }

  if (p.end()) {
    // traceless reply: invalidate cached dir state the op may have changed
    ldout(cct, 10) << "insert_trace -- no trace" << dendl;

    Dentry *d = request->dentry();
    if (d) {
      Inode *diri = d->dir->parent_inode;
      clear_dir_complete_and_ordered(diri, true);
    }

    if (d && reply->get_result() == 0) {
      if (op == CEPH_MDS_OP_RENAME) {
	// rename
	Dentry *od = request->old_dentry();
	ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
	ceph_assert(od);
	unlink(od, true, true);  // keep dir, dentry
      } else if (op == CEPH_MDS_OP_RMDIR ||
		 op == CEPH_MDS_OP_UNLINK) {
	// unlink, rmdir
	ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
	unlink(d, true, true);  // keep dir, dentry
      }
    }
    return NULL;
  }

  ConnectionRef con = request->reply->get_connection();
  // encoding depends on whether the MDS speaks the new reply encoding
  uint64_t features;
  if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
    features = (uint64_t)-1;
  }
  else {
    features = con->get_features();
  }
  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;

  // snap trace
  SnapRealm *realm = NULL;
  if (reply->snapbl.length())
    update_snap_trace(reply->snapbl, &realm);

  ldout(cct, 10) << " hrm "
	   << " is_target=" << (int)reply->head.is_target
	   << " is_dentry=" << (int)reply->head.is_dentry
	   << dendl;

  InodeStat dirst;
  DirStat dst;
  string dname;
  LeaseStat dlease;
  InodeStat ist;

  if (reply->head.is_dentry) {
    // wire order: parent inode stat, dir stat, dentry name, dentry lease
    dirst.decode(p, features);
    dst.decode(p, features);
    decode(dname, p);
    dlease.decode(p, features);
  }

  Inode *in = 0;
  if (reply->head.is_target) {
    ist.decode(p, features);
    if (cct->_conf->client_debug_getattr_caps) {
      // debug check: replies to getattr-style ops that wanted xattrs
      // must actually carry them
      unsigned wanted = 0;
      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
	wanted = request->head.args.getattr.mask;
      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
	wanted = request->head.args.open.mask;

      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
	  !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
	ceph_abort_msg("MDS reply does not contain xattrs");
    }

    in = add_update_inode(&ist, request->sent_stamp, session,
			  request->perms);
  }

  Inode *diri = NULL;
  if (reply->head.is_dentry) {
    diri = add_update_inode(&dirst, request->sent_stamp, session,
			    request->perms);
    mds_rank_t from_mds = mds_rank_t(reply->get_source().num());
    update_dir_dist(diri, &dst, from_mds);  // dir stat info is attached to ..

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
    } else {
      // dentry with no target inode: a negative dentry
      Dentry *dn = NULL;
      if (diri->dir && diri->dir->dentries.count(dname)) {
	dn = diri->dir->dentries[dname];
	if (dn->inode) {
	  clear_dir_complete_and_ordered(diri, false);
	  unlink(dn, true, true);  // keep dir, dentry
	}
      }
      if (dlease.duration_ms > 0) {
	if (!dn) {
	  Dir *dir = diri->open_dir();
	  dn = link(dir, dname, NULL, NULL);
	}
	update_dentry_lease(dn, &dlease, request->sent_stamp, session);
      }
    }
  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
	     op == CEPH_MDS_OP_MKSNAP) {
    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
    // fake it for snap lookup
    vinodeno_t vino = ist.vino;
    vino.snapid = CEPH_SNAPDIR;
    ceph_assert(inode_map.count(vino));
    diri = inode_map[vino];

    string dname = request->path.last_dentry();

    LeaseStat dlease;
    dlease.duration_ms = 0;

    if (in) {
      Dir *dir = diri->open_dir();
      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
    } else {
      if (diri->dir && diri->dir->dentries.count(dname)) {
	Dentry *dn = diri->dir->dentries[dname];
	if (dn->inode)
	  unlink(dn, true, true);  // keep dir, dentry
      }
    }
  }

  if (in) {
    if (op == CEPH_MDS_OP_READDIR ||
	op == CEPH_MDS_OP_LSSNAP) {
      insert_readdir_results(request, session, in);
    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
      // hack: return parent inode instead
      in = diri;
    }

    if (request->dentry() == NULL && in != request->inode()) {
      // pin the target inode if its parent dentry is not pinned
      request->set_other_inode(in);
    }
  }

  if (realm)
    put_snap_realm(realm);

  request->target = in;
  return in;
}
1518
1519// -------
1520
/*
 * choose_target_mds - pick the mds rank to send 'req' to.
 *
 * Preference order: an explicit resend_mds, then (unless
 * client_use_random_mds) the dirfrag hash of the request's inode/dentry,
 * then the caps held on the inode, and finally a random up mds.  If the
 * choice came from a dirfrag hash, *phash_diri is set to the hashed dir
 * inode so the caller can drop a stale fragmap entry on failure.
 */
mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
{
  mds_rank_t mds = MDS_RANK_NONE;
  __u32 hash = 0;
  bool is_hash = false;

  Inode *in = NULL;
  Dentry *de = NULL;

  // explicit forward/resend target wins outright
  if (req->resend_mds >= 0) {
    mds = req->resend_mds;
    req->resend_mds = -1;
    ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
    goto out;
  }

  if (cct->_conf->client_use_random_mds)
    goto random_mds;

  // derive an inode (and possibly a dentry-name hash) from the request
  in = req->inode();
  de = req->dentry();
  if (in) {
    ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
    if (req->path.depth()) {
      hash = in->hash_dentry_name(req->path[0]);
      ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << req->path[0]
	       << " => " << hash << dendl;
      is_hash = true;
    }
  } else if (de) {
    if (de->inode) {
      in = de->inode.get();
      ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
    } else {
      in = de->dir->parent_inode;
      hash = in->hash_dentry_name(de->name);
      ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
	       << " on " << de->name
	       << " => " << hash << dendl;
      is_hash = true;
    }
  }
  if (in) {
    // snapped inodes have no caps of their own; walk up to a live parent
    if (in->snapid != CEPH_NOSNAP) {
      ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
      while (in->snapid != CEPH_NOSNAP) {
        if (in->snapid == CEPH_SNAPDIR)
	  in = in->snapdir_parent.get();
        else if (!in->dentries.empty())
          /* In most cases there will only be one dentry, so getting it
           * will be the correct action. If there are multiple hard links,
           * I think the MDS should be able to redirect as needed*/
	  in = in->get_first_parent()->dir->parent_inode;
        else {
          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
          break;
        }
      }
      is_hash = false;
    }

    ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
	     << " hash=" << hash << dendl;

    // route by dirfrag: replicas for reads, fragmap/auth for the rest
    if (is_hash && S_ISDIR(in->mode) && (!in->fragmap.empty() || !in->frag_repmap.empty())) {
      frag_t fg = in->dirfragtree[hash];
      if (!req->auth_is_best()) {
        auto repmapit = in->frag_repmap.find(fg);
        if (repmapit != in->frag_repmap.end()) {
          // pick a random replica for load spreading
          auto& repmap = repmapit->second;
          auto r = ceph::util::generate_random_number<uint64_t>(0, repmap.size()-1);
          mds = repmap.at(r);
        }
      } else if (in->fragmap.count(fg)) {
	mds = in->fragmap[fg];
	if (phash_diri)
	  *phash_diri = in;
      } else if (in->auth_cap) {
	req->send_to_auth = true;
	mds = in->auth_cap->session->mds_num;
      }
      if (mds >= 0) {
	ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
	goto out;
      }
    }

    // fall back to whichever mds we hold caps from
    if (in->auth_cap && req->auth_is_best()) {
      mds = in->auth_cap->session->mds_num;
    } else if (!in->caps.empty()) {
      mds = in->caps.begin()->second.session->mds_num;
    } else {
      goto random_mds;
    }
    ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;

    goto out;
  }

random_mds:
  if (mds < 0) {
    mds = _get_random_up_mds();
    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
  }

out:
  ldout(cct, 20) << "mds is " << mds << dendl;
  return mds;
}
1631
7c673cae
FG
1632void Client::connect_mds_targets(mds_rank_t mds)
1633{
11fdf7f2
TL
1634 ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
1635 ceph_assert(mds_sessions.count(mds));
7c673cae 1636 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
f67539c2
TL
1637 for (const auto &rank : info.export_targets) {
1638 if (mds_sessions.count(rank) == 0 &&
1639 mdsmap->is_clientreplay_or_active_or_stopping(rank)) {
7c673cae 1640 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
f67539c2
TL
1641 << " export target mds." << rank << dendl;
1642 _open_mds_session(rank);
7c673cae
FG
1643 }
1644 }
1645}
1646
adb31ebb 1647void Client::dump_mds_sessions(Formatter *f, bool cap_dump)
7c673cae
FG
1648{
1649 f->dump_int("id", get_nodeid().v);
11fdf7f2 1650 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1adf2230
AA
1651 f->dump_object("inst", inst);
1652 f->dump_stream("inst_str") << inst;
1653 f->dump_stream("addr_str") << inst.addr;
7c673cae 1654 f->open_array_section("sessions");
11fdf7f2 1655 for (const auto &p : mds_sessions) {
7c673cae 1656 f->open_object_section("session");
20effc67 1657 p.second->dump(f, cap_dump);
7c673cae
FG
1658 f->close_section();
1659 }
1660 f->close_section();
1661 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1662}
f67539c2 1663
7c673cae
FG
1664void Client::dump_mds_requests(Formatter *f)
1665{
1666 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1667 p != mds_requests.end();
1668 ++p) {
1669 f->open_object_section("request");
1670 p->second->dump(f);
1671 f->close_section();
1672 }
1673}
1674
/*
 * verify_reply_trace - resolve the target inode of a (possibly traceless)
 * MDS reply.
 *
 * Extracts the created-inode number from the reply's extra payload (if
 * present), reports it via *pcreated, and fills *ptarget: directly from
 * request->target when a trace was attached, from inode_map by the created
 * ino, or — for a traceless reply — by re-issuing a lookup/getattr.
 * Returns 'r' (possibly downgraded to -CEPHFS_EINTR on an ino mismatch).
 */
int Client::verify_reply_trace(int r, MetaSession *session,
			       MetaRequest *request, const MConstRef<MClientReply>& reply,
			       InodeRef *ptarget, bool *pcreated,
			       const UserPerm& perms)
{
  // check whether this request actually did the create, and set created flag
  bufferlist extra_bl;
  inodeno_t created_ino;
  bool got_created_ino = false;
  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;

  extra_bl = reply->get_extra_bl();
  if (extra_bl.length() >= 8) {
    // payload format depends on whether the MDS supports inode delegation
    if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
      struct openc_response_t ocres;

      decode(ocres, extra_bl);
      created_ino = ocres.created_ino;
      /*
       * The userland cephfs client doesn't have a way to do an async create
       * (yet), so just discard delegated_inos for now. Eventually we should
       * store them and use them in create calls, even if they are synchronous,
       * if only for testing purposes.
       */
      ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
    } else {
      // u64 containing number of created ino
      decode(created_ino, extra_bl);
    }
    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
    got_created_ino = true;
  }

  if (pcreated)
    *pcreated = got_created_ino;

  if (request->target) {
    // the reply carried a trace; insert_trace already resolved the target
    *ptarget = request->target;
    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
  } else {
    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
      (*ptarget) = p->second;
      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
    } else {
      // we got a traceless reply, and need to look up what we just
      // created. for now, do this by name.  someday, do this by the
      // ino... which we know!  FIXME.
      InodeRef target;
      Dentry *d = request->dentry();
      if (d) {
	if (d->dir) {
	  ldout(cct, 10) << "make_request got traceless reply, looking up #"
			 << d->dir->parent_inode->ino << "/" << d->name
			 << " got_ino " << got_created_ino
			 << " ino " << created_ino
			 << dendl;
	  r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
			 &target, perms);
	} else {
	  // if the dentry is not linked, just do our best. see #5021.
	  ceph_abort_msg("how did this happen?  i want logs!");
	}
      } else {
	// no dentry on the request: refresh the inode's attrs instead
	Inode *in = request->inode();
	ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
		       << in->ino << dendl;
	r = _getattr(in, request->regetattr_mask, perms, true);
	target = in;
      }
      if (r >= 0) {
	// verify ino returned in reply and trace_dist are the same
	if (got_created_ino &&
	    created_ino.val != target->ino.val) {
	  // lookup found something else: our create raced and was retried
	  ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
	  r = -CEPHFS_EINTR;
	}
	if (ptarget)
	  ptarget->swap(target);
      }
    }
  }

  return r;
}
1759
1760
1761/**
1762 * make a request
1763 *
1764 * Blocking helper to make an MDS request.
1765 *
1766 * If the ptarget flag is set, behavior changes slightly: the caller
1767 * expects to get a pointer to the inode we are creating or operating
1768 * on. As a result, we will follow up any traceless mutation reply
1769 * with a getattr or lookup to transparently handle a traceless reply
1770 * from the MDS (as when the MDS restarts and the client has to replay
1771 * a request).
1772 *
1773 * @param request the MetaRequest to execute
1774 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1775 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1776 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1777 * @param use_mds [optional] prefer a specific mds (-1 for default)
1778 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1779 */
int Client::make_request(MetaRequest *request,
			 const UserPerm& perms,
			 InodeRef *ptarget, bool *pcreated,
			 mds_rank_t use_mds,
			 bufferlist *pdirbl)
{
  // see the block comment above for the contract; called with client_lock held
  int r = 0;

  // assign a unique tid
  ceph_tid_t tid = ++last_tid;
  request->set_tid(tid);

  // and timestamp
  request->op_stamp = ceph_clock_now();

  // make note
  mds_requests[tid] = request->get();
  // SETFILELOCK may block indefinitely, so it never anchors oldest_tid
  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
    oldest_tid = tid;

  request->set_caller_perms(perms);

  if (cct->_conf->client_inject_fixed_oldest_tid) {
    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
    request->set_oldest_client_tid(1);
  } else {
    request->set_oldest_client_tid(oldest_tid);
  }

  // hack target mds?
  if (use_mds >= 0)
    request->resend_mds = use_mds;

  MetaSessionRef session = NULL;
  // retry loop: choose an mds, ensure a session, send, wait; loop again on
  // forward/kick until we have a reply or the request is aborted
  while (1) {
    if (request->aborted())
      break;

    if (blocklisted) {
      request->abort(-CEPHFS_EBLOCKLISTED);
      break;
    }

    // set up wait cond
    ceph::condition_variable caller_cond;
    request->caller_cond = &caller_cond;

    // choose mds
    Inode *hash_diri = NULL;
    mds_rank_t mds = choose_target_mds(request, &hash_diri);
    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
	// the rank no longer exists: forget the fragmap hint or retry randomly
	if (hash_diri) {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
	  _fragmap_remove_stopped_mds(hash_diri, mds);
	} else {
	  ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
	  request->resend_mds = _get_random_up_mds();
	}
      } else {
	ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
	wait_on_list(waiting_for_mdsmap);
      }
      continue;
    }

    // open a session?
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED) {
	request->abort(-CEPHFS_EPERM);
	break;
      }
      // wait
      if (session->state == MetaSession::STATE_OPENING) {
	ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
	wait_on_context_list(session->waiting_for_open);
	continue;
      }

      if (!have_open_session(mds))
	continue;
    } else {
      session = mds_sessions.at(mds);
    }

    // send request.
    send_request(request, session.get());

    // wait for signal
    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
    request->kick = false;
    // adopt the already-held client_lock for the condvar wait, then release
    // the unique_lock without unlocking so the caller's lock state is intact
    std::unique_lock l{client_lock, std::adopt_lock};
    caller_cond.wait(l, [request] {
      return (request->reply ||	           // reply
	      request->resend_mds >= 0 ||  // forward
	      request->kick);
    });
    l.release();
    request->caller_cond = nullptr;

    // did we get a reply?
    if (request->reply)
      break;
  }

  if (!request->reply) {
    // aborted without a reply: clean up our bookkeeping and bail
    ceph_assert(request->aborted());
    ceph_assert(!request->got_unsafe);
    r = request->get_abort_code();
    request->item.remove_myself();
    unregister_request(request);
    put_request(request);
    return r;
  }

  // got it!
  auto reply = std::move(request->reply);
  r = reply->get_result();
  if (r >= 0)
    request->success = true;

  // kick dispatcher (we've got it!)
  ceph_assert(request->dispatch_cond);
  request->dispatch_cond->notify_all();
  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
  request->dispatch_cond = 0;

  // when the caller wants the target inode, chase down traceless replies
  if (r >= 0 && ptarget)
    r = verify_reply_trace(r, session.get(), request, reply, ptarget, pcreated, perms);

  if (pdirbl)
    *pdirbl = reply->get_extra_bl();

  // -- log times --
  utime_t lat = ceph_clock_now();
  lat -= request->sent_stamp;
  ldout(cct, 20) << "lat " << lat << dendl;
  logger->tinc(l_c_lat, lat);
  logger->tinc(l_c_reply, lat);

  put_request(request);
  return r;
}
1925
1926void Client::unregister_request(MetaRequest *req)
1927{
1928 mds_requests.erase(req->tid);
1929 if (req->tid == oldest_tid) {
1930 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1931 while (true) {
1932 if (p == mds_requests.end()) {
1933 oldest_tid = 0;
1934 break;
1935 }
1936 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1937 oldest_tid = p->first;
1938 break;
1939 }
1940 ++p;
1941 }
1942 }
1943 put_request(req);
1944}
1945
1946void Client::put_request(MetaRequest *request)
1947{
1948 if (request->_put()) {
1949 int op = -1;
1950 if (request->success)
1951 op = request->get_op();
1952 InodeRef other_in;
1953 request->take_other_inode(&other_in);
1954 delete request;
1955
1956 if (other_in &&
1957 (op == CEPH_MDS_OP_RMDIR ||
1958 op == CEPH_MDS_OP_RENAME ||
1959 op == CEPH_MDS_OP_RMSNAP)) {
1960 _try_to_trim_inode(other_in.get(), false);
1961 }
1962 }
1963}
1964
1965int Client::encode_inode_release(Inode *in, MetaRequest *req,
1966 mds_rank_t mds, int drop,
1967 int unless, int force)
1968{
11fdf7f2 1969 ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
f67539c2 1970 << " mds:" << mds << ", drop:" << ccap_string(drop) << ", unless:" << ccap_string(unless)
1911f103 1971 << ", force:" << force << ")" << dendl;
7c673cae 1972 int released = 0;
11fdf7f2
TL
1973 auto it = in->caps.find(mds);
1974 if (it != in->caps.end()) {
1975 Cap &cap = it->second;
7c673cae 1976 drop &= ~(in->dirty_caps | get_caps_used(in));
11fdf7f2
TL
1977 if ((drop & cap.issued) &&
1978 !(unless & cap.issued)) {
1911f103 1979 ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
11fdf7f2
TL
1980 cap.issued &= ~drop;
1981 cap.implemented &= ~drop;
7c673cae 1982 released = 1;
7c673cae
FG
1983 } else {
1984 released = force;
1985 }
1986 if (released) {
1911f103
TL
1987 cap.wanted = in->caps_wanted();
1988 if (&cap == in->auth_cap &&
1989 !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
1990 in->requested_max_size = 0;
1991 ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
1992 }
7c673cae
FG
1993 ceph_mds_request_release rel;
1994 rel.ino = in->ino;
11fdf7f2
TL
1995 rel.cap_id = cap.cap_id;
1996 rel.seq = cap.seq;
1997 rel.issue_seq = cap.issue_seq;
1998 rel.mseq = cap.mseq;
1999 rel.caps = cap.implemented;
2000 rel.wanted = cap.wanted;
7c673cae
FG
2001 rel.dname_len = 0;
2002 rel.dname_seq = 0;
2003 req->cap_releases.push_back(MClientRequest::Release(rel,""));
2004 }
2005 }
11fdf7f2 2006 ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
7c673cae
FG
2007 << released << dendl;
2008 return released;
2009}
2010
2011void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
2012 mds_rank_t mds, int drop, int unless)
2013{
11fdf7f2 2014 ldout(cct, 20) << __func__ << " enter(dn:"
7c673cae
FG
2015 << dn << ")" << dendl;
2016 int released = 0;
2017 if (dn->dir)
2018 released = encode_inode_release(dn->dir->parent_inode, req,
2019 mds, drop, unless, 1);
2020 if (released && dn->lease_mds == mds) {
2021 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
11fdf7f2 2022 auto& rel = req->cap_releases.back();
7c673cae
FG
2023 rel.item.dname_len = dn->name.length();
2024 rel.item.dname_seq = dn->lease_seq;
2025 rel.dname = dn->name;
adb31ebb 2026 dn->lease_mds = -1;
7c673cae 2027 }
11fdf7f2 2028 ldout(cct, 25) << __func__ << " exit(dn:"
7c673cae
FG
2029 << dn << ")" << dendl;
2030}
2031
2032
2033/*
2034 * This requires the MClientRequest *request member to be set.
2035 * It will error out horribly without one.
2036 * Additionally, if you set any *drop member, you'd better have
2037 * set the corresponding dentry!
2038 */
2039void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
2040{
11fdf7f2 2041 ldout(cct, 20) << __func__ << " enter (req: "
7c673cae
FG
2042 << req << ", mds: " << mds << ")" << dendl;
2043 if (req->inode_drop && req->inode())
2044 encode_inode_release(req->inode(), req,
2045 mds, req->inode_drop,
2046 req->inode_unless);
2047
2048 if (req->old_inode_drop && req->old_inode())
2049 encode_inode_release(req->old_inode(), req,
2050 mds, req->old_inode_drop,
2051 req->old_inode_unless);
2052 if (req->other_inode_drop && req->other_inode())
2053 encode_inode_release(req->other_inode(), req,
2054 mds, req->other_inode_drop,
2055 req->other_inode_unless);
2056
2057 if (req->dentry_drop && req->dentry())
2058 encode_dentry_release(req->dentry(), req,
2059 mds, req->dentry_drop,
2060 req->dentry_unless);
2061
2062 if (req->old_dentry_drop && req->old_dentry())
2063 encode_dentry_release(req->old_dentry(), req,
2064 mds, req->old_dentry_drop,
2065 req->old_dentry_unless);
11fdf7f2 2066 ldout(cct, 25) << __func__ << " exit (req: "
7c673cae
FG
2067 << req << ", mds " << mds <<dendl;
2068}
2069
2070bool Client::have_open_session(mds_rank_t mds)
2071{
11fdf7f2
TL
2072 const auto &it = mds_sessions.find(mds);
2073 return it != mds_sessions.end() &&
20effc67
TL
2074 (it->second->state == MetaSession::STATE_OPEN ||
2075 it->second->state == MetaSession::STATE_STALE);
7c673cae
FG
2076}
2077
20effc67 2078MetaSessionRef Client::_get_mds_session(mds_rank_t mds, Connection *con)
7c673cae 2079{
11fdf7f2 2080 const auto &it = mds_sessions.find(mds);
20effc67 2081 if (it == mds_sessions.end() || it->second->con != con) {
7c673cae 2082 return NULL;
11fdf7f2 2083 } else {
20effc67 2084 return it->second;
11fdf7f2 2085 }
7c673cae
FG
2086}
2087
20effc67 2088MetaSessionRef Client::_get_or_open_mds_session(mds_rank_t mds)
7c673cae 2089{
11fdf7f2 2090 auto it = mds_sessions.find(mds);
20effc67 2091 return it == mds_sessions.end() ? _open_mds_session(mds) : it->second;
7c673cae
FG
2092}
2093
2094/**
2095 * Populate a map of strings with client-identifying metadata,
2096 * such as the hostname. Call this once at initialization.
2097 */
2098void Client::populate_metadata(const std::string &mount_root)
2099{
2100 // Hostname
f67539c2
TL
2101#ifdef _WIN32
2102 // TODO: move this to compat.h
2103 char hostname[64];
2104 DWORD hostname_sz = 64;
2105 GetComputerNameA(hostname, &hostname_sz);
2106 metadata["hostname"] = hostname;
2107#else
7c673cae
FG
2108 struct utsname u;
2109 int r = uname(&u);
2110 if (r >= 0) {
2111 metadata["hostname"] = u.nodename;
2112 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
2113 } else {
2114 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
2115 }
f67539c2 2116#endif
7c673cae
FG
2117
2118 metadata["pid"] = stringify(getpid());
2119
2120 // Ceph entity id (the '0' in "client.0")
2121 metadata["entity_id"] = cct->_conf->name.get_id();
2122
2123 // Our mount position
2124 if (!mount_root.empty()) {
2125 metadata["root"] = mount_root;
2126 }
2127
2128 // Ceph version
2129 metadata["ceph_version"] = pretty_version_to_str();
2130 metadata["ceph_sha1"] = git_version_to_str();
2131
2132 // Apply any metadata from the user's configured overrides
2133 std::vector<std::string> tokens;
2134 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2135 for (const auto &i : tokens) {
2136 auto eqpos = i.find("=");
2137 // Throw out anything that isn't of the form "<str>=<str>"
2138 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
2139 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2140 continue;
2141 }
2142 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2143 }
2144}
2145
2146/**
2147 * Optionally add or override client metadata fields.
2148 */
2149void Client::update_metadata(std::string const &k, std::string const &v)
2150{
f67539c2
TL
2151 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2152 ceph_assert(iref_reader.is_state_satisfied());
2153
2154 std::scoped_lock l(client_lock);
7c673cae 2155
11fdf7f2
TL
2156 auto it = metadata.find(k);
2157 if (it != metadata.end()) {
7c673cae 2158 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
11fdf7f2 2159 << "' from '" << it->second << "' to '" << v << "'" << dendl;
7c673cae
FG
2160 }
2161
2162 metadata[k] = v;
2163}
2164
20effc67 2165MetaSessionRef Client::_open_mds_session(mds_rank_t mds)
7c673cae 2166{
11fdf7f2
TL
2167 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
2168 auto addrs = mdsmap->get_addrs(mds);
2169 auto em = mds_sessions.emplace(std::piecewise_construct,
2170 std::forward_as_tuple(mds),
20effc67 2171 std::forward_as_tuple(new MetaSession(mds, messenger->connect_to_mds(addrs), addrs)));
11fdf7f2 2172 ceph_assert(em.second); /* not already present */
20effc67 2173 auto session = em.first->second;
7c673cae 2174
9f95a23c 2175 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
11fdf7f2
TL
2176 m->metadata = metadata;
2177 m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
f67539c2 2178 m->metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
11fdf7f2 2179 session->con->send_message2(std::move(m));
7c673cae
FG
2180 return session;
2181}
2182
2183void Client::_close_mds_session(MetaSession *s)
2184{
11fdf7f2 2185 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
7c673cae 2186 s->state = MetaSession::STATE_CLOSING;
9f95a23c 2187 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
7c673cae
FG
2188}
2189
f6b5b4d7 2190void Client::_closed_mds_session(MetaSession *s, int err, bool rejected)
7c673cae 2191{
11fdf7f2 2192 ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
f6b5b4d7
TL
2193 if (rejected && s->state != MetaSession::STATE_CLOSING)
2194 s->state = MetaSession::STATE_REJECTED;
2195 else
2196 s->state = MetaSession::STATE_CLOSED;
7c673cae
FG
2197 s->con->mark_down();
2198 signal_context_list(s->waiting_for_open);
9f95a23c 2199 mount_cond.notify_all();
f6b5b4d7 2200 remove_session_caps(s, err);
7c673cae 2201 kick_requests_closed(s);
f6b5b4d7
TL
2202 mds_ranks_closing.erase(s->mds_num);
2203 if (s->state == MetaSession::STATE_CLOSED)
2204 mds_sessions.erase(s->mds_num);
7c673cae
FG
2205}
2206
11fdf7f2 2207void Client::handle_client_session(const MConstRef<MClientSession>& m)
7c673cae
FG
2208{
2209 mds_rank_t from = mds_rank_t(m->get_source().num());
11fdf7f2 2210 ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;
7c673cae 2211
f67539c2 2212 std::scoped_lock cl(client_lock);
20effc67 2213 auto session = _get_mds_session(from, m->get_connection().get());
7c673cae
FG
2214 if (!session) {
2215 ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
7c673cae
FG
2216 return;
2217 }
2218
2219 switch (m->get_op()) {
2220 case CEPH_SESSION_OPEN:
11fdf7f2
TL
2221 {
2222 feature_bitset_t missing_features(CEPHFS_FEATURES_CLIENT_REQUIRED);
2223 missing_features -= m->supported_features;
2224 if (!missing_features.empty()) {
2225 lderr(cct) << "mds." << from << " lacks required features '"
2226 << missing_features << "', closing session " << dendl;
20effc67
TL
2227 _close_mds_session(session.get());
2228 _closed_mds_session(session.get(), -CEPHFS_EPERM, true);
11fdf7f2
TL
2229 break;
2230 }
2231 session->mds_features = std::move(m->supported_features);
33c7a0ef 2232 session->mds_metric_flags = std::move(m->metric_spec.metric_flags);
11fdf7f2 2233
20effc67 2234 renew_caps(session.get());
11fdf7f2 2235 session->state = MetaSession::STATE_OPEN;
f67539c2 2236 if (is_unmounting())
9f95a23c 2237 mount_cond.notify_all();
11fdf7f2
TL
2238 else
2239 connect_mds_targets(from);
2240 signal_context_list(session->waiting_for_open);
2241 break;
2242 }
7c673cae
FG
2243
2244 case CEPH_SESSION_CLOSE:
20effc67 2245 _closed_mds_session(session.get());
7c673cae
FG
2246 break;
2247
2248 case CEPH_SESSION_RENEWCAPS:
2249 if (session->cap_renew_seq == m->get_seq()) {
a8e16298 2250 bool was_stale = ceph_clock_now() >= session->cap_ttl;
7c673cae
FG
2251 session->cap_ttl =
2252 session->last_cap_renew_request + mdsmap->get_session_timeout();
a8e16298 2253 if (was_stale)
20effc67 2254 wake_up_session_caps(session.get(), false);
7c673cae
FG
2255 }
2256 break;
2257
2258 case CEPH_SESSION_STALE:
28e407b8
AA
2259 // invalidate session caps/leases
2260 session->cap_gen++;
2261 session->cap_ttl = ceph_clock_now();
2262 session->cap_ttl -= 1;
20effc67 2263 renew_caps(session.get());
7c673cae
FG
2264 break;
2265
2266 case CEPH_SESSION_RECALL_STATE:
f67539c2
TL
2267 /*
2268 * Call the renew caps and flush cap releases just before
2269 * triming the caps in case the tick() won't get a chance
2270 * to run them, which could cause the client to be blocklisted
2271 * and MDS daemons trying to recall the caps again and
2272 * again.
2273 *
2274 * In most cases it will do nothing, and the new cap releases
2275 * added by trim_caps() followed will be deferred flushing
2276 * by tick().
2277 */
2278 renew_and_flush_cap_releases();
20effc67 2279 trim_caps(session.get(), m->get_max_caps());
7c673cae
FG
2280 break;
2281
2282 case CEPH_SESSION_FLUSHMSG:
a8e16298 2283 /* flush cap release */
11fdf7f2
TL
2284 if (auto& m = session->release; m) {
2285 session->con->send_message2(std::move(m));
a8e16298 2286 }
9f95a23c 2287 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
7c673cae
FG
2288 break;
2289
2290 case CEPH_SESSION_FORCE_RO:
20effc67 2291 force_session_readonly(session.get());
7c673cae
FG
2292 break;
2293
2294 case CEPH_SESSION_REJECT:
11fdf7f2
TL
2295 {
2296 std::string_view error_str;
2297 auto it = m->metadata.find("error_string");
2298 if (it != m->metadata.end())
2299 error_str = it->second;
2300 else
2301 error_str = "unknown error";
2302 lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;
7c673cae 2303
20effc67 2304 _closed_mds_session(session.get(), -CEPHFS_EPERM, true);
11fdf7f2 2305 }
7c673cae
FG
2306 break;
2307
2308 default:
2309 ceph_abort();
2310 }
7c673cae
FG
2311}
2312
2313bool Client::_any_stale_sessions() const
2314{
9f95a23c 2315 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae 2316
11fdf7f2 2317 for (const auto &p : mds_sessions) {
20effc67 2318 if (p.second->state == MetaSession::STATE_STALE) {
7c673cae
FG
2319 return true;
2320 }
2321 }
2322
2323 return false;
2324}
2325
2326void Client::_kick_stale_sessions()
2327{
11fdf7f2 2328 ldout(cct, 1) << __func__ << dendl;
7c673cae 2329
11fdf7f2 2330 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
20effc67
TL
2331 auto s = it->second;
2332 if (s->state == MetaSession::STATE_REJECTED) {
2333 mds_sessions.erase(it->first);
f6b5b4d7
TL
2334 continue;
2335 }
20effc67
TL
2336 if (s->state == MetaSession::STATE_STALE)
2337 _closed_mds_session(s.get());
7c673cae
FG
2338 }
2339}
2340
2341void Client::send_request(MetaRequest *request, MetaSession *session,
2342 bool drop_cap_releases)
2343{
2344 // make the request
2345 mds_rank_t mds = session->mds_num;
11fdf7f2 2346 ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
7c673cae 2347 << " for mds." << mds << dendl;
11fdf7f2 2348 auto r = build_client_request(request);
7c673cae
FG
2349 if (request->dentry()) {
2350 r->set_dentry_wanted();
2351 }
2352 if (request->got_unsafe) {
2353 r->set_replayed_op();
2354 if (request->target)
2355 r->head.ino = request->target->ino;
2356 } else {
2357 encode_cap_releases(request, mds);
2358 if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
2359 request->cap_releases.clear();
2360 else
2361 r->releases.swap(request->cap_releases);
2362 }
2363 r->set_mdsmap_epoch(mdsmap->get_epoch());
2364 if (r->head.op == CEPH_MDS_OP_SETXATTR) {
2365 objecter->with_osdmap([r](const OSDMap& o) {
2366 r->set_osdmap_epoch(o.get_epoch());
2367 });
2368 }
2369
2370 if (request->mds == -1) {
2371 request->sent_stamp = ceph_clock_now();
11fdf7f2 2372 ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
7c673cae
FG
2373 }
2374 request->mds = mds;
2375
2376 Inode *in = request->inode();
11fdf7f2
TL
2377 if (in) {
2378 auto it = in->caps.find(mds);
2379 if (it != in->caps.end()) {
2380 request->sent_on_mseq = it->second.mseq;
2381 }
2382 }
7c673cae
FG
2383
2384 session->requests.push_back(&request->item);
2385
11fdf7f2
TL
2386 ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
2387 session->con->send_message2(std::move(r));
7c673cae
FG
2388}
2389
9f95a23c 2390ref_t<MClientRequest> Client::build_client_request(MetaRequest *request)
7c673cae 2391{
9f95a23c 2392 auto req = make_message<MClientRequest>(request->get_op());
7c673cae
FG
2393 req->set_tid(request->tid);
2394 req->set_stamp(request->op_stamp);
2395 memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2396
2397 // if the filepath's haven't been set, set them!
2398 if (request->path.empty()) {
2399 Inode *in = request->inode();
2400 Dentry *de = request->dentry();
2401 if (in)
2402 in->make_nosnap_relative_path(request->path);
2403 else if (de) {
2404 if (de->inode)
2405 de->inode->make_nosnap_relative_path(request->path);
2406 else if (de->dir) {
2407 de->dir->parent_inode->make_nosnap_relative_path(request->path);
2408 request->path.push_dentry(de->name);
2409 }
2410 else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2411 << " No path, inode, or appropriately-endowed dentry given!"
2412 << dendl;
2413 } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2414 << " No path, inode, or dentry given!"
2415 << dendl;
2416 }
2417 req->set_filepath(request->get_filepath());
2418 req->set_filepath2(request->get_filepath2());
f67539c2 2419 req->set_alternate_name(request->alternate_name);
7c673cae
FG
2420 req->set_data(request->data);
2421 req->set_retry_attempt(request->retry_attempt++);
2422 req->head.num_fwd = request->num_fwd;
2423 const gid_t *_gids;
2424 int gid_count = request->perms.get_gids(&_gids);
2425 req->set_gid_list(gid_count, _gids);
2426 return req;
2427}
2428
2429
2430
11fdf7f2 2431void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
7c673cae
FG
2432{
2433 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
f67539c2
TL
2434
2435 std::scoped_lock cl(client_lock);
20effc67 2436 auto session = _get_mds_session(mds, fwd->get_connection().get());
7c673cae 2437 if (!session) {
7c673cae
FG
2438 return;
2439 }
2440 ceph_tid_t tid = fwd->get_tid();
2441
2442 if (mds_requests.count(tid) == 0) {
11fdf7f2 2443 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
7c673cae
FG
2444 return;
2445 }
2446
2447 MetaRequest *request = mds_requests[tid];
11fdf7f2 2448 ceph_assert(request);
7c673cae 2449
33c7a0ef
TL
2450 /*
2451 * The type of 'num_fwd' in ceph 'MClientRequestForward'
2452 * is 'int32_t', while in 'ceph_mds_request_head' the
2453 * type is '__u8'. So in case the request bounces between
2454 * MDSes exceeding 256 times, the client will get stuck.
2455 *
2456 * In this case it's ususally a bug in MDS and continue
2457 * bouncing the request makes no sense.
2458 *
2459 * In future this could be fixed in ceph code, so avoid
2460 * using the hardcode here.
2461 */
2462 int max_fwd = sizeof(((struct ceph_mds_request_head*)0)->num_fwd);
2463 max_fwd = 1 << (max_fwd * CHAR_BIT) - 1;
2464 auto num_fwd = fwd->get_num_fwd();
2465 if (num_fwd <= request->num_fwd || num_fwd >= max_fwd) {
2466 if (request->num_fwd >= max_fwd || num_fwd >= max_fwd) {
2467 request->abort(-EMULTIHOP);
2468 request->caller_cond->notify_all();
2469 ldout(cct, 1) << __func__ << " tid " << tid << " seq overflow"
2470 << ", abort it" << dendl;
2471 } else {
2472 ldout(cct, 10) << __func__ << " tid " << tid
2473 << " old fwd seq " << fwd->get_num_fwd()
2474 << " <= req fwd " << request->num_fwd
2475 << ", ignore it" << dendl;
2476 }
2477 return;
2478 }
2479
7c673cae
FG
2480 // reset retry counter
2481 request->retry_attempt = 0;
2482
2483 // request not forwarded, or dest mds has no session.
2484 // resend.
11fdf7f2 2485 ldout(cct, 10) << __func__ << " tid " << tid
7c673cae
FG
2486 << " fwd " << fwd->get_num_fwd()
2487 << " to mds." << fwd->get_dest_mds()
2488 << ", resending to " << fwd->get_dest_mds()
2489 << dendl;
2490
2491 request->mds = -1;
2492 request->item.remove_myself();
33c7a0ef 2493 request->num_fwd = num_fwd;
7c673cae 2494 request->resend_mds = fwd->get_dest_mds();
9f95a23c 2495 request->caller_cond->notify_all();
7c673cae
FG
2496}
2497
2498bool Client::is_dir_operation(MetaRequest *req)
2499{
2500 int op = req->get_op();
2501 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2502 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2503 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2504 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2505 return true;
2506 return false;
2507}
2508
11fdf7f2 2509void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
7c673cae
FG
2510{
2511 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
f67539c2
TL
2512
2513 std::scoped_lock cl(client_lock);
20effc67 2514 auto session = _get_mds_session(mds_num, reply->get_connection().get());
7c673cae 2515 if (!session) {
7c673cae
FG
2516 return;
2517 }
2518
2519 ceph_tid_t tid = reply->get_tid();
2520 bool is_safe = reply->is_safe();
2521
2522 if (mds_requests.count(tid) == 0) {
11fdf7f2 2523 lderr(cct) << __func__ << " no pending request on tid " << tid
7c673cae 2524 << " safe is:" << is_safe << dendl;
7c673cae
FG
2525 return;
2526 }
2527 MetaRequest *request = mds_requests.at(tid);
2528
11fdf7f2 2529 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
7c673cae
FG
2530 << " tid " << tid << dendl;
2531
2532 if (request->got_unsafe && !is_safe) {
2533 //duplicate response
2534 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2535 << mds_num << " safe:" << is_safe << dendl;
7c673cae
FG
2536 return;
2537 }
2538
f67539c2 2539 if (-CEPHFS_ESTALE == reply->get_result()) { // see if we can get to proper MDS
7c673cae
FG
2540 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2541 << " from mds." << request->mds << dendl;
2542 request->send_to_auth = true;
2543 request->resend_mds = choose_target_mds(request);
2544 Inode *in = request->inode();
11fdf7f2 2545 std::map<mds_rank_t, Cap>::const_iterator it;
7c673cae
FG
2546 if (request->resend_mds >= 0 &&
2547 request->resend_mds == request->mds &&
2548 (in == NULL ||
11fdf7f2
TL
2549 (it = in->caps.find(request->resend_mds)) != in->caps.end() ||
2550 request->sent_on_mseq == it->second.mseq)) {
2551 ldout(cct, 20) << "have to return ESTALE" << dendl;
7c673cae 2552 } else {
9f95a23c 2553 request->caller_cond->notify_all();
7c673cae
FG
2554 return;
2555 }
7c673cae
FG
2556 }
2557
11fdf7f2 2558 ceph_assert(!request->reply);
7c673cae 2559 request->reply = reply;
20effc67 2560 insert_trace(request, session.get());
7c673cae
FG
2561
2562 // Handle unsafe reply
2563 if (!is_safe) {
2564 request->got_unsafe = true;
2565 session->unsafe_requests.push_back(&request->unsafe_item);
2566 if (is_dir_operation(request)) {
2567 Inode *dir = request->inode();
11fdf7f2 2568 ceph_assert(dir);
7c673cae
FG
2569 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2570 }
2571 if (request->target) {
2572 InodeRef &in = request->target;
2573 in->unsafe_ops.push_back(&request->unsafe_target_item);
2574 }
2575 }
2576
2577 // Only signal the caller once (on the first reply):
2578 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2579 if (!is_safe || !request->got_unsafe) {
9f95a23c 2580 ceph::condition_variable cond;
7c673cae
FG
2581 request->dispatch_cond = &cond;
2582
2583 // wake up waiter
11fdf7f2 2584 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
9f95a23c 2585 request->caller_cond->notify_all();
7c673cae
FG
2586
2587 // wake for kick back
9f95a23c
TL
2588 std::unique_lock l{client_lock, std::adopt_lock};
2589 cond.wait(l, [tid, request, &cond, this] {
2590 if (request->dispatch_cond) {
2591 ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
2592 << tid << " " << &cond << dendl;
2593 }
2594 return !request->dispatch_cond;
2595 });
2596 l.release();
7c673cae
FG
2597 }
2598
2599 if (is_safe) {
2600 // the filesystem change is committed to disk
2601 // we're done, clean up
2602 if (request->got_unsafe) {
2603 request->unsafe_item.remove_myself();
2604 request->unsafe_dir_item.remove_myself();
2605 request->unsafe_target_item.remove_myself();
2606 signal_cond_list(request->waitfor_safe);
2607 }
2608 request->item.remove_myself();
2609 unregister_request(request);
2610 }
f67539c2 2611 if (is_unmounting())
9f95a23c 2612 mount_cond.notify_all();
7c673cae
FG
2613}
2614
2615void Client::_handle_full_flag(int64_t pool)
2616{
2617 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2618 << "on " << pool << dendl;
f67539c2 2619 // Cancel all outstanding ops in this pool with -CEPHFS_ENOSPC: it is necessary
7c673cae
FG
2620 // to do this rather than blocking, because otherwise when we fill up we
2621 // potentially lock caps forever on files with dirty pages, and we need
2622 // to be able to release those caps to the MDS so that it can delete files
2623 // and free up space.
f67539c2 2624 epoch_t cancelled_epoch = objecter->op_cancel_writes(-CEPHFS_ENOSPC, pool);
7c673cae
FG
2625
2626 // For all inodes with layouts in this pool and a pending flush write op
2627 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2628 // from ObjectCacher so that it doesn't re-issue the write in response to
2629 // the ENOSPC error.
2630 // Fortunately since we're cancelling everything in a given pool, we don't
2631 // need to know which ops belong to which ObjectSet, we can just blow all
2632 // the un-flushed cached data away and mark any dirty inodes' async_err
f67539c2 2633 // field with -CEPHFS_ENOSPC as long as we're sure all the ops we cancelled were
7c673cae
FG
2634 // affecting this pool, and all the objectsets we're purging were also
2635 // in this pool.
2636 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2637 i != inode_map.end(); ++i)
2638 {
2639 Inode *inode = i->second;
2640 if (inode->oset.dirty_or_tx
2641 && (pool == -1 || inode->layout.pool_id == pool)) {
2642 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2643 << " has dirty objects, purging and setting ENOSPC" << dendl;
2644 objectcacher->purge_set(&inode->oset);
f67539c2 2645 inode->set_async_err(-CEPHFS_ENOSPC);
7c673cae
FG
2646 }
2647 }
2648
2649 if (cancelled_epoch != (epoch_t)-1) {
2650 set_cap_epoch_barrier(cancelled_epoch);
2651 }
2652}
2653
11fdf7f2 2654void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
7c673cae 2655{
f67539c2 2656 std::scoped_lock cl(client_lock);
31f18b77 2657
11fdf7f2 2658 const auto myaddrs = messenger->get_myaddrs();
33c7a0ef 2659 bool new_blocklist = objecter->with_osdmap(
11fdf7f2 2660 [&](const OSDMap& o) {
33c7a0ef 2661 return o.is_blocklisted(myaddrs);
11fdf7f2 2662 });
33c7a0ef
TL
2663
2664 if (new_blocklist && !blocklisted) {
31f18b77
FG
2665 auto epoch = objecter->with_osdmap([](const OSDMap &o){
2666 return o.get_epoch();
2667 });
f67539c2
TL
2668 lderr(cct) << "I was blocklisted at osd epoch " << epoch << dendl;
2669 blocklisted = true;
31f18b77 2670
f67539c2 2671 _abort_mds_sessions(-CEPHFS_EBLOCKLISTED);
31f18b77
FG
2672
2673 // Since we know all our OSD ops will fail, cancel them all preemtively,
2674 // so that on an unhealthy cluster we can umount promptly even if e.g.
2675 // some PGs were inaccessible.
f67539c2
TL
2676 objecter->op_cancel_writes(-CEPHFS_EBLOCKLISTED);
2677
2678 }
31f18b77 2679
f67539c2
TL
2680 if (blocklisted) {
2681 // Handle case where we were blocklisted but no longer are
2682 blocklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
2683 return o.is_blocklisted(myaddrs);});
31f18b77
FG
2684 }
2685
f67539c2
TL
2686 // Always subscribe to next osdmap for blocklisted client
2687 // until this client is not blocklisted.
2688 if (blocklisted) {
f64942e4
AA
2689 objecter->maybe_request_map();
2690 }
2691
7c673cae
FG
2692 if (objecter->osdmap_full_flag()) {
2693 _handle_full_flag(-1);
2694 } else {
2695 // Accumulate local list of full pools so that I can drop
2696 // the objecter lock before re-entering objecter in
2697 // cancel_writes
2698 std::vector<int64_t> full_pools;
2699
2700 objecter->with_osdmap([&full_pools](const OSDMap &o) {
2701 for (const auto& kv : o.get_pools()) {
2702 if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2703 full_pools.push_back(kv.first);
2704 }
2705 }
2706 });
2707
2708 for (auto p : full_pools)
2709 _handle_full_flag(p);
2710
2711 // Subscribe to subsequent maps to watch for the full flag going
2712 // away. For the global full flag objecter does this for us, but
2713 // it pays no attention to the per-pool full flag so in this branch
2714 // we do it ourselves.
2715 if (!full_pools.empty()) {
2716 objecter->maybe_request_map();
2717 }
2718 }
7c673cae
FG
2719}
2720
2721
2722// ------------------------
2723// incoming messages
2724
2725
11fdf7f2 2726bool Client::ms_dispatch2(const MessageRef &m)
7c673cae 2727{
f67539c2
TL
2728 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2729 if (!iref_reader.is_state_satisfied()) {
7c673cae 2730 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
7c673cae
FG
2731 return true;
2732 }
2733
2734 switch (m->get_type()) {
2735 // mounting and mds sessions
2736 case CEPH_MSG_MDS_MAP:
9f95a23c 2737 handle_mds_map(ref_cast<MMDSMap>(m));
7c673cae
FG
2738 break;
2739 case CEPH_MSG_FS_MAP:
9f95a23c 2740 handle_fs_map(ref_cast<MFSMap>(m));
7c673cae
FG
2741 break;
2742 case CEPH_MSG_FS_MAP_USER:
9f95a23c 2743 handle_fs_map_user(ref_cast<MFSMapUser>(m));
7c673cae
FG
2744 break;
2745 case CEPH_MSG_CLIENT_SESSION:
9f95a23c 2746 handle_client_session(ref_cast<MClientSession>(m));
7c673cae
FG
2747 break;
2748
2749 case CEPH_MSG_OSD_MAP:
9f95a23c 2750 handle_osd_map(ref_cast<MOSDMap>(m));
7c673cae
FG
2751 break;
2752
2753 // requests
2754 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
9f95a23c 2755 handle_client_request_forward(ref_cast<MClientRequestForward>(m));
7c673cae
FG
2756 break;
2757 case CEPH_MSG_CLIENT_REPLY:
9f95a23c 2758 handle_client_reply(ref_cast<MClientReply>(m));
11fdf7f2
TL
2759 break;
2760
2761 // reclaim reply
2762 case CEPH_MSG_CLIENT_RECLAIM_REPLY:
9f95a23c 2763 handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
7c673cae
FG
2764 break;
2765
2766 case CEPH_MSG_CLIENT_SNAP:
9f95a23c 2767 handle_snap(ref_cast<MClientSnap>(m));
7c673cae
FG
2768 break;
2769 case CEPH_MSG_CLIENT_CAPS:
9f95a23c 2770 handle_caps(ref_cast<MClientCaps>(m));
7c673cae
FG
2771 break;
2772 case CEPH_MSG_CLIENT_LEASE:
9f95a23c 2773 handle_lease(ref_cast<MClientLease>(m));
7c673cae
FG
2774 break;
2775 case MSG_COMMAND_REPLY:
2776 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
9f95a23c 2777 handle_command_reply(ref_cast<MCommandReply>(m));
7c673cae
FG
2778 } else {
2779 return false;
2780 }
2781 break;
2782 case CEPH_MSG_CLIENT_QUOTA:
9f95a23c 2783 handle_quota(ref_cast<MClientQuota>(m));
7c673cae
FG
2784 break;
2785
2786 default:
2787 return false;
2788 }
2789
2790 // unmounting?
f67539c2
TL
2791 std::scoped_lock cl(client_lock);
2792 if (is_unmounting()) {
7c673cae
FG
2793 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2794 << "+" << inode_map.size() << dendl;
f67539c2 2795 uint64_t size = lru.lru_get_size() + inode_map.size();
7c673cae 2796 trim_cache();
f67539c2 2797 if (size > lru.lru_get_size() + inode_map.size()) {
7c673cae 2798 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
9f95a23c 2799 mount_cond.notify_all();
7c673cae
FG
2800 } else {
2801 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2802 << "+" << inode_map.size() << dendl;
2803 }
2804 }
2805
2806 return true;
2807}
2808
11fdf7f2 2809void Client::handle_fs_map(const MConstRef<MFSMap>& m)
7c673cae 2810{
f67539c2 2811 std::scoped_lock cl(client_lock);
7c673cae 2812 fsmap.reset(new FSMap(m->get_fsmap()));
7c673cae
FG
2813
2814 signal_cond_list(waiting_for_fsmap);
2815
2816 monclient->sub_got("fsmap", fsmap->get_epoch());
2817}
2818
// Handle the user-visible (compact) variant of the FS map. Same pattern
// as handle_fs_map(), but updates fsmap_user and the "fsmap.user"
// monitor subscription instead.
void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
{
  std::scoped_lock cl(client_lock);
  fsmap_user.reset(new FSMapUser);
  *fsmap_user = m->get_fsmap();

  // Ack the consumed epoch, then wake waiters that poll fsmap_user.
  monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
  signal_cond_list(waiting_for_fsmap);
}
2828
// Cancel all the commands for missing or laggy GIDs
//
// Walks the outstanding MDS command table and aborts every command whose
// target MDS gid is gone or laggy in @newmap: fills in the caller's error
// string, drops the connection, and completes the on_finish context with
// -CEPHFS_ETIMEDOUT. Erasure is deferred to a second pass so the command
// table is not mutated while being iterated.
void Client::cancel_commands(const MDSMap& newmap)
{
  std::vector<ceph_tid_t> cancel_ops;

  std::scoped_lock cmd_lock(command_lock);
  auto &commands = command_table.get_commands();
  for (const auto &[tid, op] : commands) {
    const mds_gid_t op_mds_gid = op.mds_gid;
    if (newmap.is_dne_gid(op_mds_gid) || newmap.is_laggy_gid(op_mds_gid)) {
      ldout(cct, 1) << __func__ << ": cancelling command op " << tid << dendl;
      cancel_ops.push_back(tid);
      // Report the failure reason back through the caller-supplied string.
      if (op.outs) {
        std::ostringstream ss;
        ss << "MDS " << op_mds_gid << " went away";
        *(op.outs) = ss.str();
      }
      /*
       * No need to take client_lock around con->mark_down() here,
       * because the connection has its own lock.
       */
      op.con->mark_down();
      if (op.on_finish)
        op.on_finish->complete(-CEPHFS_ETIMEDOUT);
    }
  }

  // Second pass: erase outside the iteration above.
  for (const auto &tid : cancel_ops)
    command_table.erase(tid);
}
2860
// Handle a new MDSMap epoch: decode it, cancel commands aimed at MDSs
// that vanished, swap in the new map, and walk every open MetaSession to
// react to per-rank state transitions (reconnect, activation, closure).
void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
{
  std::unique_lock cl(client_lock);
  // Ignore stale or duplicate epochs.
  if (m->get_epoch() <= mdsmap->get_epoch()) {
    ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
                  << " is identical to or older than our "
                  << mdsmap->get_epoch() << dendl;
    return;
  }

  // NOTE(review): client_lock is dropped around cancel_commands(), which
  // takes command_lock — presumably to keep the lock ordering between the
  // two mutexes consistent; confirm against other command_lock users.
  cl.unlock();
  ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
  std::unique_ptr<MDSMap> _mdsmap(new MDSMap);
  _mdsmap->decode(m->get_encoded());
  cancel_commands(*_mdsmap.get());
  cl.lock();

  // After the swap, mdsmap is the new map and _mdsmap holds the old one,
  // which is still consulted below for each rank's previous state.
  _mdsmap.swap(mdsmap);

  // reset session
  for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
    mds_rank_t mds = p->first;
    MetaSessionRef session = p->second;
    // Advance before acting: _closed_mds_session() may erase this entry.
    ++p;

    int oldstate = _mdsmap->get_state(mds);
    int newstate = mdsmap->get_state(mds);
    if (!mdsmap->is_up(mds)) {
      session->con->mark_down();
    } else if (mdsmap->get_addrs(mds) != session->addrs) {
      // Same rank, different address: either a restarted daemon (new
      // incarnation) or an address change.
      auto old_inc = _mdsmap->get_incarnation(mds);
      auto new_inc = mdsmap->get_incarnation(mds);
      if (old_inc != new_inc) {
        ldout(cct, 1) << "mds incarnation changed from "
                      << old_inc << " to " << new_inc << dendl;
        oldstate = MDSMap::STATE_NULL;
      }
      session->con->mark_down();
      session->addrs = mdsmap->get_addrs(mds);
      // When new MDS starts to take over, notify kernel to trim unused entries
      // in its dcache/icache. Hopefully, the kernel will release some unused
      // inodes before the new MDS enters reconnect state.
      trim_cache_for_reconnect(session.get());
    } else if (oldstate == newstate)
      continue;  // no change

    session->mds_state = newstate;
    if (newstate == MDSMap::STATE_RECONNECT) {
      // MDS is asking clients to reconnect: re-establish the connection
      // and replay our caps/requests to it.
      session->con = messenger->connect_to_mds(session->addrs);
      send_reconnect(session.get());
    } else if (newstate > MDSMap::STATE_RECONNECT) {
      if (oldstate < MDSMap::STATE_RECONNECT) {
        // We skipped the reconnect window entirely; the session is lost.
        ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
        _closed_mds_session(session.get());
        continue;
      }
      if (newstate >= MDSMap::STATE_ACTIVE) {
        if (oldstate < MDSMap::STATE_ACTIVE) {
          // kick new requests
          kick_requests(session.get());
          kick_flushing_caps(session.get());
          signal_context_list(session->waiting_for_open);
          wake_up_session_caps(session.get(), true);
        }
        connect_mds_targets(mds);
      }
    } else if (newstate == MDSMap::STATE_NULL &&
               mds >= mdsmap->get_max_mds()) {
      // Rank no longer exists in the cluster; close the session.
      _closed_mds_session(session.get());
    }
  }

  // kick any waiting threads
  signal_cond_list(waiting_for_mdsmap);

  monclient->sub_got("mdsmap", mdsmap->get_epoch());
}
2938
// Send an MClientReconnect to an MDS entering the RECONNECT state:
// re-advertise every cap we hold from that rank (with sequence numbers
// reset), the snaprealms they belong to, and file locks, so the MDS can
// rebuild its client state. When the MDS supports MULTI_RECONNECT the
// payload may be split across several messages.
void Client::send_reconnect(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " to mds." << mds << dendl;

  // trim unused caps to reduce MDS's cache rejoin time
  trim_cache_for_reconnect(session);

  session->readonly = false;

  // Drop any pending batched cap releases; they are obsolete now.
  session->release.reset();

  // reset my cap seq number
  session->seq = 0;
  //connect to the mds' offload targets
  connect_mds_targets(mds);
  //make sure unsafe requests get saved
  resend_unsafe_requests(session);

  early_kick_flushing_caps(session);

  auto m = make_message<MClientReconnect>();
  bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);

  // i have an open session.
  ceph::unordered_set<inodeno_t> did_snaprealm;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
       p != inode_map.end();
       ++p) {
    Inode *in = p->second;
    auto it = in->caps.find(mds);
    if (it != in->caps.end()) {
      // If the message is getting close to the (int-based) size limit and
      // the MDS can take multiple reconnect messages, flush this one and
      // start a fresh message.
      if (allow_multi &&
          m->get_approx_size() >=
          static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
        m->mark_more();
        session->con->send_message2(std::move(m));

        m = make_message<MClientReconnect>();
      }

      Cap &cap = it->second;
      ldout(cct, 10) << " caps on " << p->first
                     << " " << ccap_string(cap.issued)
                     << " wants " << ccap_string(in->caps_wanted())
                     << dendl;
      filepath path;
      in->make_short_path(path);
      ldout(cct, 10) << " path " << path << dendl;

      bufferlist flockbl;
      _encode_filelocks(in, flockbl);

      cap.seq = 0;  // reset seq.
      cap.issue_seq = 0;  // reset seq.
      cap.mseq = 0;  // reset seq.
      // cap gen should catch up with session cap_gen
      if (cap.gen < session->cap_gen) {
        cap.gen = session->cap_gen;
        // Stale cap: claim only the PIN cap until the MDS reissues.
        cap.issued = cap.implemented = CEPH_CAP_PIN;
      } else {
        cap.issued = cap.implemented;
      }
      snapid_t snap_follows = 0;
      if (!in->cap_snaps.empty())
        snap_follows = in->cap_snaps.begin()->first;

      m->add_cap(p->first.ino,
                 cap.cap_id,
                 path.get_ino(), path.get_path(),  // ino
                 in->caps_wanted(),  // wanted
                 cap.issued,  // issued
                 in->snaprealm->ino,
                 snap_follows,
                 flockbl);

      // Describe each snaprealm at most once per reconnect.
      if (did_snaprealm.count(in->snaprealm->ino) == 0) {
        ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
        m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
        did_snaprealm.insert(in->snaprealm->ino);
      }
    }
  }

  if (!allow_multi)
    m->set_encoding_version(0); // use connection features to choose encoding
  session->con->send_message2(std::move(m));

  mount_cond.notify_all();

  // A reclaiming session is waiting for the reconnect to go out.
  if (session->reclaim_state == MetaSession::RECLAIMING)
    signal_cond_list(waiting_for_reclaim);
}
3032
3033
3034void Client::kick_requests(MetaSession *session)
3035{
11fdf7f2 3036 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
7c673cae
FG
3037 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3038 p != mds_requests.end();
3039 ++p) {
31f18b77
FG
3040 MetaRequest *req = p->second;
3041 if (req->got_unsafe)
3042 continue;
3043 if (req->aborted()) {
3044 if (req->caller_cond) {
3045 req->kick = true;
9f95a23c 3046 req->caller_cond->notify_all();
31f18b77 3047 }
7c673cae 3048 continue;
31f18b77
FG
3049 }
3050 if (req->retry_attempt > 0)
7c673cae 3051 continue; // new requests only
31f18b77 3052 if (req->mds == session->mds_num) {
7c673cae
FG
3053 send_request(p->second, session);
3054 }
3055 }
3056}
3057
3058void Client::resend_unsafe_requests(MetaSession *session)
3059{
3060 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
3061 !iter.end();
3062 ++iter)
3063 send_request(*iter, session);
3064
3065 // also re-send old requests when MDS enters reconnect stage. So that MDS can
3066 // process completed requests in clientreplay stage.
3067 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3068 p != mds_requests.end();
3069 ++p) {
3070 MetaRequest *req = p->second;
3071 if (req->got_unsafe)
3072 continue;
31f18b77
FG
3073 if (req->aborted())
3074 continue;
7c673cae
FG
3075 if (req->retry_attempt == 0)
3076 continue; // old requests only
3077 if (req->mds == session->mds_num)
3078 send_request(req, session, true);
3079 }
3080}
3081
3082void Client::wait_unsafe_requests()
3083{
3084 list<MetaRequest*> last_unsafe_reqs;
11fdf7f2 3085 for (const auto &p : mds_sessions) {
20effc67
TL
3086 const auto s = p.second;
3087 if (!s->unsafe_requests.empty()) {
3088 MetaRequest *req = s->unsafe_requests.back();
7c673cae
FG
3089 req->get();
3090 last_unsafe_reqs.push_back(req);
3091 }
3092 }
3093
3094 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
3095 p != last_unsafe_reqs.end();
3096 ++p) {
3097 MetaRequest *req = *p;
3098 if (req->unsafe_item.is_on_list())
3099 wait_on_list(req->waitfor_safe);
3100 put_request(req);
3101 }
3102}
3103
// A session to an MDS has been torn down for good: wake every caller
// blocked on a request to that MDS, and forcibly discard unsafe
// (replied-but-uncommitted) requests, marking the affected inodes with
// async EIO so the loss is surfaced to the application.
void Client::kick_requests_closed(MetaSession *session)
{
  ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
       p != mds_requests.end(); ) {
    MetaRequest *req = p->second;
    // Advance first: unregister_request() below may erase this entry.
    ++p;
    if (req->mds == session->mds_num) {
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->notify_all();
      }
      req->item.remove_myself();
      if (req->got_unsafe) {
        lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
        req->unsafe_item.remove_myself();
        // Directory-mutating request: poison the parent dir with EIO.
        if (is_dir_operation(req)) {
          Inode *dir = req->inode();
          ceph_assert(dir);
          dir->set_async_err(-CEPHFS_EIO);
          lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
                     << dir->ino << " " << req->get_tid() << dendl;
          req->unsafe_dir_item.remove_myself();
        }
        // Poison the request's target inode too, if any.
        if (req->target) {
          InodeRef &in = req->target;
          in->set_async_err(-CEPHFS_EIO);
          lderr(cct) << "kick_requests_closed drop req of inode : "
                     << in->ino << " " << req->get_tid() << dendl;
          req->unsafe_target_item.remove_myself();
        }
        signal_cond_list(req->waitfor_safe);
        unregister_request(req);
      }
    }
  }
  // Nothing for this session may remain after the sweep.
  ceph_assert(session->requests.empty());
  ceph_assert(session->unsafe_requests.empty());
}
3143
3144
3145
3146
3147/************
3148 * leases
3149 */
3150
// Account one MDS-pushed message on the session (bump the seq so our
// acks stay in step). If we are mid-close, re-send the close request
// with the updated seq so the MDS's close can complete.
void Client::got_mds_push(MetaSession *s)
{
  s->seq++;
  ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
  if (s->state == MetaSession::STATE_CLOSING) {
    s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
  }
}
3159
// Handle a dentry-lease revocation from an MDS: invalidate the local
// lease if we still hold it, then always reply with a LEASE_RELEASE so
// the MDS can reclaim it. Only REVOKE actions are expected here.
void Client::handle_lease(const MConstRef<MClientLease>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;

  ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
  mds_rank_t mds = mds_rank_t(m->get_source().num());

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    // No matching session (e.g. stale connection): nothing to release.
    return;
  }

  got_mds_push(session.get());

  ceph_seq_t seq = m->get_seq();

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (inode_map.count(vino) == 0) {
    ldout(cct, 10) << " don't have vino " << vino << dendl;
    goto revoke;  // still ack the release below
  }
  in = inode_map[vino];

  if (m->get_mask() & CEPH_LEASE_VALID) {
    if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
      ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
      goto revoke;
    }
    Dentry *dn = in->dir->dentries[m->dname];
    ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
    dn->lease_mds = -1;  // mark the dentry lease as no longer held
  }

 revoke:
  {
    // Always acknowledge, even if we no longer had the inode/dentry.
    auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
                                            m->get_mask(), m->get_ino(),
                                            m->get_first(), m->get_last(), m->dname);
    m->get_connection()->send_message2(std::move(reply));
  }
}
3203
// Drop @n references from @in. If only the inode_map's own reference
// would remain, tear the inode down: release caps and cached objects,
// remove it from inode_map, and drop that final reference too.
void Client::_put_inode(Inode *in, int n)
{
  ldout(cct, 10) << __func__ << " on " << *in << " n = " << n << dendl;

  // Snapshot the refcount before iput so the "last ref" decision is
  // based on a consistent value.
  int left = in->get_nref();
  ceph_assert(left >= n + 1);
  in->iput(n);
  left -= n;
  if (left == 1) { // the last one will be held by the inode_map
    // release any caps
    remove_all_caps(in);

    ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
    // The object cache must have nothing dirty left for this inode.
    bool unclean = objectcacher->release_set(&in->oset);
    ceph_assert(!unclean);
    inode_map.erase(in->vino());
    if (use_faked_inos())
      _release_faked_ino(in);

    // If the root itself is gone, reset root-tracking bookkeeping.
    if (root == nullptr) {
      root_ancestor = 0;
      while (!root_parents.empty())
        root_parents.erase(root_parents.begin());
    }

    // Drop the reference that inode_map was holding.
    in->iput();
  }
}
3232
// Flush the deferred inode-release queue built up by put_inode().
// Must be called with client_lock held; the queue itself is guarded by
// delay_i_lock and is swapped out under that lock so releases happen
// without holding it.
void Client::delay_put_inodes(bool wakeup)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  std::map<Inode*,int> release;
  {
    // Take ownership of the pending set, leaving the shared map empty.
    std::scoped_lock dl(delay_i_lock);
    release.swap(delay_i_release);
  }

  if (release.empty())
    return;

  for (auto &[in, cnt] : release)
    _put_inode(in, cnt);

  // Optionally poke unmount(), which waits for inodes to drain.
  if (wakeup)
    mount_cond.notify_all();
}
3252
// Queue @n reference drops for @in. The actual _put_inode() happens
// later in delay_put_inodes(); only delay_i_lock is needed here, so this
// is safe from contexts that cannot take client_lock.
void Client::put_inode(Inode *in, int n)
{
  ldout(cct, 20) << __func__ << " on " << *in << " n = " << n << dendl;

  std::scoped_lock dl(delay_i_lock);
  delay_i_release[in] += n;
}
3260
7c673cae
FG
// Destroy an (empty) Dir object and release the pins it held on its
// parent inode and that inode's dentry.
void Client::close_dir(Dir *dir)
{
  Inode *in = dir->parent_inode;
  ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
  ceph_assert(dir->is_empty());
  ceph_assert(in->dir == dir);
  ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
  if (!in->dentries.empty())
    in->get_first_parent()->put(); // unpin dentry

  delete in->dir;
  in->dir = 0;
  put_inode(in); // unpin inode
}
3275
3276 /**
3277 * Don't call this with in==NULL, use get_or_create for that
3278 * leave dn set to default NULL unless you're trying to add
3279 * a new inode to a pre-created Dentry
3280 */
/**
 * Don't call this with in==NULL, use get_or_create for that
 * leave dn set to default NULL unless you're trying to add
 * a new inode to a pre-created Dentry
 *
 * Creates (or reuses) a Dentry named @name in @dir and links @in to it.
 * Directories may have only one parent, so a directory that is already
 * linked elsewhere is unlinked from its old dentry first.
 */
Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
{
  if (!dn) {
    // create a new Dentry
    dn = new Dentry(dir, name);

    lru.lru_insert_mid(dn); // mid or top?

    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << dn << " (new dn)" << dendl;
  } else {
    // Reusing a pre-created dentry: it must not already point anywhere.
    ceph_assert(!dn->inode);
    ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
                   << " dn " << dn << " (old dn)" << dendl;
  }

  if (in) {    // link to inode
    InodeRef tmp_ref;
    // only one parent for directories!
    if (in->is_dir() && !in->dentries.empty()) {
      tmp_ref = in; // prevent unlink below from freeing the inode.
      Dentry *olddn = in->get_first_parent();
      ceph_assert(olddn->dir != dir || olddn->name != name);
      Inode *old_diri = olddn->dir->parent_inode;
      // The old parent's listing is no longer complete/ordered.
      clear_dir_complete_and_ordered(old_diri, true);
      unlink(olddn, true, true);  // keep dir, dentry
    }

    dn->link(in);
    inc_dentry_nr();
    ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
  }

  return dn;
}
3316
// Detach @dn from its inode and, unless @keepdentry, from its directory
// (destroying the dentry). With @keepdentry the dentry survives but its
// lease is invalidated. An empty directory is closed unless @keepdir.
void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
{
  // Hold a ref so the inode stays alive for the logging below even after
  // the dentry's link is dropped.
  InodeRef in(dn->inode);
  ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
                 << " inode " << dn->inode << dendl;

  // unlink from inode
  if (dn->inode) {
    dn->unlink();
    dec_dentry_nr();
    ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
  }

  if (keepdentry) {
    // Keep the dentry but forget the lease; it no longer names an inode.
    dn->lease_mds = -1;
  } else {
    ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;

    // unlink from dir
    Dir *dir = dn->dir;
    dn->detach();

    // delete den
    lru.lru_remove(dn);
    dn->put();

    if (dir->is_empty() && !keepdir)
      close_dir(dir);
  }
}
3347
3348/**
3349 * For asynchronous flushes, check for errors from the IO and
3350 * update the inode if necessary
3351 */
/**
 * For asynchronous flushes, check for errors from the IO and
 * update the inode if necessary
 *
 * Completion context for async flushes: on error, logs the failure and
 * stores it on the inode via set_async_err() so a later fsync/close can
 * report it. The InodeRef member keeps the inode alive until completion.
 */
class C_Client_FlushComplete : public Context {
private:
  Client *client;
  InodeRef inode;
public:
  C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
  void finish(int r) override {
    // Completions run with the client lock already held.
    ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
    if (r != 0) {
      client_t const whoami = client->whoami;  // For the benefit of ldout prefix
      ldout(client->cct, 1) << "I/O error from flush on inode " << inode
        << " 0x" << std::hex << inode->ino << std::dec
        << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
      inode->set_async_err(r);
    }
  }
};
3369
3370
3371/****
3372 * caps
3373 */
3374
3375void Client::get_cap_ref(Inode *in, int cap)
3376{
3377 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3378 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
11fdf7f2 3379 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
b3b6e05e 3380 in->iget();
7c673cae
FG
3381 }
3382 if ((cap & CEPH_CAP_FILE_CACHE) &&
3383 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
11fdf7f2 3384 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
b3b6e05e 3385 in->iget();
7c673cae
FG
3386 }
3387 in->get_cap_ref(cap);
3388}
3389
// Release cap references on @in. When the last reference of a cap is
// dropped: finish any pending cap snapshot that was waiting on writers,
// clear snap dirty-data flags and wake committers on the last
// FILE_BUFFER ref, possibly kick check_caps() if caps became droppable,
// and release the inode pins taken in get_cap_ref().
void Client::put_cap_ref(Inode *in, int cap)
{
  // `last` is the set of cap bits whose refcount just hit zero.
  int last = in->put_cap_ref(cap);
  if (last) {
    int put_nref = 0;
    // Caps we held refs on but that are no longer issued can be dropped.
    int drop = last & ~in->caps_issued();
    if (in->snapid == CEPH_NOSNAP) {
      // The newest cap_snap was waiting for in-flight writes to finish.
      if ((last & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER)) &&
          !in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.writing) {
        ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
        in->cap_snaps.rbegin()->second.writing = 0;
        finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
        signal_cond_list(in->waitfor_caps);  // wake up blocked sync writers
      }
      if (last & CEPH_CAP_FILE_BUFFER) {
        for (auto &p : in->cap_snaps)
          p.second.dirty_data = 0;
        signal_cond_list(in->waitfor_commit);
        ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
        ++put_nref;  // undo the iget() from get_cap_ref()
      }
    }
    if (last & CEPH_CAP_FILE_CACHE) {
      ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
      ++put_nref;  // undo the iget() from get_cap_ref()
    }
    if (drop)
      check_caps(in, 0);
    if (put_nref)
      put_inode(in, put_nref);
  }
}
3423
// get caps for a given file handle -- the inode should have @need caps
// issued by the mds and @want caps not revoked (or not under revocation).
// this routine blocks till the cap requirement is satisfied. also account
// (track) for capability hit when required (when cap requirement succeedes).
//
// On success returns 0 with *phave set to the caps actually granted
// (need plus whatever of want is held) and a cap ref taken on @need.
// Returns -CEPHFS_EBADF/-CEPHFS_EIO/-CEPHFS_EROFS on the error paths
// noted inline.
int Client::get_caps(Fh *fh, int need, int want, int *phave, loff_t endoff)
{
  Inode *in = fh->inode.get();

  int r = check_pool_perm(in, need);
  if (r < 0)
    return r;

  while (1) {
    // The open mode must still cover the needed caps.
    int file_wanted = in->caps_file_wanted();
    if ((file_wanted & need) != need) {
      ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
                     << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
                     << dendl;
      return -CEPHFS_EBADF;
    }

    // A stale write handle (fd generation changed, e.g. after remount).
    if ((fh->mode & CEPH_FILE_MODE_WR) && fh->gen != fd_gen)
      return -CEPHFS_EBADF;

    // File locks were lost on this inode; fail IO on lock-holding handles.
    if ((in->flags & I_ERROR_FILELOCK) && fh->has_any_filelocks())
      return -CEPHFS_EIO;

    int implemented;
    int have = in->caps_issued(&implemented);

    bool waitfor_caps = false;
    bool waitfor_commit = false;

    if (have & need & CEPH_CAP_FILE_WR) {
      if (endoff > 0) {
        // Ask the MDS to grow max_size ahead of a write past the limit.
        if ((endoff >= (loff_t)in->max_size ||
             endoff > (loff_t)(in->size << 1)) &&
            endoff > (loff_t)in->wanted_max_size) {
          ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
          in->wanted_max_size = endoff;
        }
        if (in->wanted_max_size > in->max_size &&
            in->wanted_max_size > in->requested_max_size)
          check_caps(in, 0);
      }

      if (endoff >= 0 && endoff > (loff_t)in->max_size) {
        ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
        waitfor_caps = true;
      }
      if (!in->cap_snaps.empty()) {
        // Writes must not overlap an in-progress cap snapshot.
        if (in->cap_snaps.rbegin()->second.writing) {
          ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
          waitfor_caps = true;
        }
        for (auto &p : in->cap_snaps) {
          if (p.second.dirty_data) {
            waitfor_commit = true;
            break;
          }
        }
        if (waitfor_commit) {
          // Kick a flush so the dirty snap data gets committed.
          _flush(in, new C_Client_FlushComplete(this, in));
          ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
        }
      }
    }

    if (!waitfor_caps && !waitfor_commit) {
      if ((have & need) == need) {
        int revoking = implemented & ~have;
        ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
                       << " need " << ccap_string(need) << " want " << ccap_string(want)
                       << " revoking " << ccap_string(revoking)
                       << dendl;
        // Success only if none of the wanted caps are mid-revocation.
        if ((revoking & want) == 0) {
          *phave = need | (have & want);
          in->get_cap_ref(need);
          cap_hit();
          return 0;
        }
      }
      ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
      waitfor_caps = true;
    }

    // Writes are impossible on a read-only session.
    if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
        in->auth_cap->session->readonly)
      return -CEPHFS_EROFS;

    if (in->flags & I_CAP_DROPPED) {
      // Our caps were dropped (e.g. session reset); re-request them from
      // the MDS before waiting, otherwise we could block forever.
      int mds_wanted = in->caps_mds_wanted();
      if ((mds_wanted & need) != need) {
        int ret = _renew_caps(in);
        if (ret < 0)
          return ret;
        continue;
      }
      if (!(file_wanted & ~mds_wanted))
        in->flags &= ~I_CAP_DROPPED;
    }

    if (waitfor_caps)
      wait_on_list(in->waitfor_caps);
    else if (waitfor_commit)
      wait_on_list(in->waitfor_commit);
  }
}
3532
3533int Client::get_caps_used(Inode *in)
3534{
3535 unsigned used = in->caps_used();
3536 if (!(used & CEPH_CAP_FILE_CACHE) &&
3537 !objectcacher->set_is_empty(&in->oset))
3538 used |= CEPH_CAP_FILE_CACHE;
3539 return used;
3540}
3541
// Defer cap release for @in: push it onto the delayed list with a hold
// deadline of now + client_caps_release_delay, so briefly-unused caps
// are not bounced back to the MDS immediately.
void Client::cap_delay_requeue(Inode *in)
{
  ldout(cct, 10) << __func__ << " on " << *in << dendl;
  in->hold_caps_until = ceph_clock_now();
  in->hold_caps_until += cct->_conf->client_caps_release_delay;
  delayed_list.push_back(&in->delay_cap_item);
}
3549
// Build and send a CEPH_CAP_OP_UPDATE message to the MDS for one cap:
// releases caps not in @retain, flushes dirty metadata (@flush bits,
// tagged with @flush_tid), and reports current inode attributes and the
// wanted/used cap sets. Also updates the local cap state to match what
// was told to the MDS.
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
                      int flags, int used, int want, int retain,
                      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;  // never re-retain caps being revoked
  int dropping = cap->issued & ~retain;
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << __func__ << " " << *in
                 << " mds." << session->mds_num << " seq " << cap->seq
                 << " used " << ccap_string(used)
                 << " want " << ccap_string(want)
                 << " flush " << ccap_string(flush)
                 << " retain " << ccap_string(retain)
                 << " held "<< ccap_string(held)
                 << " revoking " << ccap_string(revoking)
                 << " dropping " << ccap_string(dropping)
                 << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    // Test-only failure injection (client_inject_release_failure).
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  // Flushes carry the snap context seq they apply after.
  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  auto m = make_message<MClientCaps>(op,
                                     in->ino,
                                     0,
                                     cap->cap_id, cap->seq,
                                     cap->implemented,
                                     want,
                                     flush,
                                     cap->mseq,
                                     cap_epoch_barrier);
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  // Dirty xattrs travel inline with the cap flush.
  if (flush & CEPH_CAP_XATTR_EXCL) {
    encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;

  // Tell the MDS a cap snapshot is still pending for this inode.
  if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
      !in->cap_snaps.empty() &&
      in->cap_snaps.rbegin()->second.flush_tid == 0)
    flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
  m->flags = flags;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    // Only the auth cap carries max_size requests.
    if (want & CEPH_CAP_ANY_FILE_WR) {
      m->set_max_size(in->wanted_max_size);
      in->requested_max_size = in->wanted_max_size;
      ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
    } else {
      in->requested_max_size = 0;
      ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
    }
  }

  // Let the MDS trim its flush-tid tracking up to our oldest in-flight.
  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message2(std::move(m));
}
3667
31f18b77
FG
3668static bool is_max_size_approaching(Inode *in)
3669{
3670 /* mds will adjust max size according to the reported size */
3671 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3672 return false;
3673 if (in->size >= in->max_size)
3674 return true;
3675 /* half of previous max_size increment has been used */
3676 if (in->max_size > in->reported_size &&
3677 (in->size << 1) >= in->max_size + in->reported_size)
3678 return true;
3679 return false;
3680}
7c673cae 3681
11fdf7f2
TL
3682static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3683{
3684 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3685 return used;
3686 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3687 return used;
3688
3689 if (issued & CEPH_CAP_FILE_LAZYIO) {
3690 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3691 used &= ~CEPH_CAP_FILE_CACHE;
3692 used |= CEPH_CAP_FILE_LAZYIO;
3693 }
3694 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3695 used &= ~CEPH_CAP_FILE_BUFFER;
3696 used |= CEPH_CAP_FILE_LAZYIO;
3697 }
3698 } else {
3699 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3700 used &= ~CEPH_CAP_FILE_CACHE;
3701 used |= CEPH_CAP_FILE_LAZYIO;
3702 }
3703 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3704 used &= ~CEPH_CAP_FILE_BUFFER;
3705 used |= CEPH_CAP_FILE_LAZYIO;
3706 }
3707 }
3708 return used;
3709}
3710
7c673cae
FG
3711/**
3712 * check_caps
3713 *
3714 * Examine currently used and wanted versus held caps. Release, flush or ack
3715 * revoked caps to the MDS as appropriate.
3716 *
3717 * @param in the inode to check
3718 * @param flags flags to apply to cap check
3719 */
3720void Client::check_caps(Inode *in, unsigned flags)
3721{
3722 unsigned wanted = in->caps_wanted();
3723 unsigned used = get_caps_used(in);
3724 unsigned cap_used;
3725
7c673cae
FG
3726 int implemented;
3727 int issued = in->caps_issued(&implemented);
3728 int revoking = implemented & ~issued;
3729
11fdf7f2
TL
3730 int orig_used = used;
3731 used = adjust_caps_used_for_lazyio(used, issued, implemented);
3732
7c673cae 3733 int retain = wanted | used | CEPH_CAP_PIN;
f67539c2 3734 if (!is_unmounting() && in->nlink > 0) {
a8e16298 3735 if (wanted) {
7c673cae 3736 retain |= CEPH_CAP_ANY;
a8e16298
TL
3737 } else if (in->is_dir() &&
3738 (issued & CEPH_CAP_FILE_SHARED) &&
3739 (in->flags & I_COMPLETE)) {
3740 // we do this here because we don't want to drop to Fs (and then
3741 // drop the Fs if we do a create!) if that alone makes us send lookups
3742 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3743 wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
3744 retain |= wanted;
3745 } else {
7c673cae 3746 retain |= CEPH_CAP_ANY_SHARED;
a8e16298
TL
3747 // keep RD only if we didn't have the file open RW,
3748 // because then the mds would revoke it anyway to
3749 // journal max_size=0.
3750 if (in->max_size == 0)
3751 retain |= CEPH_CAP_ANY_RD;
3752 }
7c673cae
FG
3753 }
3754
11fdf7f2 3755 ldout(cct, 10) << __func__ << " on " << *in
7c673cae
FG
3756 << " wanted " << ccap_string(wanted)
3757 << " used " << ccap_string(used)
3758 << " issued " << ccap_string(issued)
3759 << " revoking " << ccap_string(revoking)
3760 << " flags=" << flags
3761 << dendl;
3762
3763 if (in->snapid != CEPH_NOSNAP)
3764 return; //snap caps last forever, can't write
3765
3766 if (in->caps.empty())
3767 return; // guard if at end of func
3768
11fdf7f2
TL
3769 if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
3770 (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
94b18763 3771 if (_release(in))
11fdf7f2 3772 used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
94b18763 3773 }
7c673cae 3774
20effc67
TL
3775 for (auto &[mds, cap] : in->caps) {
3776 auto session = mds_sessions.at(mds);
7c673cae
FG
3777
3778 cap_used = used;
11fdf7f2 3779 if (in->auth_cap && &cap != in->auth_cap)
7c673cae
FG
3780 cap_used &= ~in->auth_cap->issued;
3781
11fdf7f2 3782 revoking = cap.implemented & ~cap.issued;
20effc67 3783
7c673cae 3784 ldout(cct, 10) << " cap mds." << mds
11fdf7f2
TL
3785 << " issued " << ccap_string(cap.issued)
3786 << " implemented " << ccap_string(cap.implemented)
7c673cae
FG
3787 << " revoking " << ccap_string(revoking) << dendl;
3788
3789 if (in->wanted_max_size > in->max_size &&
3790 in->wanted_max_size > in->requested_max_size &&
11fdf7f2 3791 &cap == in->auth_cap)
7c673cae
FG
3792 goto ack;
3793
3794 /* approaching file_max? */
11fdf7f2
TL
3795 if ((cap.issued & CEPH_CAP_FILE_WR) &&
3796 &cap == in->auth_cap &&
31f18b77 3797 is_max_size_approaching(in)) {
7c673cae 3798 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
31f18b77 3799 << ", reported " << in->reported_size << dendl;
7c673cae
FG
3800 goto ack;
3801 }
3802
3803 /* completed revocation? */
3804 if (revoking && (revoking & cap_used) == 0) {
11fdf7f2 3805 ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
7c673cae
FG
3806 goto ack;
3807 }
3808
3809 /* want more caps from mds? */
11fdf7f2 3810 if (wanted & ~(cap.wanted | cap.issued))
7c673cae
FG
3811 goto ack;
3812
f67539c2 3813 if (!revoking && is_unmounting() && (cap_used == 0))
7c673cae
FG
3814 goto ack;
3815
11fdf7f2 3816 if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
a8e16298 3817 !in->dirty_caps) // and we have no dirty caps
7c673cae
FG
3818 continue;
3819
11fdf7f2 3820 if (!(flags & CHECK_CAPS_NODELAY)) {
7c673cae 3821 ldout(cct, 10) << "delaying cap release" << dendl;
11fdf7f2 3822 cap_delay_requeue(in);
7c673cae
FG
3823 continue;
3824 }
3825
3826 ack:
eafe8130
TL
3827 if (&cap == in->auth_cap) {
3828 if (in->flags & I_KICK_FLUSH) {
3829 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3830 << " to mds." << mds << dendl;
20effc67 3831 kick_flushing_caps(in, session.get());
eafe8130
TL
3832 }
3833 if (!in->cap_snaps.empty() &&
3834 in->cap_snaps.rbegin()->second.flush_tid == 0)
3835 flush_snaps(in);
7c673cae
FG
3836 }
3837
3838 int flushing;
e306af50 3839 int msg_flags = 0;
7c673cae 3840 ceph_tid_t flush_tid;
11fdf7f2 3841 if (in->auth_cap == &cap && in->dirty_caps) {
7c673cae 3842 flushing = mark_caps_flushing(in, &flush_tid);
e306af50
TL
3843 if (flags & CHECK_CAPS_SYNCHRONOUS)
3844 msg_flags |= MClientCaps::FLAG_SYNC;
7c673cae
FG
3845 } else {
3846 flushing = 0;
3847 flush_tid = 0;
3848 }
3849
20effc67
TL
3850 in->delay_cap_item.remove_myself();
3851 send_cap(in, session.get(), &cap, msg_flags, cap_used, wanted, retain,
eafe8130 3852 flushing, flush_tid);
7c673cae
FG
3853 }
3854}
3855
3856
3857void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3858{
3859 int used = get_caps_used(in);
3860 int dirty = in->caps_dirty();
11fdf7f2 3861 ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
7c673cae
FG
3862
3863 if (in->cap_snaps.size() &&
3864 in->cap_snaps.rbegin()->second.writing) {
11fdf7f2 3865 ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
7c673cae
FG
3866 return;
3867 } else if (in->caps_dirty() ||
3868 (used & CEPH_CAP_FILE_WR) ||
3869 (dirty & CEPH_CAP_ANY_WR)) {
3870 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
11fdf7f2 3871 ceph_assert(capsnapem.second); /* element inserted */
7c673cae
FG
3872 CapSnap &capsnap = capsnapem.first->second;
3873 capsnap.context = old_snapc;
3874 capsnap.issued = in->caps_issued();
3875 capsnap.dirty = in->caps_dirty();
f67539c2 3876
7c673cae 3877 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
f67539c2 3878
7c673cae
FG
3879 capsnap.uid = in->uid;
3880 capsnap.gid = in->gid;
3881 capsnap.mode = in->mode;
3882 capsnap.btime = in->btime;
3883 capsnap.xattrs = in->xattrs;
3884 capsnap.xattr_version = in->xattr_version;
11fdf7f2
TL
3885 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3886 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
f67539c2 3887
7c673cae 3888 if (used & CEPH_CAP_FILE_WR) {
11fdf7f2 3889 ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
7c673cae
FG
3890 capsnap.writing = 1;
3891 } else {
3892 finish_cap_snap(in, capsnap, used);
3893 }
3894 } else {
11fdf7f2 3895 ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
7c673cae
FG
3896 }
3897}
3898
3899void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
3900{
11fdf7f2 3901 ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
7c673cae
FG
3902 capsnap.size = in->size;
3903 capsnap.mtime = in->mtime;
3904 capsnap.atime = in->atime;
3905 capsnap.ctime = in->ctime;
3906 capsnap.time_warp_seq = in->time_warp_seq;
3907 capsnap.change_attr = in->change_attr;
7c673cae
FG
3908 capsnap.dirty |= in->caps_dirty();
3909
11fdf7f2
TL
3910 /* Only reset it if it wasn't set before */
3911 if (capsnap.cap_dirtier_uid == -1) {
3912 capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
3913 capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
3914 }
3915
7c673cae
FG
3916 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3917 capsnap.inline_data = in->inline_data;
3918 capsnap.inline_version = in->inline_version;
3919 }
3920
3921 if (used & CEPH_CAP_FILE_BUFFER) {
f67539c2 3922 capsnap.writing = 1;
11fdf7f2 3923 ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
7c673cae
FG
3924 << " WRBUFFER, delaying" << dendl;
3925 } else {
3926 capsnap.dirty_data = 0;
3927 flush_snaps(in);
3928 }
3929}
3930
eafe8130
TL
3931void Client::send_flush_snap(Inode *in, MetaSession *session,
3932 snapid_t follows, CapSnap& capsnap)
3933{
9f95a23c
TL
3934 auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
3935 in->ino, in->snaprealm->ino, 0,
3936 in->auth_cap->mseq, cap_epoch_barrier);
eafe8130
TL
3937 m->caller_uid = capsnap.cap_dirtier_uid;
3938 m->caller_gid = capsnap.cap_dirtier_gid;
3939
3940 m->set_client_tid(capsnap.flush_tid);
3941 m->head.snap_follows = follows;
3942
3943 m->head.caps = capsnap.issued;
3944 m->head.dirty = capsnap.dirty;
3945
3946 m->head.uid = capsnap.uid;
3947 m->head.gid = capsnap.gid;
3948 m->head.mode = capsnap.mode;
3949 m->btime = capsnap.btime;
3950
3951 m->size = capsnap.size;
3952
3953 m->head.xattr_version = capsnap.xattr_version;
3954 encode(capsnap.xattrs, m->xattrbl);
3955
3956 m->ctime = capsnap.ctime;
3957 m->btime = capsnap.btime;
3958 m->mtime = capsnap.mtime;
3959 m->atime = capsnap.atime;
3960 m->time_warp_seq = capsnap.time_warp_seq;
3961 m->change_attr = capsnap.change_attr;
3962
3963 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3964 m->inline_version = in->inline_version;
3965 m->inline_data = in->inline_data;
3966 }
3967
3968 ceph_assert(!session->flushing_caps_tids.empty());
3969 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3970
3971 session->con->send_message2(std::move(m));
3972}
3973
3974void Client::flush_snaps(Inode *in)
7c673cae 3975{
eafe8130 3976 ldout(cct, 10) << "flush_snaps on " << *in << dendl;
11fdf7f2 3977 ceph_assert(in->cap_snaps.size());
7c673cae
FG
3978
3979 // pick auth mds
11fdf7f2 3980 ceph_assert(in->auth_cap);
7c673cae 3981 MetaSession *session = in->auth_cap->session;
7c673cae
FG
3982
3983 for (auto &p : in->cap_snaps) {
3984 CapSnap &capsnap = p.second;
eafe8130
TL
3985 // only do new flush
3986 if (capsnap.flush_tid > 0)
3987 continue;
7c673cae
FG
3988
3989 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
3990 << " follows " << p.first
3991 << " size " << capsnap.size
3992 << " mtime " << capsnap.mtime
3993 << " dirty_data=" << capsnap.dirty_data
3994 << " writing=" << capsnap.writing
3995 << " on " << *in << dendl;
3996 if (capsnap.dirty_data || capsnap.writing)
eafe8130 3997 break;
f67539c2 3998
eafe8130
TL
3999 capsnap.flush_tid = ++last_flush_tid;
4000 session->flushing_caps_tids.insert(capsnap.flush_tid);
4001 in->flushing_cap_tids[capsnap.flush_tid] = 0;
4002 if (!in->flushing_cap_item.is_on_list())
4003 session->flushing_caps.push_back(&in->flushing_cap_item);
7c673cae 4004
eafe8130 4005 send_flush_snap(in, session, p.first, capsnap);
7c673cae
FG
4006 }
4007}
4008
9f95a23c 4009void Client::wait_on_list(list<ceph::condition_variable*>& ls)
7c673cae 4010{
9f95a23c 4011 ceph::condition_variable cond;
7c673cae 4012 ls.push_back(&cond);
9f95a23c
TL
4013 std::unique_lock l{client_lock, std::adopt_lock};
4014 cond.wait(l);
4015 l.release();
7c673cae
FG
4016 ls.remove(&cond);
4017}
4018
9f95a23c 4019void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
7c673cae 4020{
9f95a23c
TL
4021 for (auto cond : ls) {
4022 cond->notify_all();
4023 }
7c673cae
FG
4024}
4025
4026void Client::wait_on_context_list(list<Context*>& ls)
4027{
9f95a23c 4028 ceph::condition_variable cond;
7c673cae
FG
4029 bool done = false;
4030 int r;
9f95a23c
TL
4031 ls.push_back(new C_Cond(cond, &done, &r));
4032 std::unique_lock l{client_lock, std::adopt_lock};
4033 cond.wait(l, [&done] { return done;});
4034 l.release();
7c673cae
FG
4035}
4036
4037void Client::signal_context_list(list<Context*>& ls)
4038{
4039 while (!ls.empty()) {
4040 ls.front()->complete(0);
4041 ls.pop_front();
4042 }
4043}
4044
a8e16298 4045void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
7c673cae 4046{
11fdf7f2
TL
4047 for (const auto &cap : s->caps) {
4048 auto &in = cap->inode;
a8e16298 4049 if (reconnect) {
11fdf7f2
TL
4050 in.requested_max_size = 0;
4051 in.wanted_max_size = 0;
a8e16298
TL
4052 } else {
4053 if (cap->gen < s->cap_gen) {
4054 // mds did not re-issue stale cap.
4055 cap->issued = cap->implemented = CEPH_CAP_PIN;
4056 // make sure mds knows what we want.
11fdf7f2
TL
4057 if (in.caps_file_wanted() & ~cap->wanted)
4058 in.flags |= I_CAP_DROPPED;
a8e16298
TL
4059 }
4060 }
11fdf7f2 4061 signal_cond_list(in.waitfor_caps);
7c673cae
FG
4062 }
4063}
4064
4065
4066// flush dirty data (from objectcache)
4067
4068class C_Client_CacheInvalidate : public Context {
4069private:
4070 Client *client;
4071 vinodeno_t ino;
4072 int64_t offset, length;
4073public:
4074 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
4075 client(c), offset(off), length(len) {
4076 if (client->use_faked_inos())
4077 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4078 else
4079 ino = in->vino();
4080 }
4081 void finish(int r) override {
4082 // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
9f95a23c 4083 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
7c673cae
FG
4084 client->_async_invalidate(ino, offset, length);
4085 }
4086};
4087
4088void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
4089{
f67539c2
TL
4090 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4091 if (!mref_reader.is_state_satisfied())
7c673cae 4092 return;
f67539c2 4093
11fdf7f2 4094 ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
7c673cae
FG
4095 ino_invalidate_cb(callback_handle, ino, off, len);
4096}
4097
4098void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
4099
4100 if (ino_invalidate_cb)
4101 // we queue the invalidate, which calls the callback and decrements the ref
4102 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
4103}
4104
4105void Client::_invalidate_inode_cache(Inode *in)
4106{
11fdf7f2 4107 ldout(cct, 10) << __func__ << " " << *in << dendl;
7c673cae
FG
4108
4109 // invalidate our userspace inode cache
94b18763 4110 if (cct->_conf->client_oc) {
7c673cae 4111 objectcacher->release_set(&in->oset);
94b18763
FG
4112 if (!objectcacher->set_is_empty(&in->oset))
4113 lderr(cct) << "failed to invalidate cache for " << *in << dendl;
4114 }
7c673cae
FG
4115
4116 _schedule_invalidate_callback(in, 0, 0);
4117}
4118
4119void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
4120{
11fdf7f2 4121 ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;
7c673cae
FG
4122
4123 // invalidate our userspace inode cache
4124 if (cct->_conf->client_oc) {
4125 vector<ObjectExtent> ls;
4126 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
28e407b8 4127 objectcacher->discard_writeback(&in->oset, ls, nullptr);
7c673cae
FG
4128 }
4129
4130 _schedule_invalidate_callback(in, off, len);
4131}
4132
4133bool Client::_release(Inode *in)
4134{
4135 ldout(cct, 20) << "_release " << *in << dendl;
4136 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
4137 _invalidate_inode_cache(in);
4138 return true;
4139 }
4140 return false;
4141}
4142
4143bool Client::_flush(Inode *in, Context *onfinish)
4144{
4145 ldout(cct, 10) << "_flush " << *in << dendl;
4146
4147 if (!in->oset.dirty_or_tx) {
4148 ldout(cct, 10) << " nothing to flush" << dendl;
4149 onfinish->complete(0);
4150 return true;
4151 }
4152
4153 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
1adf2230 4154 ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
7c673cae
FG
4155 objectcacher->purge_set(&in->oset);
4156 if (onfinish) {
f67539c2 4157 onfinish->complete(-CEPHFS_ENOSPC);
7c673cae
FG
4158 }
4159 return true;
4160 }
4161
4162 return objectcacher->flush_set(&in->oset, onfinish);
4163}
4164
4165void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
4166{
f67539c2 4167 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae
FG
4168 if (!in->oset.dirty_or_tx) {
4169 ldout(cct, 10) << " nothing to flush" << dendl;
4170 return;
4171 }
4172
11fdf7f2 4173 C_SaferCond onflush("Client::_flush_range flock");
7c673cae 4174 bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
11fdf7f2 4175 offset, size, &onflush);
7c673cae
FG
4176 if (!ret) {
4177 // wait for flush
9f95a23c 4178 client_lock.unlock();
11fdf7f2 4179 onflush.wait();
9f95a23c 4180 client_lock.lock();
7c673cae
FG
4181 }
4182}
4183
4184void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
4185{
f67539c2
TL
4186 // std::scoped_lock l(client_lock);
4187 ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); // will be called via dispatch() -> objecter -> ...
7c673cae 4188 Inode *in = static_cast<Inode *>(oset->parent);
11fdf7f2 4189 ceph_assert(in);
7c673cae
FG
4190 _flushed(in);
4191}
4192
4193void Client::_flushed(Inode *in)
4194{
4195 ldout(cct, 10) << "_flushed " << *in << dendl;
4196
4197 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
4198}
4199
4200
4201
4202// checks common to add_update_cap, handle_cap_grant
11fdf7f2 4203void Client::check_cap_issue(Inode *in, unsigned issued)
7c673cae
FG
4204{
4205 unsigned had = in->caps_issued();
4206
4207 if ((issued & CEPH_CAP_FILE_CACHE) &&
4208 !(had & CEPH_CAP_FILE_CACHE))
4209 in->cache_gen++;
4210
f91f0fd5
TL
4211 if ((issued & CEPH_CAP_FILE_SHARED) !=
4212 (had & CEPH_CAP_FILE_SHARED)) {
4213 if (issued & CEPH_CAP_FILE_SHARED)
4214 in->shared_gen++;
7c673cae
FG
4215 if (in->is_dir())
4216 clear_dir_complete_and_ordered(in, true);
4217 }
4218}
4219
4220void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
a8e16298
TL
4221 unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
4222 inodeno_t realm, int flags, const UserPerm& cap_perms)
7c673cae 4223{
11fdf7f2
TL
4224 if (!in->is_any_caps()) {
4225 ceph_assert(in->snaprealm == 0);
4226 in->snaprealm = get_snap_realm(realm);
4227 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4228 ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
4229 } else {
4230 ceph_assert(in->snaprealm);
4231 if ((flags & CEPH_CAP_FLAG_AUTH) &&
4232 realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
4233 in->snaprealm_item.remove_myself();
4234 auto oldrealm = in->snaprealm;
4235 in->snaprealm = get_snap_realm(realm);
4236 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
4237 put_snap_realm(oldrealm);
4238 }
4239 }
4240
7c673cae 4241 mds_rank_t mds = mds_session->mds_num;
11fdf7f2
TL
4242 const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
4243 Cap &cap = capem.first->second;
4244 if (!capem.second) {
4245 if (cap.gen < mds_session->cap_gen)
4246 cap.issued = cap.implemented = CEPH_CAP_PIN;
7c673cae
FG
4247
4248 /*
4249 * auth mds of the inode changed. we received the cap export
4250 * message, but still haven't received the cap import message.
4251 * handle_cap_export() updated the new auth MDS' cap.
4252 *
4253 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
4254 * a message that was send before the cap import message. So
4255 * don't remove caps.
4256 */
11fdf7f2 4257 if (ceph_seq_cmp(seq, cap.seq) <= 0) {
92f5a8d4
TL
4258 if (&cap != in->auth_cap)
4259 ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;
4260
11fdf7f2
TL
4261 ceph_assert(cap.cap_id == cap_id);
4262 seq = cap.seq;
4263 mseq = cap.mseq;
4264 issued |= cap.issued;
7c673cae
FG
4265 flags |= CEPH_CAP_FLAG_AUTH;
4266 }
f67539c2
TL
4267 } else {
4268 inc_pinned_icaps();
7c673cae
FG
4269 }
4270
11fdf7f2 4271 check_cap_issue(in, issued);
7c673cae
FG
4272
4273 if (flags & CEPH_CAP_FLAG_AUTH) {
11fdf7f2 4274 if (in->auth_cap != &cap &&
7c673cae
FG
4275 (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
4276 if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
11fdf7f2 4277 ldout(cct, 10) << __func__ << " changing auth cap: "
7c673cae
FG
4278 << "add myself to new auth MDS' flushing caps list" << dendl;
4279 adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
4280 }
11fdf7f2 4281 in->auth_cap = &cap;
7c673cae
FG
4282 }
4283 }
4284
11fdf7f2
TL
4285 unsigned old_caps = cap.issued;
4286 cap.cap_id = cap_id;
4287 cap.issued = issued;
4288 cap.implemented |= issued;
4289 if (ceph_seq_cmp(mseq, cap.mseq) > 0)
4290 cap.wanted = wanted;
a8e16298 4291 else
11fdf7f2
TL
4292 cap.wanted |= wanted;
4293 cap.seq = seq;
4294 cap.issue_seq = seq;
4295 cap.mseq = mseq;
4296 cap.gen = mds_session->cap_gen;
4297 cap.latest_perms = cap_perms;
4298 ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
4299 << " from mds." << mds
4300 << " on " << *in
4301 << dendl;
4302
4303 if ((issued & ~old_caps) && in->auth_cap == &cap) {
7c673cae 4304 // non-auth MDS is revoking the newly grant caps ?
11fdf7f2
TL
4305 for (auto &p : in->caps) {
4306 if (&p.second == &cap)
7c673cae 4307 continue;
11fdf7f2 4308 if (p.second.implemented & ~p.second.issued & issued) {
7c673cae
FG
4309 check_caps(in, CHECK_CAPS_NODELAY);
4310 break;
4311 }
4312 }
4313 }
4314
4315 if (issued & ~old_caps)
4316 signal_cond_list(in->waitfor_caps);
4317}
4318
4319void Client::remove_cap(Cap *cap, bool queue_release)
4320{
11fdf7f2 4321 auto &in = cap->inode;
7c673cae
FG
4322 MetaSession *session = cap->session;
4323 mds_rank_t mds = cap->session->mds_num;
4324
11fdf7f2 4325 ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;
7c673cae
FG
4326
4327 if (queue_release) {
4328 session->enqueue_cap_release(
11fdf7f2 4329 in.ino,
7c673cae
FG
4330 cap->cap_id,
4331 cap->issue_seq,
4332 cap->mseq,
4333 cap_epoch_barrier);
f67539c2
TL
4334 } else {
4335 dec_pinned_icaps();
7c673cae
FG
4336 }
4337
f67539c2 4338
11fdf7f2
TL
4339 if (in.auth_cap == cap) {
4340 if (in.flushing_cap_item.is_on_list()) {
7c673cae 4341 ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
11fdf7f2 4342 in.flushing_cap_item.remove_myself();
7c673cae 4343 }
11fdf7f2 4344 in.auth_cap = NULL;
7c673cae 4345 }
11fdf7f2
TL
4346 size_t n = in.caps.erase(mds);
4347 ceph_assert(n == 1);
7c673cae
FG
4348 cap = nullptr;
4349
11fdf7f2
TL
4350 if (!in.is_any_caps()) {
4351 ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
4352 in.snaprealm_item.remove_myself();
4353 put_snap_realm(in.snaprealm);
4354 in.snaprealm = 0;
7c673cae
FG
4355 }
4356}
4357
4358void Client::remove_all_caps(Inode *in)
4359{
4360 while (!in->caps.empty())
11fdf7f2 4361 remove_cap(&in->caps.begin()->second, true);
7c673cae
FG
4362}
4363
f6b5b4d7 4364void Client::remove_session_caps(MetaSession *s, int err)
7c673cae 4365{
11fdf7f2 4366 ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;
7c673cae
FG
4367
4368 while (s->caps.size()) {
4369 Cap *cap = *s->caps.begin();
11fdf7f2 4370 InodeRef in(&cap->inode);
eafe8130 4371 bool dirty_caps = false;
7c673cae 4372 if (in->auth_cap == cap) {
7c673cae
FG
4373 dirty_caps = in->dirty_caps | in->flushing_caps;
4374 in->wanted_max_size = 0;
4375 in->requested_max_size = 0;
f6b5b4d7
TL
4376 if (in->has_any_filelocks())
4377 in->flags |= I_ERROR_FILELOCK;
7c673cae 4378 }
f6b5b4d7 4379 auto caps = cap->implemented;
a8e16298
TL
4380 if (cap->wanted | cap->issued)
4381 in->flags |= I_CAP_DROPPED;
7c673cae 4382 remove_cap(cap, false);
eafe8130 4383 in->cap_snaps.clear();
7c673cae 4384 if (dirty_caps) {
11fdf7f2 4385 lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
7c673cae
FG
4386 if (in->flushing_caps) {
4387 num_flushing_caps--;
4388 in->flushing_cap_tids.clear();
4389 }
4390 in->flushing_caps = 0;
28e407b8 4391 in->mark_caps_clean();
11fdf7f2 4392 put_inode(in.get());
7c673cae 4393 }
f6b5b4d7
TL
4394 caps &= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER;
4395 if (caps && !in->caps_issued_mask(caps, true)) {
f67539c2 4396 if (err == -CEPHFS_EBLOCKLISTED) {
f6b5b4d7
TL
4397 if (in->oset.dirty_or_tx) {
4398 lderr(cct) << __func__ << " still has dirty data on " << *in << dendl;
4399 in->set_async_err(err);
4400 }
4401 objectcacher->purge_set(&in->oset);
4402 } else {
4403 objectcacher->release_set(&in->oset);
4404 }
4405 _schedule_invalidate_callback(in.get(), 0, 0);
4406 }
4407
a8e16298 4408 signal_cond_list(in->waitfor_caps);
7c673cae
FG
4409 }
4410 s->flushing_caps_tids.clear();
9f95a23c 4411 sync_cond.notify_all();
7c673cae
FG
4412}
4413
1d09f67e 4414std::pair<int, bool> Client::_do_remount(bool retry_on_error)
b32b8144 4415{
adb31ebb 4416 uint64_t max_retries = cct->_conf.get_val<uint64_t>("mds_max_retries_on_remount_failure");
1d09f67e 4417 bool abort_on_failure = false;
91327a77 4418
b32b8144
FG
4419 errno = 0;
4420 int r = remount_cb(callback_handle);
91327a77
AA
4421 if (r == 0) {
4422 retries_on_invalidate = 0;
4423 } else {
b32b8144
FG
4424 int e = errno;
4425 client_t whoami = get_nodeid();
4426 if (r == -1) {
4427 lderr(cct) <<
4428 "failed to remount (to trim kernel dentries): "
4429 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4430 } else {
4431 lderr(cct) <<
4432 "failed to remount (to trim kernel dentries): "
4433 "return code = " << r << dendl;
4434 }
91327a77 4435 bool should_abort =
11fdf7f2
TL
4436 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4437 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
91327a77 4438 !(retry_on_error && (++retries_on_invalidate < max_retries));
f67539c2 4439 if (should_abort && !is_unmounting()) {
b32b8144 4440 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
1d09f67e 4441 abort_on_failure = true;
b32b8144
FG
4442 }
4443 }
1d09f67e 4444 return std::make_pair(r, abort_on_failure);
b32b8144
FG
4445}
4446
7c673cae
FG
4447class C_Client_Remount : public Context {
4448private:
4449 Client *client;
4450public:
4451 explicit C_Client_Remount(Client *c) : client(c) {}
4452 void finish(int r) override {
11fdf7f2 4453 ceph_assert(r == 0);
91327a77 4454 client->_do_remount(true);
7c673cae
FG
4455 }
4456};
4457
4458void Client::_invalidate_kernel_dcache()
4459{
f67539c2
TL
4460 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4461 if (!mref_reader.is_state_satisfied())
7c673cae 4462 return;
f67539c2 4463
94b18763
FG
4464 if (can_invalidate_dentries) {
4465 if (dentry_invalidate_cb && root->dir) {
4466 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4467 p != root->dir->dentries.end();
4468 ++p) {
4469 if (p->second->inode)
4470 _schedule_invalidate_dentry_callback(p->second, false);
4471 }
7c673cae
FG
4472 }
4473 } else if (remount_cb) {
4474 // Hacky:
4475 // when remounting a file system, linux kernel trims all unused dentries in the fs
4476 remount_finisher.queue(new C_Client_Remount(this));
4477 }
4478}
4479
91327a77
AA
4480void Client::_trim_negative_child_dentries(InodeRef& in)
4481{
4482 if (!in->is_dir())
4483 return;
4484
4485 Dir* dir = in->dir;
4486 if (dir && dir->dentries.size() == dir->num_null_dentries) {
4487 for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
4488 Dentry *dn = p->second;
4489 ++p;
11fdf7f2 4490 ceph_assert(!dn->inode);
91327a77
AA
4491 if (dn->lru_is_expireable())
4492 unlink(dn, true, false); // keep dir, drop dentry
4493 }
4494 if (dir->dentries.empty()) {
4495 close_dir(dir);
4496 }
4497 }
4498
4499 if (in->flags & I_SNAPDIR_OPEN) {
4500 InodeRef snapdir = open_snapdir(in.get());
4501 _trim_negative_child_dentries(snapdir);
4502 }
4503}
4504
e306af50
TL
4505class C_Client_CacheRelease : public Context {
4506private:
4507 Client *client;
4508 vinodeno_t ino;
4509public:
4510 C_Client_CacheRelease(Client *c, Inode *in) :
4511 client(c) {
4512 if (client->use_faked_inos())
4513 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
4514 else
4515 ino = in->vino();
4516 }
4517 void finish(int r) override {
4518 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
4519 client->_async_inode_release(ino);
4520 }
4521};
4522
4523void Client::_async_inode_release(vinodeno_t ino)
4524{
f67539c2
TL
4525 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
4526 if (!mref_reader.is_state_satisfied())
e306af50 4527 return;
f67539c2 4528
e306af50
TL
4529 ldout(cct, 10) << __func__ << " " << ino << dendl;
4530 ino_release_cb(callback_handle, ino);
4531}
4532
4533void Client::_schedule_ino_release_callback(Inode *in) {
4534
4535 if (ino_release_cb)
4536 // we queue the invalidate, which calls the callback and decrements the ref
4537 async_ino_releasor.queue(new C_Client_CacheRelease(this, in));
4538}
4539
28e407b8 4540void Client::trim_caps(MetaSession *s, uint64_t max)
7c673cae
FG
4541{
4542 mds_rank_t mds = s->mds_num;
28e407b8 4543 size_t caps_size = s->caps.size();
11fdf7f2 4544 ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
7c673cae
FG
4545 << " caps " << caps_size << dendl;
4546
28e407b8
AA
4547 uint64_t trimmed = 0;
4548 auto p = s->caps.begin();
4549 std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
4550 * looking at from getting deleted during traversal. */
7c673cae
FG
4551 while ((caps_size - trimmed) > max && !p.end()) {
4552 Cap *cap = *p;
11fdf7f2 4553 InodeRef in(&cap->inode);
7c673cae
FG
4554
4555 // Increment p early because it will be invalidated if cap
4556 // is deleted inside remove_cap
4557 ++p;
4558
4559 if (in->caps.size() > 1 && cap != in->auth_cap) {
4560 int mine = cap->issued | cap->implemented;
4561 int oissued = in->auth_cap ? in->auth_cap->issued : 0;
4562 // disposable non-auth cap
b32b8144 4563 if (!(get_caps_used(in.get()) & ~oissued & mine)) {
7c673cae 4564 ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
28e407b8 4565 cap = (remove_cap(cap, true), nullptr);
7c673cae
FG
4566 trimmed++;
4567 }
4568 } else {
4569 ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
91327a77 4570 _trim_negative_child_dentries(in);
7c673cae 4571 bool all = true;
11fdf7f2
TL
4572 auto q = in->dentries.begin();
4573 while (q != in->dentries.end()) {
4574 Dentry *dn = *q;
4575 ++q;
7c673cae
FG
4576 if (dn->lru_is_expireable()) {
4577 if (can_invalidate_dentries &&
b3b6e05e 4578 dn->dir->parent_inode->ino == CEPH_INO_ROOT) {
7c673cae
FG
4579 // Only issue one of these per DN for inodes in root: handle
4580 // others more efficiently by calling for root-child DNs at
4581 // the end of this function.
4582 _schedule_invalidate_dentry_callback(dn, true);
4583 }
28e407b8
AA
4584 ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
4585 to_trim.insert(dn);
7c673cae
FG
4586 } else {
4587 ldout(cct, 20) << " not expirable: " << dn->name << dendl;
4588 all = false;
4589 }
4590 }
b3b6e05e 4591 if (in->ll_ref == 1 && in->ino != CEPH_INO_ROOT) {
f91f0fd5
TL
4592 _schedule_ino_release_callback(in.get());
4593 }
b3b6e05e 4594 if (all && in->ino != CEPH_INO_ROOT) {
7c673cae
FG
4595 ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
4596 trimmed++;
4597 }
4598 }
4599 }
28e407b8
AA
4600 ldout(cct, 20) << " trimming queued dentries: " << dendl;
4601 for (const auto &dn : to_trim) {
4602 trim_dentry(dn);
4603 }
4604 to_trim.clear();
7c673cae 4605
b32b8144 4606 caps_size = s->caps.size();
11fdf7f2 4607 if (caps_size > (size_t)max)
7c673cae
FG
4608 _invalidate_kernel_dcache();
4609}
4610
4611void Client::force_session_readonly(MetaSession *s)
4612{
4613 s->readonly = true;
4614 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
11fdf7f2
TL
4615 auto &in = (*p)->inode;
4616 if (in.caps_wanted() & CEPH_CAP_FILE_WR)
4617 signal_cond_list(in.waitfor_caps);
7c673cae
FG
4618 }
4619}
4620
7c673cae
FG
4621int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4622{
4623 MetaSession *session = in->auth_cap->session;
4624
4625 int flushing = in->dirty_caps;
11fdf7f2 4626 ceph_assert(flushing);
7c673cae
FG
4627
4628 ceph_tid_t flush_tid = ++last_flush_tid;
4629 in->flushing_cap_tids[flush_tid] = flushing;
4630
4631 if (!in->flushing_caps) {
11fdf7f2 4632 ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
7c673cae
FG
4633 num_flushing_caps++;
4634 } else {
11fdf7f2 4635 ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
7c673cae
FG
4636 }
4637
4638 in->flushing_caps |= flushing;
28e407b8 4639 in->mark_caps_clean();
7c673cae
FG
4640
4641 if (!in->flushing_cap_item.is_on_list())
4642 session->flushing_caps.push_back(&in->flushing_cap_item);
4643 session->flushing_caps_tids.insert(flush_tid);
4644
4645 *ptid = flush_tid;
4646 return flushing;
4647}
4648
4649void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4650{
4651 for (auto &p : in->cap_snaps) {
4652 CapSnap &capsnap = p.second;
4653 if (capsnap.flush_tid > 0) {
4654 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4655 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4656 }
4657 }
4658 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4659 it != in->flushing_cap_tids.end();
4660 ++it) {
4661 old_s->flushing_caps_tids.erase(it->first);
4662 new_s->flushing_caps_tids.insert(it->first);
4663 }
4664 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4665}
4666
/*
 * Flush all the dirty caps back to the MDS. Because the callers
 * generally wait on the result of this function (syncfs and umount
 * cases), we set CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
 */
void Client::flush_caps_sync()
{
  ldout(cct, 10) << __func__ << dendl;
  for (auto &q : mds_sessions) {
    auto s = q.second;
    xlist<Inode*>::iterator p = s->dirty_list.begin();
    while (!p.end()) {
      unsigned flags = CHECK_CAPS_NODELAY;
      Inode *in = *p;

      // advance before check_caps(): flushing may unlink 'in' from dirty_list
      ++p;
      if (p.end())
        flags |= CHECK_CAPS_SYNCHRONOUS;
      check_caps(in, flags);
    }
  }
}
4689
7c673cae
FG
/*
 * Block until all cap flushes on @in with tid <= @want have been acked.
 * Sleeps on in->waitfor_caps, which handle_cap_flush_ack() signals.
 */
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  while (in->flushing_caps) {
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    ceph_assert(it != in->flushing_cap_tids.end());
    // the oldest outstanding tid is already newer than what we wait for
    if (it->first > want)
      break;
    ldout(cct, 10) << __func__ << " on " << *in << " flushing "
                   << ccap_string(it->second) << " want " << want
                   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
4703
/*
 * Block until every session has acked all cap flushes with tid <= @want.
 * After each wakeup we restart the scan from scratch (the goto), since
 * the session map and tid sets may change while the lock is dropped.
 */
void Client::wait_sync_caps(ceph_tid_t want)
{
 retry:
  ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
                 << num_flushing_caps << " total flushing)" << dendl;
  for (auto &p : mds_sessions) {
    auto s = p.second;
    if (s->flushing_caps_tids.empty())
      continue;
    ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
    if (oldest_tid <= want) {
      ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
                     << " (want " << want << ")" << dendl;
      // temporarily adopt client_lock so the condvar can release it while
      // sleeping, then hand ownership back without unlocking
      std::unique_lock l{client_lock, std::adopt_lock};
      sync_cond.wait(l);
      l.release();
      goto retry;
    }
  }
}
4724
eafe8130
TL
/*
 * Re-send all outstanding cap flushes (and interleaved capsnap flushes)
 * for @in to its auth MDS, e.g. after reconnect.  Entries in
 * flushing_cap_tids whose cap mask is zero represent capsnap flushes;
 * they are re-sent via send_flush_snap() in tid order, regular flushes
 * via send_cap().
 */
void Client::kick_flushing_caps(Inode *in, MetaSession *session)
{
  in->flags &= ~I_KICK_FLUSH;

  Cap *cap = in->auth_cap;
  ceph_assert(cap->session == session);

  // find the newest capsnap flush tid: cap flushes older than it must
  // tell the MDS a capsnap is still pending (FLAG_PENDING_CAPSNAP)
  ceph_tid_t last_snap_flush = 0;
  for (auto p = in->flushing_cap_tids.rbegin();
       p != in->flushing_cap_tids.rend();
       ++p) {
    if (!p->second) {
      last_snap_flush = p->first;
      break;
    }
  }

  int wanted = in->caps_wanted();
  int used = get_caps_used(in) | in->caps_dirty();
  auto it = in->cap_snaps.begin();
  for (auto& p : in->flushing_cap_tids) {
    if (p.second) {
      int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
      send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
               p.second, p.first);
    } else {
      // zero mask == capsnap flush; cap_snaps iterates in the same tid order
      ceph_assert(it != in->cap_snaps.end());
      ceph_assert(it->second.flush_tid == p.first);
      send_flush_snap(in, session, it->first, it->second);
      ++it;
    }
  }
}
4758
7c673cae
FG
/*
 * Kick every inode on @session's flushing list that was marked
 * I_KICK_FLUSH (by early_kick_flushing_caps()).
 */
void Client::kick_flushing_caps(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    if (in->flags & I_KICK_FLUSH) {
      ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
      kick_flushing_caps(in, session);
    }
  }
}
4772
/*
 * Called during the client-reconnect stage, before the reconnect message
 * is sent.  Flushes whose caps were revoked are re-sent immediately so
 * the MDS processes them before re-issuing those caps elsewhere; the
 * rest are just flagged I_KICK_FLUSH to be re-sent after reconnect by
 * kick_flushing_caps(session).
 */
void Client::early_kick_flushing_caps(MetaSession *session)
{
  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    Cap *cap = in->auth_cap;
    ceph_assert(cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
      in->flags |= I_KICK_FLUSH;
      continue;
    }

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
                   << " to mds." << session->mds_num << dendl;
    // send_reconnect() also will reset these sequence numbers. make sure
    // sequence numbers in cap flush message match later reconnect message.
    cap->seq = 0;
    cap->issue_seq = 0;
    cap->mseq = 0;
    cap->issued = cap->implemented;

    kick_flushing_caps(in, session);
  }
}
4800
7c673cae
FG
4801void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4802{
4803 list<SnapRealm*> q;
4804 q.push_back(realm);
4805
4806 while (!q.empty()) {
4807 realm = q.front();
4808 q.pop_front();
4809
11fdf7f2 4810 ldout(cct, 10) << __func__ << " " << *realm << dendl;
7c673cae
FG
4811 realm->invalidate_cache();
4812
4813 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4814 p != realm->pchildren.end();
4815 ++p)
4816 q.push_back(*p);
4817 }
4818}
4819
4820SnapRealm *Client::get_snap_realm(inodeno_t r)
4821{
4822 SnapRealm *realm = snap_realms[r];
4823 if (!realm)
4824 snap_realms[r] = realm = new SnapRealm(r);
11fdf7f2 4825 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4826 realm->nref++;
4827 return realm;
4828}
4829
4830SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4831{
4832 if (snap_realms.count(r) == 0) {
11fdf7f2 4833 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
7c673cae
FG
4834 return NULL;
4835 }
4836 SnapRealm *realm = snap_realms[r];
11fdf7f2 4837 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4838 realm->nref++;
4839 return realm;
4840}
4841
/*
 * Drop one reference on @realm.  On the last reference, unlink it from
 * the realm map and from its parent (recursively dropping the ref the
 * parent link held) and delete it.
 */
void Client::put_snap_realm(SnapRealm *realm)
{
  ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
                 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
  if (--realm->nref == 0) {
    snap_realms.erase(realm->ino);
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    delete realm;
  }
}
4855
/*
 * Re-parent @realm under the realm of ino @parent if it changed:
 * release the old parent link (and its reference) and pin the new one.
 * Returns true iff the parent actually changed.
 */
bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
{
  if (realm->parent != parent) {
    ldout(cct, 10) << __func__ << " " << *realm
                   << " " << realm->parent << " -> " << parent << dendl;
    realm->parent = parent;
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    realm->pparent = get_snap_realm(parent);
    realm->pparent->pchildren.insert(realm);
    return true;
  }
  return false;
}
4872
4873static bool has_new_snaps(const SnapContext& old_snapc,
4874 const SnapContext& new_snapc)
4875{
4876 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
4877}
4878
4879
11fdf7f2 4880void Client::update_snap_trace(const bufferlist& bl, SnapRealm **realm_ret, bool flush)
7c673cae
FG
4881{
4882 SnapRealm *first_realm = NULL;
11fdf7f2 4883 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
7c673cae
FG
4884
4885 map<SnapRealm*, SnapContext> dirty_realms;
4886
11fdf7f2 4887 auto p = bl.cbegin();
7c673cae
FG
4888 while (!p.end()) {
4889 SnapRealmInfo info;
11fdf7f2 4890 decode(info, p);
7c673cae
FG
4891 SnapRealm *realm = get_snap_realm(info.ino());
4892
4893 bool invalidate = false;
4894
4895 if (info.seq() > realm->seq) {
11fdf7f2 4896 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
7c673cae
FG
4897 << dendl;
4898
4899 if (flush) {
4900 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4901 // flush me + children
4902 list<SnapRealm*> q;
4903 q.push_back(realm);
4904 while (!q.empty()) {
4905 SnapRealm *realm = q.front();
4906 q.pop_front();
4907
4908 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4909 p != realm->pchildren.end();
4910 ++p)
4911 q.push_back(*p);
4912
4913 if (dirty_realms.count(realm) == 0) {
4914 realm->nref++;
4915 dirty_realms[realm] = realm->get_snap_context();
4916 }
4917 }
4918 }
4919
4920 // update
4921 realm->seq = info.seq();
4922 realm->created = info.created();
4923 realm->parent_since = info.parent_since();
4924 realm->prior_parent_snaps = info.prior_parent_snaps;
4925 realm->my_snaps = info.my_snaps;
4926 invalidate = true;
4927 }
4928
4929 // _always_ verify parent
4930 if (adjust_realm_parent(realm, info.parent()))
4931 invalidate = true;
4932
4933 if (invalidate) {
4934 invalidate_snaprealm_and_children(realm);
11fdf7f2 4935 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
7c673cae
FG
4936 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4937 } else {
11fdf7f2 4938 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
7c673cae
FG
4939 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4940 }
f67539c2 4941
7c673cae
FG
4942 if (!first_realm)
4943 first_realm = realm;
4944 else
4945 put_snap_realm(realm);
4946 }
4947
f67539c2 4948 for (auto &[realm, snapc] : dirty_realms) {
7c673cae 4949 // if there are new snaps ?
f67539c2 4950 if (has_new_snaps(snapc, realm->get_snap_context())) {
7c673cae 4951 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
f67539c2
TL
4952 for (auto&& in : realm->inodes_with_caps) {
4953 queue_cap_snap(in, snapc);
7c673cae
FG
4954 }
4955 } else {
4956 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4957 }
4958 put_snap_realm(realm);
4959 }
4960
4961 if (realm_ret)
4962 *realm_ret = first_realm;
4963 else
4964 put_snap_realm(first_realm);
4965}
4966
/*
 * Handle an MClientSnap notification from an MDS.  For a SPLIT op the
 * listed inodes (and child realms) are moved under the newly split-off
 * realm; in all cases the embedded snap trace is then applied.  Moved
 * inodes get cap snaps queued (against their old snap context) if the
 * destination realm carries snapshots the old context lacked.
 */
void Client::handle_snap(const MConstRef<MClientSnap>& m)
{
  ldout(cct, 10) << __func__ << " " << *m << dendl;
  mds_rank_t mds = mds_rank_t(m->get_source().num());

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  got_mds_push(session.get());

  map<Inode*, SnapContext> to_move;
  SnapRealm *realm = 0;

  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
    ceph_assert(m->head.split);
    SnapRealmInfo info;
    auto p = m->bl.cbegin();
    decode(info, p);
    ceph_assert(info.ino() == m->head.split);

    // flush, then move, ino's.
    realm = get_snap_realm(info.ino());
    ldout(cct, 10) << " splitting off " << *realm << dendl;
    for (auto& ino : m->split_inos) {
      vinodeno_t vino(ino, CEPH_NOSNAP);
      if (inode_map.count(vino)) {
        Inode *in = inode_map[vino];
        if (!in->snaprealm || in->snaprealm == realm)
          continue;
        // never pull an inode out of a realm newer than the split realm
        if (in->snaprealm->created > info.created()) {
          ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
                         << *in->snaprealm << dendl;
          continue;
        }
        ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;


        in->snaprealm_item.remove_myself();
        // remember the old snap context so cap snaps can be queued below
        to_move[in] = in->snaprealm->get_snap_context();
        put_snap_realm(in->snaprealm);
      }
    }

    // move child snaprealms, too
    for (auto& child_realm : m->split_realms) {
      ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
      SnapRealm *child = get_snap_realm_maybe(child_realm);
      if (!child)
        continue;
      adjust_realm_parent(child, realm->ino);
      put_snap_realm(child);
    }
  }

  // on DESTROY, don't flush against the (now stale) old snap contexts
  update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);

  if (realm) {
    // attach the moved inodes to the split-off realm
    for (auto p = to_move.begin(); p != to_move.end(); ++p) {
      Inode *in = p->first;
      in->snaprealm = realm;
      realm->inodes_with_caps.push_back(&in->snaprealm_item);
      realm->nref++;
      // queue for snap writeback
      if (has_new_snaps(p->second, realm->get_snap_context()))
        queue_cap_snap(in, p->second);
    }
    put_snap_realm(realm);
  }
}
5039
11fdf7f2 5040void Client::handle_quota(const MConstRef<MClientQuota>& m)
7c673cae
FG
5041{
5042 mds_rank_t mds = mds_rank_t(m->get_source().num());
f67539c2
TL
5043
5044 std::scoped_lock cl(client_lock);
20effc67 5045 auto session = _get_mds_session(mds, m->get_connection().get());
7c673cae 5046 if (!session) {
7c673cae
FG
5047 return;
5048 }
5049
20effc67 5050 got_mds_push(session.get());
7c673cae 5051
11fdf7f2 5052 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
7c673cae
FG
5053
5054 vinodeno_t vino(m->ino, CEPH_NOSNAP);
5055 if (inode_map.count(vino)) {
5056 Inode *in = NULL;
5057 in = inode_map[vino];
5058
5059 if (in) {
5060 in->quota = m->quota;
5061 in->rstat = m->rstat;
5062 }
5063 }
7c673cae
FG
5064}
5065
/*
 * Top-level dispatcher for MClientCaps messages.  Resolves the session
 * and inode, maintains the OSD epoch barrier, then routes to the
 * per-op handler.  Note that IMPORT intentionally falls through to the
 * grant handling below after handle_cap_import() runs.
 */
void Client::handle_caps(const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = mds_rank_t(m->get_source().num());

  std::scoped_lock cl(client_lock);
  auto session = _get_mds_session(mds, m->get_connection().get());
  if (!session) {
    return;
  }

  if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
    // Pause RADOS operations until we see the required epoch
    objecter->set_epoch_barrier(m->osd_epoch_barrier);
  }

  if (m->osd_epoch_barrier > cap_epoch_barrier) {
    // Record the barrier so that we will transmit it to MDS when releasing
    set_cap_epoch_barrier(m->osd_epoch_barrier);
  }

  got_mds_push(session.get());

  Inode *in;
  vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
  if (auto it = inode_map.find(vino); it != inode_map.end()) {
    in = it->second;
  } else {
    // we don't know this inode: release an IMPORTed cap right back,
    // otherwise just drop the message
    if (m->get_op() == CEPH_CAP_OP_IMPORT) {
      ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
      session->enqueue_cap_release(
        m->get_ino(),
        m->get_cap_id(),
        m->get_seq(),
        m->get_mseq(),
        cap_epoch_barrier);
    } else {
      ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
    }

    // in case the mds is waiting on e.g. a revocation
    flush_cap_releases();
    return;
  }

  switch (m->get_op()) {
    case CEPH_CAP_OP_EXPORT: return handle_cap_export(session.get(), in, m);
    case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session.get(), in, m);
    case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session.get(), in, m);
  }

  // ops below require that we already hold a cap on this inode from mds
  if (auto it = in->caps.find(mds); it != in->caps.end()) {
    Cap &cap = in->caps.at(mds);

    switch (m->get_op()) {
      case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session.get(), in, m);
      case CEPH_CAP_OP_IMPORT:
      case CEPH_CAP_OP_REVOKE:
      case CEPH_CAP_OP_GRANT: return handle_cap_grant(session.get(), in, &cap, m);
      case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session.get(), in, &cap, m);
    }
  } else {
    ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
    return;
  }
}
5131
/*
 * Handle CEPH_CAP_OP_IMPORT: a peer MDS migrated this inode's caps to
 * @session's MDS.  Install/update the auth cap from the message, drop
 * the old peer cap, and kick pending flushes toward the new auth MDS.
 * handle_caps() falls through to handle_cap_grant() afterwards.
 */
void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
                << " IMPORT from mds." << mds << dendl;

  const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
  Cap *cap = NULL;
  UserPerm cap_perms;
  // carry the perms of the cap we're migrating from, if we still hold it
  if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
    cap = &it->second;
    cap_perms = cap->latest_perms;
  }

  // add/update it
  SnapRealm *realm = NULL;
  update_snap_trace(m->snapbl, &realm);

  int issued = m->get_caps();
  int wanted = m->get_wanted();
  add_update_cap(in, session, m->get_cap_id(),
                 issued, wanted, m->get_seq(), m->get_mseq(),
                 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);

  if (cap && cap->cap_id == m->peer.cap_id) {
    remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
  }

  if (realm)
    put_snap_realm(realm);

  if (in->auth_cap && in->auth_cap->session == session) {
    if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
        in->requested_max_size > m->get_max_size()) {
      in->requested_max_size = 0;
      ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
    }
    // reflush any/all caps (if we are now the auth_cap)
    kick_flushing_caps(in, session);
  }
}
5174
/*
 * Handle CEPH_CAP_OP_EXPORT: @session's MDS is migrating our cap away.
 * If the message names a peer cap, merge this cap's issued bits into
 * the (possibly pre-existing) cap on the peer session, transferring
 * auth status and in-flight flush tids as needed; with no peer, just
 * record that caps were dropped (I_CAP_DROPPED).  The local cap is then
 * removed.
 */
void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;

  ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
                << " EXPORT from mds." << mds << dendl;

  auto it = in->caps.find(mds);
  if (it != in->caps.end()) {
    Cap &cap = it->second;
    if (cap.cap_id == m->get_cap_id()) {
      if (m->peer.cap_id) {
        const auto peer_mds = mds_rank_t(m->peer.mds);
        auto tsession = _get_or_open_mds_session(peer_mds);
        auto it = in->caps.find(peer_mds);
        if (it != in->caps.end()) {
          Cap &tcap = it->second;
          // only merge if the peer cap matches and is older than the export
          if (tcap.cap_id == m->peer.cap_id &&
              ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
            tcap.cap_id = m->peer.cap_id;
            tcap.seq = m->peer.seq - 1;
            tcap.issue_seq = tcap.seq;
            tcap.issued |= cap.issued;
            tcap.implemented |= cap.issued;
            if (&cap == in->auth_cap)
              in->auth_cap = &tcap;
            if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
              adjust_session_flushing_caps(in, session, tsession.get());
          }
        } else {
          // no cap on the peer session yet: create one from this cap's state
          add_update_cap(in, tsession.get(), m->peer.cap_id, cap.issued, 0,
                         m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
                         &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
                         cap.latest_perms);
        }
      } else {
        // no peer: caps are simply being dropped by the migration
        if (cap.wanted | cap.issued)
          in->flags |= I_CAP_DROPPED;
      }

      remove_cap(&cap, false);
    }
  }
}
5219
/*
 * Handle CEPH_CAP_OP_TRUNC: the MDS truncated the file.  Apply the new
 * size/truncate state to the inode, taking locally issued/dirty caps
 * into account via update_inode_file_size().
 */
void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));

  ldout(cct, 10) << __func__ << " on ino " << *in
                 << " size " << in->size << " -> " << m->get_size()
                 << dendl;

  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();
  update_inode_file_size(in, issued, m->get_size(),
                         m->get_truncate_seq(), m->get_truncate_size());
}
5235
11fdf7f2 5236void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
7c673cae
FG
5237{
5238 ceph_tid_t flush_ack_tid = m->get_client_tid();
5239 int dirty = m->get_dirty();
5240 int cleaned = 0;
5241 int flushed = 0;
5242
11fdf7f2
TL
5243 auto it = in->flushing_cap_tids.begin();
5244 if (it->first < flush_ack_tid) {
5245 ldout(cct, 0) << __func__ << " mds." << session->mds_num
5246 << " got unexpected flush ack tid " << flush_ack_tid
5247 << " expected is " << it->first << dendl;
5248 }
5249 for (; it != in->flushing_cap_tids.end(); ) {
eafe8130
TL
5250 if (!it->second) {
5251 // cap snap
5252 ++it;
5253 continue;
5254 }
7c673cae
FG
5255 if (it->first == flush_ack_tid)
5256 cleaned = it->second;
5257 if (it->first <= flush_ack_tid) {
5258 session->flushing_caps_tids.erase(it->first);
5259 in->flushing_cap_tids.erase(it++);
5260 ++flushed;
5261 continue;
5262 }
5263 cleaned &= ~it->second;
5264 if (!cleaned)
5265 break;
5266 ++it;
5267 }
5268
11fdf7f2 5269 ldout(cct, 5) << __func__ << " mds." << session->mds_num
7c673cae
FG
5270 << " cleaned " << ccap_string(cleaned) << " on " << *in
5271 << " with " << ccap_string(dirty) << dendl;
5272
5273 if (flushed) {
5274 signal_cond_list(in->waitfor_caps);
5275 if (session->flushing_caps_tids.empty() ||
5276 *session->flushing_caps_tids.begin() > flush_ack_tid)
9f95a23c 5277 sync_cond.notify_all();
7c673cae
FG
5278 }
5279
5280 if (!dirty) {
5281 in->cap_dirtier_uid = -1;
5282 in->cap_dirtier_gid = -1;
5283 }
5284
5285 if (!cleaned) {
5286 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5287 } else {
5288 if (in->flushing_caps) {
5289 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5290 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5291 in->flushing_caps &= ~cleaned;
5292 if (in->flushing_caps == 0) {
5293 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5294 num_flushing_caps--;
eafe8130 5295 if (in->flushing_cap_tids.empty())
7c673cae
FG
5296 in->flushing_cap_item.remove_myself();
5297 }
5298 if (!in->caps_dirty())
5299 put_inode(in);
5300 }
5301 }
7c673cae
FG
5302}
5303
5304
/*
 * Handle FLUSHSNAP_ACK: the MDS has persisted a previously flushed cap
 * snap.  Drop the matching CapSnap, retire its flush tid, and wake
 * waiters; a non-matching tid or unknown follows (duplicate ack from a
 * re-sent FLUSHSNAP) is logged and ignored.
 */
void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
{
  ceph_tid_t flush_ack_tid = m->get_client_tid();
  mds_rank_t mds = session->mds_num;
  ceph_assert(in->caps.count(mds));
  snapid_t follows = m->get_snap_follows();

  if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
    auto& capsnap = it->second;
    if (flush_ack_tid != capsnap.flush_tid) {
      ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
    } else {
      // hold a ref: erasing the cap snap may drop the inode's last ref
      InodeRef tmp_ref(in);
      ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
                    << " on " << *in << dendl;
      session->flushing_caps_tids.erase(capsnap.flush_tid);
      in->flushing_cap_tids.erase(capsnap.flush_tid);
      if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
        in->flushing_cap_item.remove_myself();
      in->cap_snaps.erase(it);

      signal_cond_list(in->waitfor_caps);
      // wake syncfs/umount waiters once nothing older is in flight
      if (session->flushing_caps_tids.empty() ||
          *session->flushing_caps_tids.begin() > flush_ack_tid)
        sync_cond.notify_all();
    }
  } else {
    ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
                  << " on " << *in << dendl;
    // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
  }
}
5337
/*
 * Finisher context that invalidates one dentry (and optionally its
 * inode) via the client's registered dentry_invalidate_cb.  All
 * identifying state (dir ino, target ino, name) is copied at
 * construction time, since the Dentry may be freed before finish()
 * runs on the async invalidator thread.
 */
class C_Client_DentryInvalidate : public Context {
private:
  Client *client;
  vinodeno_t dirino;  // parent directory (faked ino if use_faked_inos())
  vinodeno_t ino;     // target inode; zeroed ino when not deleting
  string name;
public:
  C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
    client(c), name(dn->name) {
    if (client->use_faked_inos()) {
      dirino.ino = dn->dir->parent_inode->faked_ino;
      if (del)
	ino.ino = dn->inode->faked_ino;
    } else {
      dirino = dn->dir->parent_inode->vino();
      if (del)
	ino = dn->inode->vino();
    }
    if (!del)
      ino.ino = inodeno_t();
  }
  void finish(int r) override {
    // _async_dentry_invalidate is responsible for its own locking
    ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
    client->_async_dentry_invalidate(dirino, ino, name);
  }
};
5365
/*
 * Invoke the application's dentry-invalidate callback (queued via
 * async_dentry_invalidator; runs without client_lock held).  No-op once
 * the client is no longer in at least the MOUNTING state.
 */
void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
                 << " in dir " << dirino << dendl;
  dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length());
}
5376
5377void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
5378{
5379 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
5380 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
5381}
5382
/*
 * Try to release the references pinning @in so it can be trimmed:
 * expire child dentries (recursing into snapshot subtrees), close an
 * emptied Dir, drop an open snapdir, and finally unlink the inode's own
 * dentries.  When @sched_inval is set, schedule dcache-invalidate
 * upcalls for dentries the application still references.
 */
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_nref();
  ldout(cct, 5) << __func__ << " in " << *in <<dendl;

  if (in->dir && !in->dir->dentries.empty()) {
    for (auto p = in->dir->dentries.begin();
         p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance first: unlink() may erase the current entry
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
        _try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
        unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // closing the Dir released one reference on 'in'
    }
  }

  if (ref > 1 && (in->flags & I_SNAPDIR_OPEN)) {
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 1) {
    auto q = in->dentries.begin();
    while (q != in->dentries.end()) {
      Dentry *dn = *q;
      ++q;  // advance first: unlink() removes dn from in->dentries
      if( in->ll_ref > 0 && sched_inval) {
        // FIXME: we play lots of unlink/link tricks when handling MDS replies,
        // so in->dentries doesn't always reflect the state of kernel's dcache.
        _schedule_invalidate_dentry_callback(dn, true);
      }
      unlink(dn, true, true);
    }
  }
}
5429
/*
 * Handle a cap GRANT/REVOKE (also reached after IMPORT) on @in from
 * mds @session.  Applies any inode metadata carried by the message
 * (gated on which caps we hold exclusively), then reconciles the cap's
 * issued/implemented bits with the new grant:
 *  - on revocation, flush buffers / release cache as required before
 *    acking via check_caps();
 *  - on plain grant, check whether a non-auth MDS is revoking the newly
 *    granted caps.
 * Finally wakes cap waiters and, if nlink dropped to zero, tries to
 * trim the now-deleted inode.
 */
void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
{
  mds_rank_t mds = session->mds_num;
  int used = get_caps_used(in);
  int wanted = in->caps_wanted();
  int flags = 0;

  const unsigned new_caps = m->get_caps();
  // cap is stale if the session's cap generation moved past the cap's
  const bool was_stale = session->cap_gen > cap->gen;
  ldout(cct, 5) << __func__ << " on in " << m->get_ino()
                << " mds." << mds << " seq " << m->get_seq()
                << " caps now " << ccap_string(new_caps)
                << " was " << ccap_string(cap->issued)
                << (was_stale ? " (stale)" : "") << dendl;

  if (was_stale)
    cap->issued = cap->implemented = CEPH_CAP_PIN;
  cap->seq = m->get_seq();
  cap->gen = session->cap_gen;

  check_cap_issue(in, new_caps);

  // update inode
  int issued;
  in->caps_issued(&issued);
  issued |= in->caps_dirty();

  // only accept MDS-supplied metadata for fields we don't hold EXCL on
  if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
      !(issued & CEPH_CAP_AUTH_EXCL)) {
    in->mode = m->head.mode;
    in->uid = m->head.uid;
    in->gid = m->head.gid;
    in->btime = m->btime;
  }
  bool deleted_inode = false;
  if ((new_caps & CEPH_CAP_LINK_SHARED) &&
      !(issued & CEPH_CAP_LINK_EXCL)) {
    in->nlink = m->head.nlink;
    if (in->nlink == 0)
      deleted_inode = true;
  }
  if (!(issued & CEPH_CAP_XATTR_EXCL) &&
      m->xattrbl.length() &&
      m->head.xattr_version > in->xattr_version) {
    auto p = m->xattrbl.cbegin();
    decode(in->xattrs, p);
    in->xattr_version = m->head.xattr_version;
  }

  if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
    in->dirstat.nfiles = m->get_nfiles();
    in->dirstat.nsubdirs = m->get_nsubdirs();
  }

  if (new_caps & CEPH_CAP_ANY_RD) {
    update_inode_file_time(in, issued, m->get_time_warp_seq(),
                           m->get_ctime(), m->get_mtime(), m->get_atime());
  }

  if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
    in->layout = m->get_layout();
    update_inode_file_size(in, issued, m->get_size(),
                           m->get_truncate_seq(), m->get_truncate_size());
  }

  if (m->inline_version > in->inline_version) {
    in->inline_data = m->inline_data;
    in->inline_version = m->inline_version;
  }

  /* always take a newer change attr */
  if (m->get_change_attr() > in->change_attr)
    in->change_attr = m->get_change_attr();

  // max_size
  if (cap == in->auth_cap &&
      (new_caps & CEPH_CAP_ANY_FILE_WR) &&
      (m->get_max_size() != in->max_size)) {
    ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
    in->max_size = m->get_max_size();
    if (in->max_size > in->wanted_max_size) {
      in->wanted_max_size = 0;
      in->requested_max_size = 0;
    }
  }

  bool check = false;
  if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
      (wanted & ~(cap->wanted | new_caps))) {
    // If mds is importing cap, prior cap messages that update 'wanted'
    // may get dropped by mds (migrate seq mismatch).
    //
    // We don't send cap message to update 'wanted' if what we want are
    // already issued. If mds revokes caps, cap message that releases caps
    // also tells mds what we want. But if caps got revoked by mds forcedly
    // (session stale). We may haven't told mds what we want.
    check = true;
  }


  // update caps
  auto revoked = cap->issued & ~new_caps;
  if (revoked) {
    ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    // recall delegations if we're losing caps necessary for them
    if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
      in->recall_deleg(false);
    else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
      in->recall_deleg(true);

    used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
    if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
        !_flush(in, new C_Client_FlushComplete(this, in))) {
      // waitin' for flush
    } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
      if (_release(in)) {
        check = true;
        flags = CHECK_CAPS_NODELAY;
      }
    } else {
      cap->wanted = 0; // don't let check_caps skip sending a response to MDS
      check = true;
      flags = CHECK_CAPS_NODELAY;
    }
  } else if (cap->issued == new_caps) {
    ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
  } else {
    ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
    cap->issued = new_caps;
    cap->implemented |= new_caps;

    if (cap == in->auth_cap) {
      // non-auth MDS is revoking the newly grant caps ?
      for (const auto &p : in->caps) {
        if (&p.second == cap)
          continue;
        if (p.second.implemented & ~p.second.issued & new_caps) {
          check = true;
          break;
        }
      }
    }
  }

  if (check)
    check_caps(in, flags);

  // wake up waiters
  if (new_caps)
    signal_cond_list(in->waitfor_caps);

  // may drop inode's last ref
  if (deleted_inode)
    _try_to_trim_inode(in, true);
}
5588
7c673cae
FG
5589int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5590{
b3b6e05e
TL
5591 if (perms.uid() == 0) {
5592 // Executable are overridable when there is at least one exec bit set
5593 if((want & MAY_EXEC) && !(in->mode & S_IXUGO))
5594 return -CEPHFS_EACCES;
7c673cae 5595 return 0;
b3b6e05e 5596 }
7c673cae
FG
5597
5598 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5599 int ret = _posix_acl_permission(in, perms, want);
f67539c2 5600 if (ret != -CEPHFS_EAGAIN)
7c673cae
FG
5601 return ret;
5602 }
5603
5604 // check permissions before doing anything else
5605 if (!in->check_mode(perms, want))
f67539c2 5606 return -CEPHFS_EACCES;
7c673cae
FG
5607 return 0;
5608}
5609
5610int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5611 const UserPerm& perms)
5612{
5613 int r = _getattr_for_perm(in, perms);
5614 if (r < 0)
5615 goto out;
5616
5617 r = 0;
5618 if (strncmp(name, "system.", 7) == 0) {
5619 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
f67539c2 5620 r = -CEPHFS_EPERM;
7c673cae
FG
5621 } else {
5622 r = inode_permission(in, perms, want);
5623 }
5624out:
1adf2230 5625 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
7c673cae
FG
5626 return r;
5627}
5628
20effc67 5629std::ostream& operator<<(std::ostream &out, const UserPerm& perm) {
7c673cae
FG
5630 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5631 return out;
5632}
5633
// Check whether `perms` may apply the attribute changes described by
// (stx, mask) to inode `in` (chown/chgrp/chmod/truncate/timestamps),
// mirroring the kernel's setattr rules.  May clear S_ISGID from
// stx->stx_mode as a side effect.  Returns 0 if permitted, a negative
// CEPHFS_* error otherwise.
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
  // refresh mode/ownership (and xattrs when ACLs are enabled)
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    // truncating requires write permission on the file itself
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  // from here on the default outcome is EPERM unless a rule below allows it
  r = -CEPHFS_EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // only root may change ownership; the owner may only "change" uid to itself
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // the owner may change the group only to one it belongs to (or keep it)
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
			     (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    if (perms.uid() != 0 && perms.uid() != in->uid)
      goto out;

    // a non-root caller outside the file's (possibly new) group loses setgid
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // explicit timestamps require ownership; the *_NOW forms
      // (utimes(NULL)-style) only require write permission
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5690
5691int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5692{
181888fb 5693 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5694 unsigned want = 0;
5695
5696 if ((flags & O_ACCMODE) == O_WRONLY)
5697 want = MAY_WRITE;
5698 else if ((flags & O_ACCMODE) == O_RDWR)
5699 want = MAY_READ | MAY_WRITE;
5700 else if ((flags & O_ACCMODE) == O_RDONLY)
5701 want = MAY_READ;
5702 if (flags & O_TRUNC)
5703 want |= MAY_WRITE;
5704
5705 int r = 0;
5706 switch (in->mode & S_IFMT) {
5707 case S_IFLNK:
f67539c2 5708 r = -CEPHFS_ELOOP;
7c673cae
FG
5709 goto out;
5710 case S_IFDIR:
5711 if (want & MAY_WRITE) {
f67539c2 5712 r = -CEPHFS_EISDIR;
7c673cae
FG
5713 goto out;
5714 }
5715 break;
5716 }
5717
5718 r = _getattr_for_perm(in, perms);
5719 if (r < 0)
5720 goto out;
5721
5722 r = inode_permission(in, perms, want);
5723out:
5724 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5725 return r;
5726}
5727
5728int Client::may_lookup(Inode *dir, const UserPerm& perms)
5729{
181888fb 5730 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5731 int r = _getattr_for_perm(dir, perms);
5732 if (r < 0)
5733 goto out;
5734
5735 r = inode_permission(dir, perms, MAY_EXEC);
5736out:
5737 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5738 return r;
5739}
5740
5741int Client::may_create(Inode *dir, const UserPerm& perms)
5742{
181888fb 5743 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5744 int r = _getattr_for_perm(dir, perms);
5745 if (r < 0)
5746 goto out;
5747
5748 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5749out:
5750 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5751 return r;
5752}
5753
5754int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5755{
181888fb 5756 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
7c673cae
FG
5757 int r = _getattr_for_perm(dir, perms);
5758 if (r < 0)
5759 goto out;
5760
5761 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5762 if (r < 0)
5763 goto out;
5764
f67539c2 5765 /* 'name == NULL' means rmsnap w/o permission checks */
7c673cae
FG
5766 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5767 InodeRef otherin;
5768 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5769 if (r < 0)
5770 goto out;
5771 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
f67539c2 5772 r = -CEPHFS_EPERM;
7c673cae
FG
5773 }
5774out:
5775 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5776 return r;
5777}
5778
f67539c2
TL
5779int Client::may_delete(const char *relpath, const UserPerm& perms) {
5780 ldout(cct, 20) << __func__ << " " << relpath << "; " << perms << dendl;
5781
5782 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5783 if (!mref_reader.is_state_satisfied())
5784 return -ENOTCONN;
5785
5786 filepath path(relpath);
5787 string name = path.last_dentry();
5788 path.pop_dentry();
5789 InodeRef dir;
5790
5791 std::scoped_lock lock(client_lock);
5792 int r = path_walk(path, &dir, perms);
5793 if (r < 0)
5794 return r;
5795 if (cct->_conf->client_permissions) {
5796 int r = may_delete(dir.get(), name.c_str(), perms);
5797 if (r < 0)
5798 return r;
5799 }
5800
5801 return 0;
5802}
5803
7c673cae
FG
5804int Client::may_hardlink(Inode *in, const UserPerm& perms)
5805{
181888fb 5806 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5807 int r = _getattr_for_perm(in, perms);
5808 if (r < 0)
5809 goto out;
5810
5811 if (perms.uid() == 0 || perms.uid() == in->uid) {
5812 r = 0;
5813 goto out;
5814 }
5815
f67539c2 5816 r = -CEPHFS_EPERM;
7c673cae
FG
5817 if (!S_ISREG(in->mode))
5818 goto out;
5819
5820 if (in->mode & S_ISUID)
5821 goto out;
5822
5823 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5824 goto out;
5825
5826 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5827out:
5828 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5829 return r;
5830}
5831
5832int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5833{
5834 int mask = CEPH_STAT_CAP_MODE;
5835 bool force = false;
5836 if (acl_type != NO_ACL) {
5837 mask |= CEPH_STAT_CAP_XATTR;
5838 force = in->xattr_version == 0;
5839 }
5840 return _getattr(in, mask, perms, force);
5841}
5842
// Return the versioned inode number (inode number + snapshot id) for `in`.
vinodeno_t Client::_get_vino(Inode *in)
{
  /* The caller must hold the client lock */
  return vinodeno_t(in->ino, in->snapid);
}
5848
7c673cae
FG
/**
 * Resolve an MDS spec to a list of MDS daemon GIDs.
 *
 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
 * It may be '*' in which case it matches all GIDs.
 *
 * If no error is returned, the `targets` vector will be populated with at least
 * one MDS.
 */
int Client::resolve_mds(
    const std::string &mds_spec,
    std::vector<mds_gid_t> *targets)
{
  // requires a current FSMap (see fetch_fsmap())
  ceph_assert(fsmap);
  ceph_assert(targets != nullptr);

  // 1) try to parse the spec as a role (rank or filesystem:rank)
  mds_role_t role;
  CachedStackStringStream css;
  int role_r = fsmap->parse_role(mds_spec, &role, *css);
  if (role_r == 0) {
    // We got a role, resolve it to a GID
    auto& info = fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank);
    ldout(cct, 10) << __func__ << ": resolved " << mds_spec << " to role '"
      << role << "' aka " << info.human_name() << dendl;
    targets->push_back(info.global_id);
    return 0;
  }

  // 2) try to parse it as a bare numeric GID
  std::string strtol_err;
  long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
  if (strtol_err.empty()) {
    // It is a possible GID
    const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
    if (fsmap->gid_exists(mds_gid)) {
      auto& info = fsmap->get_info_gid(mds_gid);
      ldout(cct, 10) << __func__ << ": validated gid " << mds_gid << " aka "
		     << info.human_name() << dendl;
      targets->push_back(mds_gid);
      return 0;
    } else {
      lderr(cct) << __func__ << ": gid " << mds_gid << " not in MDS map"
		 << dendl;
      lderr(cct) << "FSMap: " << *fsmap << dendl;
      return -CEPHFS_ENOENT;
    }
  } else if (mds_spec == "*") {
    // 3) It is a wildcard: use all MDSs
    const auto& mds_info = fsmap->get_mds_info();

    ldout(cct, 10) << __func__ << ": resolving `*' to all MDS daemons" << dendl;
    if (mds_info.empty()) {
      lderr(cct) << __func__ << ": no MDS daemons found" << dendl;
      lderr(cct) << "FSMap: " << *fsmap << dendl;
      return -CEPHFS_ENOENT;
    }

    for (const auto& [gid, info] : mds_info) {
      ldout(cct, 10) << __func__ << ": appending " << info.human_name() << " to targets" << dendl;
      targets->push_back(gid);
    }
    return 0;
  } else {
    // 4) It did not parse as an integer, it is not a wildcard, it must be a name
    const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
    if (mds_gid == 0) {
      lderr(cct) << __func__ << ": no MDS daemons found by name `" << mds_spec << "'" << dendl;
      lderr(cct) << "FSMap: " << *fsmap << dendl;
      return -CEPHFS_ENOENT;
    } else {
      auto& info = fsmap->get_info_gid(mds_gid);
      ldout(cct, 10) << __func__ << ": resolved name '" << mds_spec
		     << "' to " << info.human_name() << dendl;
      targets->push_back(mds_gid);
    }
    return 0;
  }
}
5926
5927
5928/**
5929 * Authenticate with mon and establish global ID
5930 */
5931int Client::authenticate()
5932{
9f95a23c 5933 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae
FG
5934
5935 if (monclient->is_authenticated()) {
5936 return 0;
5937 }
5938
9f95a23c 5939 client_lock.unlock();
7c673cae 5940 int r = monclient->authenticate(cct->_conf->client_mount_timeout);
9f95a23c 5941 client_lock.lock();
7c673cae
FG
5942 if (r < 0) {
5943 return r;
5944 }
5945
5946 whoami = monclient->get_global_id();
5947 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
5948
5949 return 0;
5950}
5951
// Fetch a current FSMap (or FSMapUser when `user` is true) from the
// monitors, blocking until our cached copy is at least as new as the
// monitors' latest version.  Returns 0 on success or a negative error.
int Client::fetch_fsmap(bool user)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  // Retrieve FSMap to enable looking up daemon addresses.  We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  bs::error_code ec;
  do {
    // get_version blocks; drop the client lock while waiting on the mon
    client_lock.unlock();
    std::tie(fsmap_latest, std::ignore) =
      monclient->get_version("fsmap", ca::use_blocked[ec]);
    client_lock.lock();
  } while (ec == bs::errc::resource_unavailable_try_again);

  if (ec) {
    lderr(cct) << "Failed to learn FSMap version: " << ec << dendl;
    return ceph::from_error_code(ec);
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // subscribe once and wait until the map we hold catches up
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap_user);
    ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap);
    ceph_assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
5996
/**
 *
 * @mds_spec one of ID, rank, GID, "*"
 *
 * Resolve the spec to one or more MDS daemons, skip laggy ones, and send
 * the command to each; `onfinish` fires once every target has replied
 * (see handle_command_reply()).  Returns 0 once all commands are sent.
 */
int Client::mds_command(
    const std::string &mds_spec,
    const vector<string>& cmd,
    const bufferlist& inbl,
    bufferlist *outbl,
    string *outs,
    Context *onfinish)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::unique_lock cl(client_lock);

  int r;
  r = authenticate();
  if (r < 0) {
    return r;
  }

  // need a full FSMap (not MDSMap) to address any daemon
  r = fetch_fsmap(false);
  if (r < 0) {
    return r;
  }

  // Look up MDS target(s) of the command
  std::vector<mds_gid_t> targets;
  r = resolve_mds(mds_spec, &targets);
  if (r < 0) {
    return r;
  }

  // If daemons are laggy, we won't send them commands.  If all
  // are laggy then we fail.
  std::vector<mds_gid_t> non_laggy;
  for (const auto& gid : targets) {
    const auto info = fsmap->get_info_gid(gid);
    if (!info.laggy()) {
      non_laggy.push_back(gid);
    }
  }
  if (non_laggy.size() == 0) {
    *outs = "All targeted MDS daemons are laggy";
    return -CEPHFS_ENOENT;
  }

  if (metadata.empty()) {
    // We are called on an unmounted client, so metadata
    // won't be initialized yet.
    populate_metadata("");
  }

  // Send commands to targets
  C_GatherBuilder gather(cct, onfinish);
  for (const auto& target_gid : non_laggy) {
    const auto info = fsmap->get_info_gid(target_gid);

    // Open a connection to the target MDS
    ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());

    // command_lock must not be taken while holding client_lock; drop
    // client_lock for the duration of the command-table update + send
    cl.unlock();
    {
      std::scoped_lock cmd_lock(command_lock);
      // Generate MDSCommandOp state
      auto &op = command_table.start_command();

      op.on_finish = gather.new_sub();
      op.cmd = cmd;
      op.outbl = outbl;
      op.outs = outs;
      op.inbl = inbl;
      op.mds_gid = target_gid;
      op.con = conn;

      ldout(cct, 4) << __func__ << ": new command op to " << target_gid
		    << " tid=" << op.tid << cmd << dendl;

      // Construct and send MCommand
      MessageRef m = op.get_message(monclient->get_fsid());
      conn->send_message2(std::move(m));
    }
    cl.lock();
  }
  gather.activate();

  return 0;
}
6089
11fdf7f2 6090void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
7c673cae
FG
6091{
6092 ceph_tid_t const tid = m->get_tid();
6093
6094 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
6095
f67539c2 6096 std::scoped_lock cmd_lock(command_lock);
7c673cae
FG
6097 if (!command_table.exists(tid)) {
6098 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
7c673cae
FG
6099 return;
6100 }
6101
6102 auto &op = command_table.get_command(tid);
6103 if (op.outbl) {
11fdf7f2 6104 *op.outbl = m->get_data();
7c673cae
FG
6105 }
6106 if (op.outs) {
6107 *op.outs = m->rs;
6108 }
6109
6110 if (op.on_finish) {
6111 op.on_finish->complete(m->r);
6112 }
6113
6114 command_table.erase(tid);
7c673cae
FG
6115}
6116
6117// -------------------
6118// MOUNT
6119
11fdf7f2 6120int Client::subscribe_mdsmap(const std::string &fs_name)
7c673cae 6121{
7c673cae
FG
6122 int r = authenticate();
6123 if (r < 0) {
6124 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
6125 return r;
6126 }
6127
11fdf7f2
TL
6128 std::string resolved_fs_name;
6129 if (fs_name.empty()) {
9f95a23c
TL
6130 resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
6131 if (resolved_fs_name.empty())
6132 // Try the backwards compatibility fs name option
6133 resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
11fdf7f2
TL
6134 } else {
6135 resolved_fs_name = fs_name;
6136 }
6137
7c673cae 6138 std::string want = "mdsmap";
11fdf7f2 6139 if (!resolved_fs_name.empty()) {
7c673cae
FG
6140 r = fetch_fsmap(true);
6141 if (r < 0)
6142 return r;
11fdf7f2
TL
6143 fscid = fsmap_user->get_fs_cid(resolved_fs_name);
6144 if (fscid == FS_CLUSTER_ID_NONE) {
f67539c2 6145 return -CEPHFS_ENOENT;
11fdf7f2 6146 }
7c673cae
FG
6147
6148 std::ostringstream oss;
11fdf7f2 6149 oss << want << "." << fscid;
7c673cae
FG
6150 want = oss.str();
6151 }
6152 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
6153
6154 monclient->sub_want(want, 0, 0);
6155 monclient->renew_subs();
6156
11fdf7f2
TL
6157 return 0;
6158}
6159
// Mount the filesystem: subscribe to the MDSMap, optionally wait for an
// available MDS cluster, then walk from the mount root up to "/" issuing
// GETATTRs so quota/ancestor state is primed.  Pins the root inode and
// transitions mount_state to CLIENT_MOUNTED.  Returns 0 on success.
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds, const std::string &fs_name)
{
  ceph_assert(is_initialized());

  /*
   * To make sure that the _unmount() must wait until the mount()
   * is done.
   */
  RWRef_t mref_writer(mount_state, CLIENT_MOUNTING, false);
  if (!mref_writer.is_first_writer()) // already mounting or mounted
    return 0;

  std::unique_lock cl(client_lock);

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  start_tick_thread(); // start tick thread

  if (require_mds) {
    // caller insists on an MDS being up before we report success
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  // GETATTR the mount point, then each ancestor up to the root, so that
  // quota information along the path is loaded
  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -CEPHFS_EACCES && root) {
	// mount point itself resolved; ancestors merely unreadable
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  ceph_assert(root);
  _ll_get(root.get());  // pin the root inode for the lifetime of the mount

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
  */

  mref_writer.update_state(CLIENT_MOUNTED);
  return 0;
}
6256
6257// UNMOUNT
6258
// Close every MDS session: prune REJECTED sessions outright, then send a
// close to each remaining session and wait (bounded by
// client_shutdown_timeout) for the MDSes to acknowledge; sessions that
// do not respond in time are force-closed with -CEPHFS_ETIMEDOUT.
void Client::_close_sessions()
{
  // REJECTED sessions will never acknowledge a close; drop them now
  for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
    if (it->second->state == MetaSession::STATE_REJECTED)
      mds_sessions.erase(it++);
    else
      ++it;
  }

  while (!mds_sessions.empty()) {
    // send session closes!
    for (auto &p : mds_sessions) {
      if (p.second->state != MetaSession::STATE_CLOSING) {
	_close_mds_session(p.second.get());
	mds_ranks_closing.insert(p.first);
      }
    }

    // wait for sessions to close
    double timo = cct->_conf.get_val<std::chrono::seconds>("client_shutdown_timeout").count();
    ldout(cct, 2) << "waiting for " << mds_ranks_closing.size() << " mds session(s) to close (timeout: "
		  << timo << "s)" << dendl;
    // adopt the already-held client_lock so we can cond-wait on it;
    // l.release() below hands ownership back without unlocking
    std::unique_lock l{client_lock, std::adopt_lock};
    if (!timo) {
      // timeout of 0 means wait forever
      mount_cond.wait(l);
    } else if (!mount_cond.wait_for(l, ceph::make_timespan(timo), [this] { return mds_ranks_closing.empty(); })) {
      ldout(cct, 1) << mds_ranks_closing.size() << " mds(s) did not respond to session close -- timing out." << dendl;
      while (!mds_ranks_closing.empty()) {
	auto session = mds_sessions.at(*mds_ranks_closing.begin());
	// this prunes entry from mds_sessions and mds_ranks_closing
	_closed_mds_session(session.get(), -CEPHFS_ETIMEDOUT);
      }
    }

    mds_ranks_closing.clear();
    l.release();
  }
}
6297
522d829b
TL
6298void Client::flush_mdlog_sync(Inode *in)
6299{
6300 if (in->unsafe_ops.empty()) {
6301 return;
6302 }
6303
6304 std::set<mds_rank_t> anchor;
6305 for (auto &&p : in->unsafe_ops) {
6306 anchor.emplace(p->mds);
6307 }
6308 if (in->auth_cap) {
6309 anchor.emplace(in->auth_cap->session->mds_num);
6310 }
6311
6312 for (auto &rank : anchor) {
6313 auto session = &mds_sessions.at(rank);
20effc67 6314 flush_mdlog(session->get());
522d829b
TL
6315 }
6316}
6317
31f18b77
FG
6318void Client::flush_mdlog_sync()
6319{
522d829b 6320 if (mds_requests.empty())
31f18b77 6321 return;
11fdf7f2 6322 for (auto &p : mds_sessions) {
20effc67 6323 flush_mdlog(p.second.get());
31f18b77
FG
6324 }
6325}
6326
6327void Client::flush_mdlog(MetaSession *session)
6328{
6329 // Only send this to Luminous or newer MDS daemons, older daemons
6330 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
6331 const uint64_t features = session->con->get_features();
6332 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
9f95a23c 6333 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
11fdf7f2 6334 session->con->send_message2(std::move(m));
31f18b77
FG
6335 }
6336}
6337
6338
11fdf7f2
TL
6339void Client::_abort_mds_sessions(int err)
6340{
6341 for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
6342 auto req = p->second;
6343 ++p;
6344 // unsafe requests will be removed during close session below.
6345 if (req->got_unsafe)
6346 continue;
6347
6348 req->abort(err);
6349 if (req->caller_cond) {
6350 req->kick = true;
9f95a23c 6351 req->caller_cond->notify_all();
11fdf7f2
TL
6352 }
6353 }
6354
6355 // Process aborts on any requests that were on this waitlist.
6356 // Any requests that were on a waiting_for_open session waitlist
6357 // will get kicked during close session below.
6358 signal_cond_list(waiting_for_mdsmap);
6359
6360 // Force-close all sessions
6361 while(!mds_sessions.empty()) {
20effc67
TL
6362 auto session = mds_sessions.begin()->second;
6363 _closed_mds_session(session.get(), err);
11fdf7f2
TL
6364 }
6365}
6366
// Tear down the mount.  With `abort` (or while blocklisted) we drop
// everything on the floor -- cancel requests, purge caches, discard
// dirty caps; otherwise we flush everything back to the cluster first.
// Order matters throughout: requests drain before inodes are released,
// caps flush before the cache is emptied, sessions close last.
void Client::_unmount(bool abort)
{
  /*
   * We are unmounting the client.
   *
   * Just declare the state to STATE_UNMOUNTING to block and fail
   * any new comming "reader" and then try to wait all the in-flight
   * "readers" to finish.
   */
  RWRef_t mref_writer(mount_state, CLIENT_UNMOUNTING, false);
  if (!mref_writer.is_first_writer())
    return;
  mref_writer.wait_readers_done();

  std::unique_lock lock{client_lock};

  if (abort || blocklisted) {
    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blocklisted)") << dendl;
  } else {
    ldout(cct, 2) << "unmounting" << dendl;
  }

  deleg_timeout = 0;

  if (abort) {
    mount_aborted = true;
    // Abort all mds sessions
    _abort_mds_sessions(-CEPHFS_ENOTCONN);

    objecter->op_cancel_writes(-CEPHFS_ENOTCONN);
  } else {
    // flush the mdlog for pending requests, if any
    flush_mdlog_sync();
  }

  // wait for all outstanding MDS requests to drain
  mount_cond.wait(lock, [this] {
    if (!mds_requests.empty()) {
      ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests"
		     << dendl;
    }
    return mds_requests.empty();
  });

  cwd.reset();
  root.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  // ditto for low-level (libcephfs) handles
  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  // and any directories left open
  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    std::list<InodeRef> anchor;
    for (auto& p : inode_map) {
      Inode *in = p.second;
      if (!in) {
	ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
	ceph_assert(in);
      }

      // prevent inode from getting freed
      anchor.emplace_back(in);

      if (abort || blocklisted) {
	// no point writing back -- just drop the cached data
	objectcacher->purge_set(&in->oset);
      } else if (!in->caps.empty()) {
	_release(in);
	_flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  if (abort || blocklisted) {
    // discard dirty caps instead of flushing them back to the MDS
    for (auto &q : mds_sessions) {
      auto s = q.second;
      for (auto p = s->dirty_list.begin(); !p.end(); ) {
	Inode *in = *p;
	++p;
	if (in->dirty_caps) {
	  ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
	  in->mark_caps_clean();
	  put_inode(in);
	}
      }
    }
  } else {
    flush_caps_sync();
    wait_sync_caps(last_flush_tid);
  }

  // empty lru cache
  trim_cache();

  delay_put_inodes();

  // wait until every cached inode is gone; dump the cache every 5s so a
  // stuck unmount leaves evidence in the logs
  while (lru.lru_get_size() > 0 ||
	 !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
	    << "+" << inode_map.size() << " items"
	    << ", waiting (for caps to release?)"
	    << dendl;

    if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
	r == std::cv_status::timeout) {
      dump_cache(NULL);
    }
  }
  ceph_assert(lru.lru_get_size() == 0);
  ceph_assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  // stop the tick thread
  tick_thread_stopped = true;
  upkeep_cond.notify_one();

  _close_sessions();

  mref_writer.update_state(CLIENT_UNMOUNTED);

  ldout(cct, 2) << "unmounted." << dendl;
}
6513
b32b8144
FG
// Public clean-unmount entry point: flush everything, then tear down.
void Client::unmount()
{
  _unmount(false);
}
6518
// Public abortive-unmount entry point: drop state without flushing
// (used when the connection to the cluster is unusable).
void Client::abort_conn()
{
  _unmount(true);
}
6523
7c673cae
FG
6524void Client::flush_cap_releases()
6525{
f67539c2
TL
6526 uint64_t nr_caps = 0;
6527
7c673cae 6528 // send any cap releases
11fdf7f2 6529 for (auto &p : mds_sessions) {
20effc67
TL
6530 auto session = p.second;
6531 if (session->release && mdsmap->is_clientreplay_or_active_or_stopping(
11fdf7f2 6532 p.first)) {
20effc67 6533 nr_caps += session->release->caps.size();
7c673cae
FG
6534 if (cct->_conf->client_inject_release_failure) {
6535 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
7c673cae 6536 } else {
20effc67 6537 session->con->send_message2(std::move(session->release));
7c673cae 6538 }
20effc67 6539 session->release.reset();
7c673cae
FG
6540 }
6541 }
f67539c2
TL
6542
6543 if (nr_caps > 0) {
6544 dec_pinned_icaps(nr_caps);
6545 }
7c673cae
FG
6546}
6547
f67539c2 6548void Client::renew_and_flush_cap_releases()
7c673cae 6549{
f67539c2
TL
6550 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6551
6552 if (!mount_aborted && mdsmap->get_epoch()) {
6553 // renew caps?
6554 utime_t el = ceph_clock_now() - last_cap_renew;
6555 if (unlikely(el > mdsmap->get_session_timeout() / 3.0))
6556 renew_caps();
6557
6558 flush_cap_releases();
7c673cae 6559 }
f67539c2
TL
6560}
6561
// Periodic upkeep, driven by the tick thread: times out a stuck mount,
// renews caps / flushes cap releases, processes delayed cap checks,
// sends metrics, trims caches, and auto-reconnects after a blocklist.
void Client::tick()
{
  ldout(cct, 20) << "tick" << dendl;

  utime_t now = ceph_clock_now();

  /*
   * If the mount() is not finished
   */
  if (is_mounting() && !mds_requests.empty()) {
    MetaRequest *req = mds_requests.begin()->second;

    // give up on the mount if its first request has been pending too long
    if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
      req->abort(-CEPHFS_ETIMEDOUT);
      if (req->caller_cond) {
	req->kick = true;
	req->caller_cond->notify_all();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (auto &p : mds_sessions) {
	signal_context_list(p.second->waiting_for_open);
      }
    }
  }

  renew_and_flush_cap_releases();

  // delayed caps: the list is time-ordered, so stop at the first inode
  // whose hold period has not yet expired
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    if (!mount_aborted && in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    if (!mount_aborted)
      check_caps(in, CHECK_CAPS_NODELAY);
  }

  if (!mount_aborted)
    collect_and_send_metrics();

  delay_put_inodes(is_unmounting());
  trim_cache(true);

  // blocklisted: optionally attempt an automatic reconnect, at most
  // once every 30 minutes
  if (blocklisted && (is_mounted() || is_unmounting()) &&
      last_auto_reconnect + 30 * 60 < now &&
      cct->_conf.get_val<bool>("client_reconnect_stale")) {
    messenger->client_reset();
    fd_gen++; // invalidate open files
    blocklisted = false;
    _kick_stale_sessions();
    last_auto_reconnect = now;
  }
}
6617
f67539c2
TL
// Spawn the background "upkeep" thread that periodically calls tick().
// The thread holds client_lock except while waiting on upkeep_cond, and
// exits once tick_thread_stopped is set; the condvar lets shutdown wake
// it early.
void Client::start_tick_thread()
{
  upkeeper = std::thread([this]() {
    using time = ceph::coarse_mono_time;
    using sec = std::chrono::seconds;

    auto last_tick = time::min();

    std::unique_lock cl(client_lock);
    while (!tick_thread_stopped) {
      auto now = clock::now();
      auto since = now - last_tick;

      // client_debug_inject_tick_delay lets tests stretch the period;
      // the effective interval is the larger of the two settings.
      auto t_interval = clock::duration(cct->_conf.get_val<sec>("client_tick_interval"));
      auto d_interval = clock::duration(cct->_conf.get_val<sec>("client_debug_inject_tick_delay"));

      auto interval = std::max(t_interval, d_interval);
      // Tick if at least ~90% of the interval has elapsed (tolerates
      // early/spurious wakeups); otherwise sleep only the remainder.
      if (likely(since >= interval*.90)) {
        tick();
        last_tick = clock::now();
      } else {
        interval -= since;
      }

      ldout(cct, 20) << "upkeep thread waiting interval " << interval << dendl;
      if (!tick_thread_stopped)
        upkeep_cond.wait_for(cl, interval);
    }
  });
}
6648
6649void Client::collect_and_send_metrics() {
6650 ldout(cct, 20) << __func__ << dendl;
6651
6652 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6653
6654 // right now, we only track and send global metrics. its sufficient
6655 // to send these metrics to MDS rank0.
6656 collect_and_send_global_metrics();
6657}
6658
6659void Client::collect_and_send_global_metrics() {
6660 ldout(cct, 20) << __func__ << dendl;
6661 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6662
6663 if (!have_open_session((mds_rank_t)0)) {
6664 ldout(cct, 5) << __func__ << ": no session with rank=0 -- not sending metric"
6665 << dendl;
6666 return;
6667 }
6668 auto session = _get_or_open_mds_session((mds_rank_t)0);
6669 if (!session->mds_features.test(CEPHFS_FEATURE_METRIC_COLLECT)) {
6670 ldout(cct, 5) << __func__ << ": rank=0 does not support metrics" << dendl;
6671 return;
6672 }
6673
6674 ClientMetricMessage metric;
6675 std::vector<ClientMetricMessage> message;
6676
6677 // read latency
33c7a0ef
TL
6678 if (_collect_and_send_global_metrics ||
6679 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_READ_LATENCY)) {
6680 metric = ClientMetricMessage(ReadLatencyPayload(logger->tget(l_c_read)));
6681 message.push_back(metric);
6682 }
f67539c2
TL
6683
6684 // write latency
33c7a0ef
TL
6685 if (_collect_and_send_global_metrics ||
6686 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_WRITE_LATENCY)) {
6687 metric = ClientMetricMessage(WriteLatencyPayload(logger->tget(l_c_wrlat)));
6688 message.push_back(metric);
6689 }
f67539c2
TL
6690
6691 // metadata latency
33c7a0ef
TL
6692 if (_collect_and_send_global_metrics ||
6693 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_METADATA_LATENCY)) {
6694 metric = ClientMetricMessage(MetadataLatencyPayload(logger->tget(l_c_lat)));
6695 message.push_back(metric);
6696 }
f67539c2
TL
6697
6698 // cap hit ratio -- nr_caps is unused right now
33c7a0ef
TL
6699 if (_collect_and_send_global_metrics ||
6700 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_CAP_INFO)) {
6701 auto [cap_hits, cap_misses] = get_cap_hit_rates();
6702 metric = ClientMetricMessage(CapInfoPayload(cap_hits, cap_misses, 0));
6703 message.push_back(metric);
6704 }
f67539c2
TL
6705
6706 // dentry lease hit ratio
33c7a0ef
TL
6707 if (_collect_and_send_global_metrics ||
6708 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_DENTRY_LEASE)) {
6709 auto [dlease_hits, dlease_misses, nr] = get_dlease_hit_rates();
6710 metric = ClientMetricMessage(DentryLeasePayload(dlease_hits, dlease_misses, nr));
6711 message.push_back(metric);
6712 }
f67539c2
TL
6713
6714 // opened files
33c7a0ef
TL
6715 if (_collect_and_send_global_metrics ||
6716 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_OPENED_FILES)) {
f67539c2
TL
6717 auto [opened_files, total_inodes] = get_opened_files_rates();
6718 metric = ClientMetricMessage(OpenedFilesPayload(opened_files, total_inodes));
33c7a0ef 6719 message.push_back(metric);
f67539c2 6720 }
f67539c2
TL
6721
6722 // pinned i_caps
33c7a0ef
TL
6723 if (_collect_and_send_global_metrics ||
6724 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_PINNED_ICAPS)) {
f67539c2
TL
6725 auto [pinned_icaps, total_inodes] = get_pinned_icaps_rates();
6726 metric = ClientMetricMessage(PinnedIcapsPayload(pinned_icaps, total_inodes));
33c7a0ef 6727 message.push_back(metric);
f67539c2 6728 }
f67539c2
TL
6729
6730 // opened inodes
33c7a0ef
TL
6731 if (_collect_and_send_global_metrics ||
6732 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_OPENED_INODES)) {
f67539c2
TL
6733 auto [opened_inodes, total_inodes] = get_opened_inodes_rates();
6734 metric = ClientMetricMessage(OpenedInodesPayload(opened_inodes, total_inodes));
33c7a0ef 6735 message.push_back(metric);
f67539c2 6736 }
f67539c2 6737
a4b75251 6738 // read io sizes
33c7a0ef
TL
6739 if (_collect_and_send_global_metrics ||
6740 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_READ_IO_SIZES)) {
6741 metric = ClientMetricMessage(ReadIoSizesPayload(total_read_ops,
6742 total_read_size));
6743 message.push_back(metric);
6744 }
a4b75251
TL
6745
6746 // write io sizes
33c7a0ef
TL
6747 if (_collect_and_send_global_metrics ||
6748 session->mds_metric_flags.test(CLIENT_METRIC_TYPE_WRITE_IO_SIZES)) {
6749 metric = ClientMetricMessage(WriteIoSizesPayload(total_write_ops,
6750 total_write_size));
6751 message.push_back(metric);
6752 }
a4b75251 6753
f67539c2
TL
6754 session->con->send_message2(make_message<MClientMetrics>(std::move(message)));
6755}
6756
7c673cae
FG
6757void Client::renew_caps()
6758{
6759 ldout(cct, 10) << "renew_caps()" << dendl;
6760 last_cap_renew = ceph_clock_now();
6761
11fdf7f2
TL
6762 for (auto &p : mds_sessions) {
6763 ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
6764 if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
20effc67 6765 renew_caps(p.second.get());
7c673cae
FG
6766 }
6767}
6768
6769void Client::renew_caps(MetaSession *session)
6770{
6771 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6772 session->last_cap_renew_request = ceph_clock_now();
6773 uint64_t seq = ++session->cap_renew_seq;
9f95a23c 6774 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
7c673cae
FG
6775}
6776
6777
6778// ===============================================================
6779// high level (POSIXy) interface
6780
6781int Client::_do_lookup(Inode *dir, const string& name, int mask,
6782 InodeRef *target, const UserPerm& perms)
6783{
6784 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6785 MetaRequest *req = new MetaRequest(op);
6786 filepath path;
6787 dir->make_nosnap_relative_path(path);
6788 path.push_dentry(name);
6789 req->set_filepath(path);
6790 req->set_inode(dir);
6791 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6792 mask |= DEBUG_GETATTR_CAPS;
6793 req->head.args.getattr.mask = mask;
6794
11fdf7f2 6795 ldout(cct, 10) << __func__ << " on " << path << dendl;
7c673cae
FG
6796
6797 int r = make_request(req, perms, target);
11fdf7f2 6798 ldout(cct, 10) << __func__ << " res is " << r << dendl;
7c673cae
FG
6799 return r;
6800}
6801
f67539c2
TL
6802bool Client::_dentry_valid(const Dentry *dn)
6803{
6804 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
6805
6806 // is dn lease valid?
6807 utime_t now = ceph_clock_now();
6808 if (dn->lease_mds >= 0 && dn->lease_ttl > now &&
6809 mds_sessions.count(dn->lease_mds)) {
20effc67
TL
6810 auto s = mds_sessions.at(dn->lease_mds);
6811 if (s->cap_ttl > now && s->cap_gen == dn->lease_gen) {
f67539c2
TL
6812 dlease_hit();
6813 return true;
6814 }
6815
20effc67 6816 ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
f67539c2
TL
6817 << " vs lease_gen " << dn->lease_gen << dendl;
6818 }
6819
6820 dlease_miss();
6821 return false;
6822}
6823
// Resolve one name 'dname' under 'dir', preferring the local dentry
// cache (dentry leases / FILE_SHARED dir caps) and falling back to an
// MDS lookup.  On success fills *target (and *alternate_name when
// requested).  Returns 0 or a negative CEPHFS error; can conclude
// ENOENT locally when the dir is I_COMPLETE.
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
                    const UserPerm& perms, std::string* alternate_name)
{
  int r = 0;
  Dentry *dn = NULL;
  // Set after the one allowed MDS round trip so the relookup pass
  // cannot loop forever.
  bool did_lookup_request = false;
  // can only request shared caps
  mask &= CEPH_CAP_ANY_SHARED | CEPH_STAT_RSTAT;

  if (dname == "..") {
    if (dir->dentries.empty()) {
      // No cached parent link: ask an arbitrary MDS for the parent.
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
      filepath path(dir->ino);
      req->set_filepath(path);

      InodeRef tmptarget;
      int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());

      if (r == 0) {
        *target = std::move(tmptarget);
        ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
      } else {
        // Fall back to the dir itself (e.g. ".." of the root).
        *target = dir;
      }
    }
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (!dir->is_dir()) {
    r = -CEPHFS_ENOTDIR;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -CEPHFS_ENAMETOOLONG;
    goto done;
  }

  // The virtual snapshot directory (default ".snap") is materialized
  // locally, never looked up on the MDS.
  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

relookup:
  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << __func__ << " have " << *dn << " from mds." << dn->lease_mds
                   << " ttl " << dn->lease_ttl << " seq " << dn->lease_seq << dendl;

    // The cached dentry is only usable if its inode (when present)
    // carries the caps the caller asked for.
    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      if (_dentry_valid(dn)) {
        // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
        // make trim_caps() behave.
        dir->try_touch_cap(dn->lease_mds);
        goto hit_dn;
      }
      // dir shared caps?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
        if (dn->cap_shared_gen == dir->shared_gen &&
            (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
          goto hit_dn;
        // A null cached dentry in a complete dir proves absence.
        if (!dn->inode && (dir->flags & I_COMPLETE)) {
          ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
                         << *dir << " dn '" << dname << "'" << dendl;
          return -CEPHFS_ENOENT;
        }
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
        (dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -CEPHFS_ENOENT;
    }
  }

  // Second pass after a successful MDS lookup: the cache may still not
  // hold the dentry; succeed with whatever *target the request filled.
  if (did_lookup_request) {
    r = 0;
    goto done;
  }
  r = _do_lookup(dir, dname, mask, target, perms);
  did_lookup_request = true;
  if (r == 0) {
    /* complete lookup to get dentry for alternate_name */
    goto relookup;
  } else {
    goto done;
  }

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
    if (alternate_name)
      *alternate_name = dn->alternate_name;
  } else {
    r = -CEPHFS_ENOENT;
  }
  touch_dn(dn);
  goto done;

 done:
  if (r < 0)
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
6944
6945int Client::get_or_create(Inode *dir, const char* name,
6946 Dentry **pdn, bool expect_null)
6947{
6948 // lookup
11fdf7f2 6949 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
7c673cae
FG
6950 dir->open_dir();
6951 if (dir->dir->dentries.count(name)) {
6952 Dentry *dn = dir->dir->dentries[name];
f67539c2
TL
6953 if (_dentry_valid(dn)) {
6954 if (expect_null)
6955 return -CEPHFS_EEXIST;
7c673cae
FG
6956 }
6957 *pdn = dn;
6958 } else {
6959 // otherwise link up a new one
6960 *pdn = link(dir->dir, name, NULL, NULL);
6961 }
6962
6963 // success
6964 return 0;
6965}
6966
f67539c2
TL
6967int Client::walk(std::string_view path, walk_dentry_result* wdr, const UserPerm& perms, bool followsym)
6968{
6969 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
6970 if (!mref_reader.is_state_satisfied())
6971 return -CEPHFS_ENOTCONN;
6972
6973 ldout(cct, 10) << __func__ << ": " << path << dendl;
6974
6975 std::scoped_lock lock(client_lock);
6976
6977 return path_walk(path, wdr, perms, followsym);
6978}
6979
7c673cae 6980int Client::path_walk(const filepath& origpath, InodeRef *end,
b3b6e05e 6981 const UserPerm& perms, bool followsym, int mask, InodeRef dirinode)
f67539c2
TL
6982{
6983 walk_dentry_result wdr;
b3b6e05e 6984 int rc = path_walk(origpath, &wdr, perms, followsym, mask, dirinode);
f67539c2
TL
6985 *end = std::move(wdr.in);
6986 return rc;
6987}
6988
b3b6e05e
TL
// Resolve 'origpath' component by component, starting from the root
// (absolute paths), from 'dirinode' when provided, or from the CWD.
// Directory symlinks are always followed; a trailing symlink only when
// 'followsym'.  At most MAXSYMLINKS expansions, else ELOOP.  On success
// fills *result with the final inode and the alternate_name of the last
// dentry looked up.
int Client::path_walk(const filepath& origpath, walk_dentry_result* result, const UserPerm& perms,
                      bool followsym, int mask, InodeRef dirinode)
{
  filepath path = origpath;
  InodeRef cur;
  std::string alternate_name;
  if (origpath.absolute())
    cur = root;
  else if (!dirinode)
    cur = cwd;
  else {
    cur = dirinode;
  }
  ceph_assert(cur);

  ldout(cct, 20) << __func__ << " cur=" << *cur << dendl;
  ldout(cct, 10) << __func__ << " " << path << dendl;

  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      // client-side permission enforcement: need search access on every
      // intermediate directory
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
        return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms, &alternate_name);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym. always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
        return -CEPHFS_ELOOP;
      }

      if (i < path.depth() - 1) {
        // dir symlink
        // replace consumed components of path with symlink dir target
        filepath resolved(next->symlink.c_str());
        resolved.append(path.postfixpath(i + 1));
        path = resolved;
        i = 0;
        // absolute target: restart resolution from the root
        if (next->symlink[0] == '/') {
          cur = root;
        }
        continue;
      } else if (followsym) {
        if (next->symlink[0] == '/') {
          path = next->symlink.c_str();
          i = 0;
          // reset position
          cur = root;
        } else {
          filepath more(next->symlink.c_str());
          // we need to remove the symlink component from off of the path
          // before adding the target that the symlink points to. remain
          // at the same position in the path.
          path.pop_dentry();
          path.append(more);
        }
        continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -CEPHFS_ENOENT;
  if (result) {
    result->in = std::move(cur);
    result->alternate_name = std::move(alternate_name);
  }
  return 0;
}
7077
7078
7079// namespace ops
7080
// Create a hard link at relpath pointing to the inode found at
// relexisting (both resolved relative to the CWD, following symlinks).
// With client_permissions enabled, hard-linking a directory is rejected
// with EPERM and may_hardlink/may_create checks are applied.
int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm, std::string alternate_name)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "link" << std::endl;
  tout(cct) << relexisting << std::endl;
  tout(cct) << relpath << std::endl;

  filepath existing(relexisting);

  InodeRef in, dir;

  std::scoped_lock lock(client_lock);
  int r = path_walk(existing, &in, perm, true);
  if (r < 0)
    return r;
  // the new link's name cannot be the root
  if (std::string(relpath) == "/") {
    r = -CEPHFS_EEXIST;
    return r;
  }
  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();

  // walk to the parent directory that will hold the new link
  r = path_walk(path, &dir, perm, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    if (S_ISDIR(in->mode)) {
      r = -CEPHFS_EPERM;
      return r;
    }
    r = may_hardlink(in.get(), perm);
    if (r < 0)
      return r;
    r = may_create(dir.get(), perm);
    if (r < 0)
      return r;
  }
  r = _link(in.get(), dir.get(), name.c_str(), perm, std::move(alternate_name));
  return r;
}
7125
7126int Client::unlink(const char *relpath, const UserPerm& perm)
b3b6e05e
TL
7127{
7128 return unlinkat(CEPHFS_AT_FDCWD, relpath, 0, perm);
7129}
7130
// Remove a name relative to the directory referred to by dirfd
// (CEPHFS_AT_FDCWD means the CWD).  With AT_REMOVEDIR in flags the
// target must be a directory (rmdir semantics); otherwise a regular
// unlink is performed.
int Client::unlinkat(int dirfd, const char *relpath, int flags, const UserPerm& perm)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << flags << std::endl;

  // The root cannot be removed: EBUSY for rmdir, EISDIR for unlink.
  if (std::string(relpath) == "/") {
    return flags & AT_REMOVEDIR ? -CEPHFS_EBUSY : -CEPHFS_EISDIR;
  }

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;

  std::scoped_lock lock(client_lock);

  InodeRef dirinode;
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  // walk to the parent directory of the target name
  r = path_walk(path, &dir, perm, true, 0, dirinode);
  if (r < 0) {
    return r;
  }
  if (cct->_conf->client_permissions) {
    r = may_delete(dir.get(), name.c_str(), perm);
    if (r < 0) {
      return r;
    }
  }
  if (flags & AT_REMOVEDIR) {
    r = _rmdir(dir.get(), name.c_str(), perm);
  } else {
    r = _unlink(dir.get(), name.c_str(), perm);
  }
  return r;
}
7177
// Rename relfrom to relto (both relative to the CWD).  Renaming the
// root in either direction is rejected with EBUSY.
int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm, std::string alternate_name)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relfrom << std::endl;
  tout(cct) << relto << std::endl;

  if (std::string(relfrom) == "/" || std::string(relto) == "/")
    return -CEPHFS_EBUSY;

  filepath from(relfrom);
  filepath to(relto);
  string fromname = from.last_dentry();
  from.pop_dentry();
  string toname = to.last_dentry();
  to.pop_dentry();

  InodeRef fromdir, todir;

  std::scoped_lock lock(client_lock);
  int r = path_walk(from, &fromdir, perm);
  if (r < 0)
    goto out;
  r = path_walk(to, &todir, perm);
  if (r < 0)
    goto out;

  if (cct->_conf->client_permissions) {
    // NOTE: this inner 'r' shadows the outer one and returns directly
    // rather than jumping to 'out'.
    int r = may_delete(fromdir.get(), fromname.c_str(), perm);
    if (r < 0)
      return r;
    // the destination may legitimately not exist yet, so ENOENT is ok
    r = may_delete(todir.get(), toname.c_str(), perm);
    if (r < 0 && r != -CEPHFS_ENOENT)
      return r;
  }
  r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm, std::move(alternate_name));
out:
  return r;
}
7220
7221// dirs
7222
f67539c2 7223int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm, std::string alternate_name)
b3b6e05e
TL
7224{
7225 return mkdirat(CEPHFS_AT_FDCWD, relpath, mode, perm, alternate_name);
7226}
7227
// Create a directory at relpath relative to dirfd (CEPHFS_AT_FDCWD for
// the CWD).  "/" fails with EEXIST.  When client_permissions is set,
// may_create() is checked against the parent first.
int Client::mkdirat(int dirfd, const char *relpath, mode_t mode, const UserPerm& perm,
                    std::string alternate_name)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;
  ldout(cct, 10) << __func__ << ": " << relpath << dendl;

  if (std::string(relpath) == "/") {
    return -CEPHFS_EEXIST;
  }

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;

  std::scoped_lock lock(client_lock);

  InodeRef dirinode;
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  // walk to the parent directory under which the new dir is created
  r = path_walk(path, &dir, perm, true, 0, dirinode);
  if (r < 0) {
    return r;
  }
  if (cct->_conf->client_permissions) {
    r = may_create(dir.get(), perm);
    if (r < 0) {
      return r;
    }
  }
  return _mkdir(dir.get(), name.c_str(), mode, perm, 0, {}, std::move(alternate_name));
}
7270
// Recursively create every missing component of relpath (like
// `mkdir -p`), relative to the CWD.  First walks the already-existing
// prefix, then creates each remaining component; an EEXIST on an
// intermediate component (e.g. a race with another client) is resolved
// by looking the component up instead.
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;

  std::scoped_lock lock(client_lock);
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
        break;
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  // Only ENOENT means "start creating here"; any other error (including
  // 0 == whole path already exists) is returned as-is.
  if (r!=-CEPHFS_ENOENT) return r;
  ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
        return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    // EEXIST on a non-final component: another client won the race --
    // look the component up so the walk can continue through it.
    if(-CEPHFS_EEXIST == r && i < path.depth() - 1) {
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << __func__ << ": successfully created directory "
                   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
7327
7328int Client::rmdir(const char *relpath, const UserPerm& perms)
7329{
b3b6e05e 7330 return unlinkat(CEPHFS_AT_FDCWD, relpath, AT_REMOVEDIR, perms);
7c673cae
FG
7331}
7332
7333int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
f67539c2
TL
7334{
7335 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7336 if (!mref_reader.is_state_satisfied())
7337 return -CEPHFS_ENOTCONN;
7338
11fdf7f2 7339 tout(cct) << __func__ << std::endl;
7c673cae
FG
7340 tout(cct) << relpath << std::endl;
7341 tout(cct) << mode << std::endl;
7342 tout(cct) << rdev << std::endl;
7343
7344 if (std::string(relpath) == "/")
f67539c2 7345 return -CEPHFS_EEXIST;
7c673cae
FG
7346
7347 filepath path(relpath);
7348 string name = path.last_dentry();
7349 path.pop_dentry();
7350 InodeRef dir;
f67539c2
TL
7351
7352 std::scoped_lock lock(client_lock);
7c673cae
FG
7353 int r = path_walk(path, &dir, perms);
7354 if (r < 0)
7355 return r;
7356 if (cct->_conf->client_permissions) {
7357 int r = may_create(dir.get(), perms);
7358 if (r < 0)
7359 return r;
7360 }
7361 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
7362}
7363
7364// symlinks
7365
f67539c2 7366int Client::symlink(const char *target, const char *relpath, const UserPerm& perms, std::string alternate_name)
b3b6e05e
TL
7367{
7368 return symlinkat(target, CEPHFS_AT_FDCWD, relpath, perms, alternate_name);
7369}
7370
// Create a symlink pointing at 'target', named by relpath relative to
// dirfd (CEPHFS_AT_FDCWD for the CWD).  "/" fails with EEXIST; with
// client_permissions enabled, may_create() is checked on the parent.
int Client::symlinkat(const char *target, int dirfd, const char *relpath, const UserPerm& perms,
                      std::string alternate_name)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << target << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;

  if (std::string(relpath) == "/") {
    return -CEPHFS_EEXIST;
  }

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;

  std::scoped_lock lock(client_lock);

  InodeRef dirinode;
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }
  // walk to the parent directory that will hold the new symlink
  r = path_walk(path, &dir, perms, true, 0, dirinode);
  if (r < 0) {
    return r;
  }
  if (cct->_conf->client_permissions) {
    int r = may_create(dir.get(), perms);
    if (r < 0) {
      return r;
    }
  }
  return _symlink(dir.get(), name.c_str(), target, perms, std::move(alternate_name));
}
7412
7413int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
7414{
b3b6e05e
TL
7415 return readlinkat(CEPHFS_AT_FDCWD, relpath, buf, size, perms);
7416}
7417
7418int Client::readlinkat(int dirfd, const char *relpath, char *buf, loff_t size, const UserPerm& perms) {
f67539c2 7419 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
b3b6e05e 7420 if (!mref_reader.is_state_satisfied()) {
f67539c2 7421 return -CEPHFS_ENOTCONN;
b3b6e05e 7422 }
f67539c2 7423
11fdf7f2 7424 tout(cct) << __func__ << std::endl;
b3b6e05e 7425 tout(cct) << dirfd << std::endl;
7c673cae
FG
7426 tout(cct) << relpath << std::endl;
7427
b3b6e05e 7428 InodeRef dirinode;
f67539c2 7429 std::scoped_lock lock(client_lock);
b3b6e05e
TL
7430 int r = get_fd_inode(dirfd, &dirinode);
7431 if (r < 0) {
7c673cae 7432 return r;
b3b6e05e
TL
7433 }
7434
7435 InodeRef in;
7436 filepath path(relpath);
7437 r = path_walk(path, &in, perms, false, 0, dirinode);
7438 if (r < 0) {
7439 return r;
7440 }
7c673cae
FG
7441
7442 return _readlink(in.get(), buf, size);
7443}
7444
7445int Client::_readlink(Inode *in, char *buf, size_t size)
7446{
7447 if (!in->is_symlink())
f67539c2 7448 return -CEPHFS_EINVAL;
7c673cae
FG
7449
7450 // copy into buf (at most size bytes)
7451 int r = in->symlink.length();
7452 if (r > (int)size)
7453 r = size;
7454 memcpy(buf, in->symlink.c_str(), r);
7455 return r;
7456}
7457
7458
7459// inode stuff
7460
7461int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
7462{
94b18763 7463 bool yes = in->caps_issued_mask(mask, true);
7c673cae 7464
11fdf7f2 7465 ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
7c673cae
FG
7466 if (yes && !force)
7467 return 0;
7468
7469 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
7470 filepath path;
7471 in->make_nosnap_relative_path(path);
7472 req->set_filepath(path);
7473 req->set_inode(in);
7474 req->head.args.getattr.mask = mask;
7475
7476 int res = make_request(req, perms);
11fdf7f2 7477 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7c673cae
FG
7478 return res;
7479}
7480
1d09f67e
TL
// Fetch a "vxattr" (virtual xattr computed by the MDS, e.g. ceph.*
// attributes) for 'in' by issuing CEPH_MDS_OP_GETVXATTR to 'rank'.
// Follows getxattr(2) conventions: with size == 0 only the value length
// is returned; a value longer than 'size' yields -ERANGE.
int Client::_getvxattr(
  Inode *in,
  const UserPerm& perms,
  const char *xattr_name,
  ssize_t size,
  void *value,
  mds_rank_t rank)
{
  // empty names and names longer than 255 bytes are treated as absent
  if (!xattr_name || strlen(xattr_name) <= 0 || strlen(xattr_name) > 255) {
    return -CEPHFS_ENODATA;
  }

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETVXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);
  req->set_string2(xattr_name);

  bufferlist bl;
  int res = make_request(req, perms, nullptr, nullptr, rank, &bl);
  ldout(cct, 10) << __func__ << " result=" << res << dendl;

  if (res < 0) {
    return res;
  }

  // the reply payload is a single encoded string
  std::string buf;
  auto p = bl.cbegin();

  DECODE_START(1, p);
  decode(buf, p);
  DECODE_FINISH(p);

  ssize_t len = buf.length();

  res = len; // refer to man getxattr(2) for output buffer size == 0

  if (size > 0) {
    if (len > size) {
      res = -CEPHFS_ERANGE; // insufficient output buffer space
    } else {
      memcpy(value, buf.c_str(), len);
    }
  }
  return res;
}
7528
7c673cae
FG
/*
 * Apply the CEPH_SETATTR_* bits in 'mask' from *stx to inode *in.
 *
 * For each attribute: if the client holds the needed exclusive cap
 * (Ax for ownership/mode/btime, Fx/Fw for size/times), the change is
 * made locally and the cap is dirtied; otherwise the value is staged in
 * 'args' and the corresponding shared caps are recorded in 'inode_drop'.
 * Any bits still set in 'mask' at the end are sent to the MDS in a
 * single CEPH_MDS_OP_SETATTR request.
 *
 * Returns 0 on a purely local change, otherwise the make_request()
 * result (negative CEPHFS_* on error); *inp receives the post-request
 * inode ref when the MDS round trip happens.
 */
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms, InodeRef *inp)
{
  int issued = in->caps_issued();
  union ceph_mds_request_args args;
  bool kill_sguid = false;
  int inode_drop = 0;

  ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
    ccap_string(issued) << dendl;

  // snapshots are immutable
  if (in->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  // reject a size increase that would blow the quota before doing anything
  if ((mask & CEPH_SETATTR_SIZE) &&
      (uint64_t)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (uint64_t)stx->stx_size - in->size,
			      perms)) {
    return -CEPHFS_EDQUOT;
  }

  memset(&args, 0, sizeof(args));

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
		   << " != cap dirtier " << in->cap_dirtier_uid << ":"
		   << in->cap_dirtier_gid << ", forcing sync setattr"
		   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    mask |= CEPH_SETATTR_CTIME;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    // dirty whichever exclusive cap we hold; if none, fall through to
    // a sync CTIME setattr below
    if (issued & CEPH_CAP_AUTH_EXCL)
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    // with Ax we clear setuid/setgid locally (see the S_ISUID squash below)
    kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);

    mask &= ~CEPH_SETATTR_KILL_SGUID;
  } else if (mask & CEPH_SETATTR_SIZE) {
    /* If we don't have Ax, then we must ask the server to clear them on truncate */
    mask |= CEPH_SETATTR_KILL_SGUID;
    inode_drop |= CEPH_CAP_AUTH_SHARED;
  }

  if (mask & CEPH_SETATTR_UID) {
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      // Ax held: apply locally and dirty the cap
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->uid != stx->stx_uid) {
      // value unknown or actually changing: let the MDS do it
      args.setattr.uid = stx->stx_uid;
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      // no-op change; drop the bit
      mask &= ~CEPH_SETATTR_UID;
    }
  }

  if (mask & CEPH_SETATTR_GID) {
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->gid != stx->stx_gid) {
      args.setattr.gid = stx->stx_gid;
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_GID;
    }
  }

  if (mask & CEPH_SETATTR_MODE) {
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      // only the permission bits (07777) come from the caller
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->mode != stx->stx_mode) {
      args.setattr.mode = stx->stx_mode;
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_MODE;
    }
  } else if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL) &&
	     kill_sguid && S_ISREG(in->mode) &&
	     (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
    /* Must squash the any setuid/setgid bits with an ownership change */
    in->mode &= ~(S_ISUID|S_ISGID);
    in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
  }

  if (mask & CEPH_SETATTR_BTIME) {
    ldout(cct,10) << "changing btime to " << in->btime << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
	       in->btime != utime_t(stx->stx_btime)) {
      args.setattr.btime = utime_t(stx->stx_btime);
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_BTIME;
    }
  }

  if (mask & CEPH_SETATTR_SIZE) {
    if ((uint64_t)stx->stx_size >= mdsmap->get_max_filesize()) {
      //too big!
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -CEPHFS_EFBIG;
    }

    ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    // growing (or no-op) with Fx and no pending sguid-kill can be local;
    // any shrink goes to the MDS so it can handle truncation
    if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL) &&
        !(mask & CEPH_SETATTR_KILL_SGUID) &&
        stx->stx_size >= in->size) {
      if (stx->stx_size > in->size) {
        in->size = in->reported_size = stx->stx_size;
        in->cap_dirtier_uid = perms.uid();
        in->cap_dirtier_gid = perms.gid();
        in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
        mask &= ~(CEPH_SETATTR_SIZE);
        mask |= CEPH_SETATTR_MTIME;
      } else {
        // ignore it when size doesn't change
        mask &= ~(CEPH_SETATTR_SIZE);
      }
    } else {
      args.setattr.size = stx->stx_size;
      inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
                    CEPH_CAP_FILE_WR;
    }
  }

  if (mask & CEPH_SETATTR_MTIME) {
    if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
      in->mtime = utime_t(stx->stx_mtime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~CEPH_SETATTR_MTIME;
    } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) &&
               utime_t(stx->stx_mtime) > in->mtime) {
      // with only Fw we may still move mtime forward locally
      in->mtime = utime_t(stx->stx_mtime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      mask &= ~CEPH_SETATTR_MTIME;
    } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
               in->mtime != utime_t(stx->stx_mtime)) {
      args.setattr.mtime = utime_t(stx->stx_mtime);
      inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
                    CEPH_CAP_FILE_WR;
    } else {
      mask &= ~CEPH_SETATTR_MTIME;
    }
  }

  if (mask & CEPH_SETATTR_ATIME) {
    if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
      in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~CEPH_SETATTR_ATIME;
    } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) &&
               utime_t(stx->stx_atime) > in->atime) {
      in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      mask &= ~CEPH_SETATTR_ATIME;
    } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
               in->atime != utime_t(stx->stx_atime)) {
      args.setattr.atime = utime_t(stx->stx_atime);
      // atime updates invalidate cached file data on other clients
      inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
                    CEPH_CAP_FILE_WR;
    } else {
      mask &= ~CEPH_SETATTR_ATIME;
    }
  }

  if (!mask) {
    // everything was satisfied locally
    in->change_attr++;
    return 0;
  }

  // remaining bits need an MDS round trip
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args = args;
  req->inode_drop = inode_drop;
  req->head.args.setattr.mask = mask;
  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
7787
/* Note that we only care about attrs that setattr cares about */
// Translate the portable subset of struct stat into a ceph_statx
// (size, mode, uid, gid, mtime, atime) for the _setattr() path.
void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
{
  stx->stx_size = st->st_size;
  stx->stx_mode = st->st_mode;
  stx->stx_uid = st->st_uid;
  stx->stx_gid = st->st_gid;
#ifdef __APPLE__
  // macOS names the timespec members st_mtimespec/st_atimespec
  stx->stx_mtime = st->st_mtimespec;
  stx->stx_atime = st->st_atimespec;
#elif __WIN32
  // Windows' struct stat carries whole seconds only
  stx->stx_mtime.tv_sec = st->st_mtime;
  stx->stx_atime.tv_sec = st->st_atime;
#else
  stx->stx_mtime = st->st_mtim;
  stx->stx_atime = st->st_atim;
#endif
}
7806
7807int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
7808 const UserPerm& perms, InodeRef *inp)
7809{
7810 int ret = _do_setattr(in, stx, mask, perms, inp);
7811 if (ret < 0)
7812 return ret;
7813 if (mask & CEPH_SETATTR_MODE)
7814 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
7815 return ret;
7816}
7817
// Clamp the caller-supplied mask to the setattr bits this client
// supports, run the optional permission check (client_permissions),
// then delegate to __setattrx() which also handles ACL-on-chmod.
int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
		      const UserPerm& perms)
{
  mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
	   CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
	   CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
	   CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
  if (cct->_conf->client_permissions) {
    int r = may_setattr(in.get(), stx, mask, perms);
    if (r < 0)
      return r;
  }
  return __setattrx(in.get(), stx, mask, perms);
}
7832
7833int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
7834 const UserPerm& perms)
7835{
7836 struct ceph_statx stx;
7837
7838 stat_to_statx(attr, &stx);
7839 mask &= ~CEPH_SETATTR_BTIME;
181888fb
FG
7840
7841 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
7842 mask &= ~CEPH_SETATTR_UID;
7843 }
7844 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
7845 mask &= ~CEPH_SETATTR_GID;
7846 }
7847
7c673cae
FG
7848 return _setattrx(in, &stx, mask, perms);
7849}
7850
// Public setattr(2)-style entry point: path-based, follows symlinks.
// Requires at least CLIENT_MOUNTING state; takes client_lock around the
// path walk and the attribute change.
int Client::setattr(const char *relpath, struct stat *attr, int mask,
		    const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mask << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  return _setattr(in, attr, mask, perms);
}
7871
// statx-flavored setattr entry point.  AT_SYMLINK_NOFOLLOW in 'flags'
// controls whether the final path component is dereferenced.
int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
		     const UserPerm& perms, int flags)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mask << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
  if (r < 0)
    return r;
  return _setattrx(in, stx, mask, perms);
}
7892
// fd-based setattr.  O_PATH handles reject attribute changes, mirroring
// the kernel's behavior for such descriptors.
int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mask << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  return _setattr(f->inode, attr, mask, perms);
}
7913
// fd-based statx-flavored setattr; same O_PATH restriction as fsetattr().
int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mask << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  return _setattrx(f->inode, stx, mask, perms);
}
7934
// stat(2)-style lookup: walk the path (following symlinks), refresh the
// inode attributes covered by 'mask' from the MDS via _getattr(), and
// fill *stbuf (and optionally *dirstat) from the cached inode.
int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
		 frag_info_t *dirstat, int mask)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  tout(cct) << "stat" << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms, true, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
7962
7963unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
7964{
7965 unsigned mask = 0;
7966
7967 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
7968 if (flags & AT_NO_ATTR_SYNC)
7969 goto out;
7970
7971 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
7972 mask |= CEPH_CAP_PIN;
7973 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7974 mask |= CEPH_CAP_AUTH_SHARED;
7975 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
7976 mask |= CEPH_CAP_LINK_SHARED;
adb31ebb 7977 if (want & (CEPH_STATX_NLINK|CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7c673cae
FG
7978 mask |= CEPH_CAP_FILE_SHARED;
7979 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
7980 mask |= CEPH_CAP_XATTR_SHARED;
7981out:
7982 return mask;
7983}
7984
// statx(2)-style entry point; delegates to statxat() relative to the
// current working directory.
int Client::statx(const char *relpath, struct ceph_statx *stx,
		  const UserPerm& perms,
		  unsigned int want, unsigned int flags)
{
  return statxat(CEPHFS_AT_FDCWD, relpath, stx, perms, want, flags);
}
7991
// lstat(2)-style lookup: identical to stat() except the final path
// component is not dereferenced if it is a symlink.
int Client::lstat(const char *relpath, struct stat *stbuf,
		  const UserPerm& perms, frag_info_t *dirstat, int mask)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
8020
// Populate a struct stat from the cached inode.  Directory sizes come
// from rstat.rbytes or dirstat.size() depending on the
// client_dirsize_rbytes config.  Returns the caps currently issued for
// the inode (in->caps_issued()).
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  // snapid doubles as the device number so snapshots stat distinctly
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    // synthesize directory link count from the subdir count
    switch (in->nlink) {
      case 0:
        st->st_nlink = 0; /* dir is unlinked */
        break;
      case 1:
        st->st_nlink = 1 /* parent dentry */
                       + 1 /* <dir>/. */
                       + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
        break;
      default:
        ceph_abort();   // a live dir inode always has nlink 0 or 1 here
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // report whichever of ctime/mtime is newer as ctime
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    if (cct->_conf->client_dirsize_rbytes)
      st->st_size = in->rstat.rbytes;
    else
      st->st_size = in->dirstat.size();
// The Windows "stat" structure provides just a subset of the fields that are
// available on Linux.
#ifndef _WIN32
    st->st_blocks = 1;
#endif
  } else {
    st->st_size = in->size;
#ifndef _WIN32
    // 512-byte blocks, rounded up
    st->st_blocks = (in->size + 511) >> 9;
#endif
  }
#ifndef _WIN32
  st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
#endif

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
8090
// Populate a ceph_statx from the cached inode.  'mask' is the set of
// CEPH_CAP_* shared caps known to be fresh; only fields covered by
// fresh caps are filled and flagged in stx->stx_mask.  A mask of 0
// (AT_NO_ATTR_SYNC) is treated as "fill everything from cache".
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
	   << " mode 0" << oct << in->mode << dec
	   << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  if (mask & CEPH_CAP_LINK_SHARED) {
    if (in->is_dir()) {
      // synthesize directory link count (same scheme as fill_stat)
      switch (in->nlink) {
        case 0:
          stx->stx_nlink = 0; /* dir is unlinked */
          break;
        case 1:
          stx->stx_nlink = 1 /* parent dentry */
                           + 1 /* <dir>/. */
                           + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
          break;
        default:
          ceph_abort();
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      if (cct->_conf->client_dirsize_rbytes)
	stx->stx_size = in->rstat.rbytes;
      else
	stx->stx_size = in->dirstat.size();
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
		      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
8173
// Mark a dentry as recently used in the client's dentry LRU.
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
8178
// chmod(2)-style entry point: path relative to the CWD, follows symlinks.
int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  return chmodat(CEPHFS_AT_FDCWD, relpath, mode, 0, perms);
}
8183
// fchmod(2)-style entry point.  O_PATH handles are rejected.
int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mode << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  // only st_mode is initialized; _setattr only reads fields selected by
  // the CEPH_SETATTR_MODE mask
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
}
8206
b3b6e05e
TL
// fchmodat(2)-style entry point: 'dirfd' anchors relative paths
// (CEPHFS_AT_FDCWD for the CWD); AT_SYMLINK_NOFOLLOW in 'flags'
// controls whether the final component is dereferenced.
int Client::chmodat(int dirfd, const char *relpath, mode_t mode, int flags,
                    const UserPerm& perms) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << flags << std::endl;

  filepath path(relpath);
  InodeRef in;
  InodeRef dirinode;

  std::scoped_lock lock(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
  if (r < 0) {
    return r;
  }
  // only st_mode is initialized; the CEPH_SETATTR_MODE mask limits what
  // _setattr reads
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
}
8238
b3b6e05e
TL
// lchmod-style entry point: like chmod() but does not follow a trailing
// symlink (AT_SYMLINK_NOFOLLOW).
int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  return chmodat(CEPHFS_AT_FDCWD, relpath, mode, AT_SYMLINK_NOFOLLOW, perms);
}
8243
7c673cae
FG
// chown(2)-style entry point: path relative to the CWD, follows symlinks.
int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
		  const UserPerm& perms)
{
  return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, 0, perms);
}
8249
// fchown(2)-style entry point.  A UID/GID of -1 means "leave unchanged"
// and is excluded from the setattr mask here.  O_PATH handles are
// rejected.
int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  int mask = 0;
  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
  return _setattr(f->inode, &attr, mask, perms);
}
8277
// lchown(2)-style entry point: like chown() but does not follow a
// trailing symlink (AT_SYMLINK_NOFOLLOW).
int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
		   const UserPerm& perms)
{
  return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, AT_SYMLINK_NOFOLLOW, perms);
}
8283
// fchownat(2)-style entry point: 'dirfd' anchors relative paths;
// AT_SYMLINK_NOFOLLOW controls symlink dereference.  Both UID and GID
// bits are passed to _setattr, which itself drops any bit whose value
// is -1 ("leave unchanged").
int Client::chownat(int dirfd, const char *relpath, uid_t new_uid, gid_t new_gid,
                    int flags, const UserPerm& perms) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;
  tout(cct) << flags << std::endl;

  filepath path(relpath);
  InodeRef in;
  InodeRef dirinode;

  std::scoped_lock lock(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
  if (r < 0) {
    return r;
  }
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
}
8317
11fdf7f2
TL
// Copy atime/mtime into a struct stat via the portable stat_set_*
// helpers.  Only the four time fields are written; the rest of *attr is
// untouched, which suits callers that pass only
// CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME to _setattr().
static void attr_set_atime_and_mtime(struct stat *attr,
                                     const utime_t &atime,
                                     const utime_t &mtime)
{
  stat_set_atime_sec(attr, atime.tv.tv_sec);
  stat_set_atime_nsec(attr, atime.tv.tv_nsec);
  stat_set_mtime_sec(attr, mtime.tv.tv_sec);
  stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
}
8327
8328// for [l]utime() invoke the timeval variant as the timespec
8329// variant are not yet implemented. for futime[s](), invoke
8330// the timespec variant.
7c673cae
FG
8331int Client::utime(const char *relpath, struct utimbuf *buf,
8332 const UserPerm& perms)
8333{
11fdf7f2
TL
8334 struct timeval tv[2];
8335 tv[0].tv_sec = buf->actime;
8336 tv[0].tv_usec = 0;
8337 tv[1].tv_sec = buf->modtime;
8338 tv[1].tv_usec = 0;
8339
8340 return utimes(relpath, tv, perms);
8341}
8342
8343int Client::lutime(const char *relpath, struct utimbuf *buf,
8344 const UserPerm& perms)
8345{
8346 struct timeval tv[2];
8347 tv[0].tv_sec = buf->actime;
8348 tv[0].tv_usec = 0;
8349 tv[1].tv_sec = buf->modtime;
8350 tv[1].tv_usec = 0;
8351
8352 return lutimes(relpath, tv, perms);
8353}
8354
8355int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
8356{
8357 struct timespec ts[2];
8358 ts[0].tv_sec = buf->actime;
8359 ts[0].tv_nsec = 0;
8360 ts[1].tv_sec = buf->modtime;
8361 ts[1].tv_nsec = 0;
8362
8363 return futimens(fd, ts, perms);
8364}
8365
// utimes(2)-style entry point: set atime (times[0]) and mtime
// (times[1]) on the path target, following symlinks.
int Client::utimes(const char *relpath, struct timeval times[2],
                   const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
            << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
            << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8394
11fdf7f2
TL
// lutimes(2)-style entry point: like utimes() but does not follow a
// trailing symlink (path_walk with followsym=false).
int Client::lutimes(const char *relpath, struct timeval times[2],
                    const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
            << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
            << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8423
11fdf7f2
TL
8424int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
8425{
8426 struct timespec ts[2];
8427 ts[0].tv_sec = times[0].tv_sec;
8428 ts[0].tv_nsec = times[0].tv_usec * 1000;
8429 ts[1].tv_sec = times[1].tv_sec;
8430 ts[1].tv_nsec = times[1].tv_usec * 1000;
8431
8432 return futimens(fd, ts, perms);
8433}
8434
// futimens(2)-style entry point: set atime/mtime with nanosecond
// resolution on an open fd.  O_PATH handles are rejected.
int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
            << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
            << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8463
b3b6e05e
TL
// utimensat(2)-style entry point: set atime/mtime with nanosecond
// resolution on a path anchored at 'dirfd'; AT_SYMLINK_NOFOLLOW
// controls symlink dereference.
int Client::utimensat(int dirfd, const char *relpath, struct timespec times[2], int flags,
                      const UserPerm& perms) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
            << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
            << std::endl;
  tout(cct) << flags << std::endl;

  filepath path(relpath);
  InodeRef in;
  InodeRef dirinode;

  std::scoped_lock lock(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

#if defined(__linux__) && defined(O_PATH)
  // NOTE(review): 'flags' carries AT_* flags here, yet this tests the
  // open(2) O_PATH bit against it; the fd-based variants test f->flags
  // instead.  Looks like a copy/paste of the fd-path check — confirm
  // the intended semantics before relying on it.
  if (flags & O_PATH) {
    return -CEPHFS_EBADF;
  }
#endif

  r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
  if (r < 0) {
    return r;
  }
  struct stat attr;
  utime_t atime(times[0]);
  utime_t mtime(times[1]);

  attr_set_atime_and_mtime(&attr, atime, mtime);
  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8507
7c673cae
FG
/**
 * Apply or remove an advisory flock(2)-style lock on an open fd.
 *
 * @param fd open cephfs file descriptor
 * @param operation LOCK_SH/LOCK_EX/LOCK_UN style operation word
 * @param owner opaque lock-owner identifier
 * @return 0 on success, negative CEPHFS error code on failure.
 */
int Client::flock(int fd, int operation, uint64_t owner)
{
  // Refuse unless the client is at least in the MOUNTING state.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  // Trace the call and its arguments for debugging/replay.
  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << operation << std::endl;
  tout(cct) << owner << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;

  // Delegate to the internal implementation under client_lock.
  return _flock(f, operation, owner);
}
8526
8527int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
8528{
f67539c2
TL
8529 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8530 if (!mref_reader.is_state_satisfied())
8531 return -CEPHFS_ENOTCONN;
8532
11fdf7f2 8533 tout(cct) << __func__ << std::endl;
7c673cae 8534 tout(cct) << relpath << std::endl;
181888fb 8535
7c673cae
FG
8536 filepath path(relpath);
8537 InodeRef in;
f67539c2
TL
8538
8539 std::scoped_lock lock(client_lock);
7c673cae
FG
8540 int r = path_walk(path, &in, perms, true);
8541 if (r < 0)
8542 return r;
8543 if (cct->_conf->client_permissions) {
8544 int r = may_open(in.get(), O_RDONLY, perms);
8545 if (r < 0)
8546 return r;
8547 }
8548 r = _opendir(in.get(), dirpp, perms);
8549 /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
f67539c2
TL
8550 if (r != -CEPHFS_ENOTDIR)
8551 tout(cct) << (uintptr_t)*dirpp << std::endl;
7c673cae
FG
8552 return r;
8553}
8554
b3b6e05e
TL
/**
 * Open a directory for iteration given an already-open directory fd
 * (fdopendir(3) analogue).
 *
 * @param dirfd open file descriptor referring to a directory
 * @param dirpp out: new dir handle; only valid when 0 is returned
 * @param perms credentials used for the permission check
 * @return 0 on success, negative CEPHFS error code on failure.
 */
int Client::fdopendir(int dirfd, dir_result_t **dirpp, const UserPerm &perms) {
  // Refuse unless the client is at least in the MOUNTING state.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;

  InodeRef dirinode;
  std::scoped_lock locker(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  if (cct->_conf->client_permissions) {
    // Opening a directory for reading requires O_RDONLY permission.
    r = may_open(dirinode.get(), O_RDONLY, perms);
    if (r < 0) {
      return r;
    }
  }
  r = _opendir(dirinode.get(), dirpp, perms);
  /* if ENOTDIR, dirpp will be an uninitialized point and it's very dangerous to access its value */
  if (r != -CEPHFS_ENOTDIR) {
    tout(cct) << (uintptr_t)*dirpp << std::endl;
  }
  return r;
}
8584
7c673cae
FG
8585int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
8586{
8587 if (!in->is_dir())
f67539c2 8588 return -CEPHFS_ENOTDIR;
7c673cae
FG
8589 *dirpp = new dir_result_t(in, perms);
8590 opened_dirs.insert(*dirpp);
11fdf7f2 8591 ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7c673cae
FG
8592 return 0;
8593}
8594
8595
/**
 * Close a dir handle previously returned by opendir()/fdopendir().
 * Always succeeds.
 *
 * @param dir handle to destroy; must not be used afterwards
 * @return 0.
 */
int Client::closedir(dir_result_t *dir)
{
  // Trace the call for debugging/replay.
  tout(cct) << __func__ << std::endl;
  tout(cct) << (uintptr_t)dir << std::endl;

  ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
  std::scoped_lock lock(client_lock);
  _closedir(dir);
  return 0;
}
8606
/**
 * Internal closedir: drop the handle's inode reference and buffered
 * readdir state, deregister it and free it.  Caller holds client_lock.
 */
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;

  if (dirp->inode) {
    ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
    // Release the InodeRef held by the handle.
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
8619
8620void Client::rewinddir(dir_result_t *dirp)
8621{
11fdf7f2 8622 ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;
181888fb 8623
f67539c2
TL
8624 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
8625 if (!mref_reader.is_state_satisfied())
181888fb
FG
8626 return;
8627
f67539c2 8628 std::scoped_lock lock(client_lock);
7c673cae
FG
8629 dir_result_t *d = static_cast<dir_result_t*>(dirp);
8630 _readdir_drop_dirp_buffer(d);
8631 d->reset();
8632}
8633
8634loff_t Client::telldir(dir_result_t *dirp)
8635{
8636 dir_result_t *d = static_cast<dir_result_t*>(dirp);
11fdf7f2 8637 ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
7c673cae
FG
8638 return d->offset;
8639}
8640
/**
 * Reposition a dir handle to an offset previously returned by telldir()
 * (seekdir(3) analogue).  Buffered entries are discarded when the new
 * offset cannot be served from the current buffer.
 */
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;

  // Refuse unless the client is at least in the MOUNTING state.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  std::scoped_lock lock(client_lock);

  if (offset == dirp->offset)
    return;

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // Hash-ordered listing: only a backward seek invalidates the buffer.
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // Frag-ordered listing: drop the buffer when seeking to the start,
    // to a different dirfrag, or backwards within the current frag.
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
8675
8676
8677//struct dirent {
8678// ino_t d_ino; /* inode number */
8679// off_t d_off; /* offset to the next dirent */
8680// unsigned short d_reclen; /* length of this record */
8681// unsigned char d_type; /* type of file */
8682// char d_name[256]; /* filename */
8683//};
/**
 * Populate a struct dirent from name/type/inode/offset.  At most 255
 * characters of the name are copied and the result is always
 * NUL-terminated.  Fields absent on some platforms (d_ino on
 * Cygwin/Windows, d_off on macOS/FreeBSD) are skipped.
 */
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#if !defined(__CYGWIN__) && !(defined(_WIN32))
  de->d_ino = ino;
#if !defined(__APPLE__) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  // NOTE(review): d_reclen is set to a constant 1, not the real record
  // length — presumably consumers here ignore it; confirm before relying on it.
  de->d_reclen = 1;
  de->d_type = IFTODT(type);
  ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
	   << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
8699
/**
 * Advance a dir handle past the current dirfrag.  If the current frag is
 * the rightmost one the handle is marked at-end; otherwise the offset is
 * moved to the start of the next frag.
 */
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    // Frag order: restart entry numbering at 2 within the new frag and
    // re-map it through the (possibly updated) dirfragtree.
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);
  }
}
8725
/**
 * Re-map the handle's current frag through the inode's dirfragtree.
 * If the tree now maps the value to a different frag (e.g. after a
 * split/merge), reposition the handle at the start of the mapped frag.
 * No-op for hash-ordered listings.
 */
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  ceph_assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
8742
/// Discard any dentries buffered on this dir handle (e.g. before a
/// re-fetch or when the handle is reset/closed).
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << " " << dirp << dendl;
  dirp->buffer.clear();
}
8748
/**
 * Fetch one dirfrag worth of entries from the MDS into the handle's
 * buffer (READDIR, or LSSNAP for the .snap pseudo-directory).
 *
 * On -CEPHFS_EAGAIN the frag is re-chosen through the dirfragtree and
 * the fetch is retried recursively.  Any other error marks the handle
 * at-end.
 *
 * @return 0 on success, negative CEPHFS error code on failure.
 */
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  ceph_assert(dirp);
  ceph_assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
		 << " offset " << hex << dirp->offset << dec << dendl;

  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  if (dirp->last_name.length()) {
    // Continue listing after the last name we already returned.
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -CEPHFS_EAGAIN) {
    // The MDS no longer serves this frag; re-map and retry.
    ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
		   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
8803
// Comparator for std::lower_bound over Dir::readdir_cache: orders cached
// dentries by readdir offset using dir_result_t::fpos_cmp.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
8809
/**
 * Serve readdir entries from the locally cached, complete-and-ordered
 * dentry list instead of asking the MDS.
 *
 * Invokes `cb` once per entry with client_lock temporarily dropped.
 * Returns -CEPHFS_EAGAIN when the cache can no longer be trusted (lost
 * completeness, or the cache vector changed under us while the lock was
 * released) so the caller falls back to an MDS readdir.
 *
 * @return 0 at end of directory, >0 if cb asked to stop early,
 *         negative CEPHFS error otherwise.
 */
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // Start from the first cached dentry at or after the current offset.
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    int mask = caps;
    // Completeness can be lost while the lock was dropped in cb().
    if (!dirp->inode->is_complete_and_ordered())
      return -CEPHFS_EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    // Remember the index: _getattr() may mutate readdir_cache, which
    // would invalidate the iterator.
    int idx = pd - dir->readdir_cache.begin();
    if (dn->inode->is_dir()) {
      mask |= CEPH_STAT_RSTAT;
    }
    int r = _getattr(dn->inode, mask, dirp->perms);
    if (r < 0)
      return r;

    // the content of readdir_cache may change after _getattr(), so pd may be invalid iterator
    pd = dir->readdir_cache.begin() + idx;
    if (pd >= dir->readdir_cache.end() || *pd != dn)
      return -CEPHFS_EAGAIN;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    if (getref) {
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    // The callback may block or re-enter the client; drop the lock.
    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer match cache index
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
8903
/**
 * Core readdir driver: deliver directory entries to `cb` one at a time.
 *
 * Synthesizes "." and ".." first, then serves entries either from the
 * local dentry cache (when the directory is complete/ordered and we hold
 * FILE_SHARED caps) or by fetching dirfrags from the MDS.  `cb` is always
 * invoked with client_lock dropped; a positive return from cb stops the
 * walk, a negative one aborts it.
 *
 * @param d dir handle
 * @param cb per-entry callback
 * @param p opaque cookie passed to cb
 * @param want/flags statx want/flags, converted to a caps mask
 * @param getref if true, each delivered Inode* carries an _ll_get ref
 * @return 0 at end of directory, >0 if cb stopped the walk,
 *         negative CEPHFS error otherwise.
 */
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  // Refuse unless the client is at least in the MOUNTING state.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::unique_lock cl(client_lock);

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // Offset 0: synthesize the "." entry for the directory itself.
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps | CEPH_STAT_RSTAT, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    // Callback runs without client_lock held.
    cl.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    cl.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // Offset 1: synthesize the ".." entry (the parent, or self at root).
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dentries.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps | CEPH_STAT_RSTAT, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    cl.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    cl.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    // -CEPHFS_EAGAIN means the cache could not serve us: fall through to
    // the MDS fetch loop below.
    if (err != -CEPHFS_EAGAIN)
      return err;
  }

  // MDS-backed loop: fetch dirfrags and feed buffered entries to cb.
  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	int mask = caps;
	if(entry.inode->is_dir()){
	  mask |= CEPH_STAT_RSTAT;
	}
	r = _getattr(entry.inode, mask, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      cl.unlock();
      r = cb(p, &de, &stx, next_off, inode); // _next_ offset
      cl.lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // Whole directory consumed: if nothing changed underneath us since we
    // started, mark the inode's dentry list complete (and ordered).
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();
  return 0;
}
9101
9102
9103int Client::readdir_r(dir_result_t *d, struct dirent *de)
9104{
9105 return readdirplus_r(d, de, 0, 0, 0, NULL);
9106}
9107
9108/*
9109 * readdirplus_r
9110 *
9111 * returns
9112 * 1 if we got a dirent
9113 * 0 for end of directory
9114 * <0 on error
9115 */
9116
/* Scratch state shared with _readdir_single_dirent_cb: holds exactly one
 * directory entry.  `full` flips to true once the slot is consumed. */
struct single_readdir {
  struct dirent *de;       // caller-owned destination dirent
  struct ceph_statx *stx;  // optional destination statx; may be NULL
  Inode *inode;            // inode delivered by the callback (if requested)
  bool full;               // true once one entry has been stored
};
9123
9124static int _readdir_single_dirent_cb(void *p, struct dirent *de,
9125 struct ceph_statx *stx, off_t off,
9126 Inode *in)
9127{
9128 single_readdir *c = static_cast<single_readdir *>(p);
9129
9130 if (c->full)
9131 return -1; // already filled this dirent
9132
9133 *c->de = *de;
9134 if (c->stx)
9135 *c->stx = *stx;
9136 c->inode = in;
9137 c->full = true;
9138 return 1;
9139}
9140
9141struct dirent *Client::readdir(dir_result_t *d)
9142{
9143 int ret;
f91f0fd5 9144 auto& de = d->de;
7c673cae
FG
9145 single_readdir sr;
9146 sr.de = &de;
9147 sr.stx = NULL;
9148 sr.inode = NULL;
9149 sr.full = false;
9150
9151 // our callback fills the dirent and sets sr.full=true on first
9152 // call, and returns -1 the second time around.
9153 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
9154 if (ret < -1) {
9155 errno = -ret; // this sucks.
9156 return (dirent *) NULL;
9157 }
9158 if (sr.full) {
9159 return &de;
9160 }
9161 return (dirent *) NULL;
9162}
9163
9164int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
9165 struct ceph_statx *stx, unsigned want,
9166 unsigned flags, Inode **out)
9167{
9168 single_readdir sr;
9169 sr.de = de;
9170 sr.stx = stx;
9171 sr.inode = NULL;
9172 sr.full = false;
9173
9174 // our callback fills the dirent and sets sr.full=true on first
9175 // call, and returns -1 the second time around.
9176 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
9177 if (r < -1)
9178 return r;
9179 if (out)
9180 *out = sr.inode;
9181 if (sr.full)
9182 return 1;
9183 return 0;
9184}
9185
9186
9187/* getdents */
/* getdents */
/* Accumulator for _getdents(): packs entries into a flat user buffer. */
struct getdents_result {
  char *buf;     // caller-owned destination buffer
  int buflen;    // total capacity of buf in bytes
  int pos;       // bytes written so far
  bool fullent;  // true: pack whole struct dirent; false: names only
};
9194
9195static int _readdir_getdent_cb(void *p, struct dirent *de,
9196 struct ceph_statx *stx, off_t off, Inode *in)
9197{
9198 struct getdents_result *c = static_cast<getdents_result *>(p);
9199
9200 int dlen;
9201 if (c->fullent)
9202 dlen = sizeof(*de);
9203 else
9204 dlen = strlen(de->d_name) + 1;
9205
9206 if (c->pos + dlen > c->buflen)
9207 return -1; // doesn't fit
9208
9209 if (c->fullent) {
9210 memcpy(c->buf + c->pos, de, sizeof(*de));
9211 } else {
9212 memcpy(c->buf + c->pos, de->d_name, dlen);
9213 }
9214 c->pos += dlen;
9215 return 0;
9216}
9217
9218int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
9219{
9220 getdents_result gr;
9221 gr.buf = buf;
9222 gr.buflen = buflen;
9223 gr.fullent = fullent;
9224 gr.pos = 0;
9225
9226 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
9227
9228 if (r < 0) { // some error
9229 if (r == -1) { // buffer ran out of space
9230 if (gr.pos) { // but we got some entries already!
9231 return gr.pos;
9232 } // or we need a larger buffer
f67539c2 9233 return -CEPHFS_ERANGE;
7c673cae
FG
9234 } else { // actual error, return it
9235 return r;
9236 }
9237 }
9238 return gr.pos;
9239}
9240
9241
9242/* getdir */
/* getdir */
/* Accumulator for getdir(): collects entry names and counts them. */
struct getdir_result {
  list<string> *contents;  // caller-owned list receiving each d_name
  int num;                 // number of names appended
};
9247
9248static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
9249{
9250 getdir_result *r = static_cast<getdir_result *>(p);
9251
9252 r->contents->push_back(de->d_name);
9253 r->num++;
9254 return 0;
9255}
9256
9257int Client::getdir(const char *relpath, list<string>& contents,
9258 const UserPerm& perms)
9259{
9260 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
f67539c2
TL
9261 tout(cct) << "getdir" << std::endl;
9262 tout(cct) << relpath << std::endl;
7c673cae
FG
9263
9264 dir_result_t *d;
9265 int r = opendir(relpath, &d, perms);
9266 if (r < 0)
9267 return r;
9268
9269 getdir_result gr;
9270 gr.contents = &contents;
9271 gr.num = 0;
9272 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
9273
9274 closedir(d);
9275
9276 if (r < 0)
9277 return r;
9278 return gr.num;
9279}
9280
9281
9282/****** file i/o **********/
f67539c2 9283
b3b6e05e 9284// common parts for open and openat. call with client_lock locked.
20effc67 9285int Client::create_and_open(int dirfd, const char *relpath, int flags,
b3b6e05e
TL
9286 const UserPerm& perms, mode_t mode, int stripe_unit,
9287 int stripe_count, int object_size, const char *data_pool,
9288 std::string alternate_name) {
9289 ceph_assert(ceph_mutex_is_locked(client_lock));
f91f0fd5 9290 int cflags = ceph_flags_sys2wire(flags);
f91f0fd5 9291 tout(cct) << cflags << std::endl;
7c673cae
FG
9292
9293 Fh *fh = NULL;
9294
9295#if defined(__linux__) && defined(O_PATH)
9296 /* When the O_PATH is being specified, others flags than O_DIRECTORY
9297 * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
9298 * in kernel (fs/open.c). */
9299 if (flags & O_PATH)
9300 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
9301#endif
9302
9303 filepath path(relpath);
9304 InodeRef in;
9305 bool created = false;
9306 /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
9307 bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
f91f0fd5
TL
9308 int mask = ceph_caps_for_mode(ceph_flags_to_mode(cflags));
9309
b3b6e05e 9310 InodeRef dirinode = nullptr;
20effc67
TL
9311 int r = get_fd_inode(dirfd, &dirinode);
9312 if (r < 0) {
9313 return r;
b3b6e05e 9314 }
7c673cae 9315
20effc67 9316 r = path_walk(path, &in, perms, followsym, mask, dirinode);
7c673cae 9317 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
f67539c2 9318 return -CEPHFS_EEXIST;
7c673cae
FG
9319
9320#if defined(__linux__) && defined(O_PATH)
9321 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
9322#else
b3b6e05e 9323 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
7c673cae 9324#endif
f67539c2 9325 return -CEPHFS_ELOOP;
7c673cae 9326
f67539c2 9327 if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
7c673cae
FG
9328 filepath dirpath = path;
9329 string dname = dirpath.last_dentry();
9330 dirpath.pop_dentry();
9331 InodeRef dir;
9332 r = path_walk(dirpath, &dir, perms, true,
b3b6e05e
TL
9333 cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0, dirinode);
9334 if (r < 0) {
7c673cae 9335 goto out;
b3b6e05e 9336 }
7c673cae
FG
9337 if (cct->_conf->client_permissions) {
9338 r = may_create(dir.get(), perms);
9339 if (r < 0)
b3b6e05e 9340 goto out;
7c673cae
FG
9341 }
9342 r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
f67539c2
TL
9343 stripe_count, object_size, data_pool, &created, perms,
9344 std::move(alternate_name));
7c673cae
FG
9345 }
9346 if (r < 0)
9347 goto out;
9348
9349 if (!created) {
9350 // posix says we can only check permissions of existing files
9351 if (cct->_conf->client_permissions) {
9352 r = may_open(in.get(), flags, perms);
9353 if (r < 0)
b3b6e05e 9354 goto out;
7c673cae
FG
9355 }
9356 }
9357
9358 if (!fh)
9359 r = _open(in.get(), flags, mode, &fh, perms);
9360 if (r >= 0) {
9361 // allocate a integer file descriptor
11fdf7f2 9362 ceph_assert(fh);
7c673cae 9363 r = get_fd();
11fdf7f2 9364 ceph_assert(fd_map.count(r) == 0);
7c673cae
FG
9365 fd_map[r] = fh;
9366 }
9367
9368 out:
b3b6e05e
TL
9369 return r;
9370}
9371
9372int Client::open(const char *relpath, int flags, const UserPerm& perms,
9373 mode_t mode, int stripe_unit, int stripe_count,
9374 int object_size, const char *data_pool, std::string alternate_name)
9375{
9376 return openat(CEPHFS_AT_FDCWD, relpath, flags, perms, mode, stripe_unit,
9377 stripe_count, object_size, data_pool, alternate_name);
9378}
9379
b3b6e05e
TL
9380int Client::openat(int dirfd, const char *relpath, int flags, const UserPerm& perms,
9381 mode_t mode, int stripe_unit, int stripe_count, int object_size,
9382 const char *data_pool, std::string alternate_name) {
9383 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9384 if (!mref_reader.is_state_satisfied()) {
9385 return -CEPHFS_ENOTCONN;
9386 }
9387
9388 ldout(cct, 3) << "openat enter(" << relpath << ")" << dendl;
9389 tout(cct) << dirfd << std::endl;
9390 tout(cct) << relpath << std::endl;
9391 tout(cct) << flags << std::endl;
9392 tout(cct) << mode << std::endl;
9393
9394 std::scoped_lock locker(client_lock);
9395 int r = create_and_open(dirfd, relpath, flags, perms, mode, stripe_unit, stripe_count,
9396 object_size, data_pool, alternate_name);
9397
7c673cae 9398 tout(cct) << r << std::endl;
b3b6e05e 9399 ldout(cct, 3) << "openat exit(" << relpath << ")" << dendl;
7c673cae
FG
9400 return r;
9401}
9402
7c673cae
FG
/**
 * Ask an MDS to look up an inode by (ino, parent dir, name-hash).
 * The name is hashed with rjenkins and sent as a synthetic dentry under
 * the parent directory path.
 *
 * @return 0 on success, negative CEPHFS error code on failure.
 */
int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
			const UserPerm& perms)
{
  ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;

  // Refuse unless the client is at least in the MOUNTING state.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
  filepath path(ino);
  req->set_filepath(path);

  // Encode the rjenkins hash of the name as a decimal dentry component.
  uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
  char f[30];
  sprintf(f, "%u", h);
  filepath path2(dirino);
  path2.push_dentry(string(f));
  req->set_filepath2(path2);

  // Target MDS is chosen at random among the active ones.
  int r = make_request(req, perms, NULL, NULL,
		       rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
  return r;
}
9429
9430
9431/**
9432 * Load inode into local cache.
9433 *
9434 * If inode pointer is non-NULL, and take a reference on
9435 * the resulting Inode object in one operation, so that caller
9436 * can safely assume inode will still be there after return.
9437 */
f67539c2 9438int Client::_lookup_vino(vinodeno_t vino, const UserPerm& perms, Inode **inode)
7c673cae 9439{
f67539c2 9440 ldout(cct, 8) << __func__ << " enter(" << vino << ")" << dendl;
7c673cae 9441
f67539c2
TL
9442 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9443 if (!mref_reader.is_state_satisfied())
9444 return -CEPHFS_ENOTCONN;
181888fb 9445
b3b6e05e
TL
9446 if (is_reserved_vino(vino))
9447 return -CEPHFS_ESTALE;
9448
7c673cae 9449 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
f67539c2 9450 filepath path(vino.ino);
7c673cae
FG
9451 req->set_filepath(path);
9452
f67539c2
TL
9453 /*
9454 * The MDS expects either a "real" snapid here or 0. The special value
9455 * carveouts for the snapid are all at the end of the range so we can
9456 * just look for any snapid below this value.
9457 */
9458 if (vino.snapid < CEPH_NOSNAP)
9459 req->head.args.lookupino.snapid = vino.snapid;
9460
7c673cae
FG
9461 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
9462 if (r == 0 && inode != NULL) {
7c673cae 9463 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11fdf7f2 9464 ceph_assert(p != inode_map.end());
7c673cae
FG
9465 *inode = p->second;
9466 _ll_get(*inode);
9467 }
f67539c2 9468 ldout(cct, 8) << __func__ << " exit(" << vino << ") = " << r << dendl;
7c673cae
FG
9469 return r;
9470}
9471
1adf2230
AA
9472int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
9473{
f67539c2
TL
9474 vinodeno_t vino(ino, CEPH_NOSNAP);
9475 std::scoped_lock lock(client_lock);
9476 return _lookup_vino(vino, perms, inode);
1adf2230 9477}
7c673cae
FG
9478
9479/**
9480 * Find the parent inode of `ino` and insert it into
9481 * our cache. Conditionally also set `parent` to a referenced
9482 * Inode* if caller provides non-NULL value.
9483 */
/**
 * Find the parent inode of `ino` and insert it into
 * our cache. Conditionally also set `parent` to a referenced
 * Inode* if caller provides non-NULL value.
 */
int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
{
  ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
  filepath path(ino->ino);
  req->set_filepath(path);

  InodeRef target;
  // Target MDS is chosen at random among the active ones.
  int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
  // Give caller a reference to the parent ino if they provided a pointer.
  if (parent != NULL) {
    if (r == 0) {
      *parent = target.get();
      // Reference keeps the parent pinned for the caller.
      _ll_get(*parent);
      ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
    } else {
      *parent = NULL;
    }
  }
  ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
9507
7c673cae
FG
9508/**
9509 * Populate the parent dentry for `ino`, provided it is
9510 * a child of `parent`.
9511 */
// Ask an MDS (via CEPH_MDS_OP_LOOKUPNAME) to populate the dentry linking
// `ino` under directory `parent`.  Requires the client to be at least in
// the MOUNTING state; returns -CEPHFS_ENOTCONN otherwise.
int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
{
  ceph_assert(parent->is_dir());
  ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;

  // Guard against use while unmounted/unmounting.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
  req->set_filepath2(filepath(parent->ino));  // secondary path: the parent dir
  req->set_filepath(filepath(ino->ino));      // primary path: the child inode
  req->set_inode(ino);

  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
  ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
  return r;
}
9530
1adf2230
AA
9531int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
9532{
f67539c2 9533 std::scoped_lock lock(client_lock);
1adf2230
AA
9534 return _lookup_name(ino, parent, perms);
9535}
7c673cae 9536
// Allocate and initialize a new file handle (Fh) for an already-opened
// inode.  Snapshot inodes get an extra snap_cap_refs pin.  Readahead
// parameters are derived from client_readahead_* config and the file
// layout.  Caller owns the returned Fh.
Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
{
  ceph_assert(in);
  Fh *f = new Fh(in, flags, cmode, fd_gen, perms);

  ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;

  if (in->snapid != CEPH_NOSNAP) {
    // Snapshots are immutable; track opens via snap_cap_refs instead of caps.
    in->snap_cap_refs++;
    ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
	<< ccap_string(in->caps_issued()) << dendl;
  }

  const auto& conf = cct->_conf;
  f->readahead.set_trigger_requests(1);
  f->readahead.set_min_readahead_size(conf->client_readahead_min);
  // Max readahead is the tighter of the byte limit and the period limit
  // (both optional; 0 disables the corresponding cap).
  uint64_t max_readahead = Readahead::NO_LIMIT;
  if (conf->client_readahead_max_bytes) {
    max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
  }
  if (conf->client_readahead_max_periods) {
    max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
  }
  f->readahead.set_max_readahead_size(max_readahead);
  // Align readahead to the stripe period and stripe unit of the file layout.
  vector<uint64_t> alignments;
  alignments.push_back(in->layout.get_period());
  alignments.push_back(in->layout.stripe_unit);
  f->readahead.set_alignments(alignments);

  return f;
}
9568
// Tear down a file handle: drop delegations, release the open ref (flushing
// dirty data and rechecking caps for head inodes), release any file locks,
// and surface any asynchronous flush error to the caller.  Returns 0 or the
// stashed async error code.
int Client::_release_fh(Fh *f)
{
  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
  Inode *in = f->inode.get();
  ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;

  // Drop any delegation held through this handle.
  in->unset_deleg(f);

  if (in->snapid == CEPH_NOSNAP) {
    // Head inode: if this was the last open ref for this mode, flush dirty
    // data and let check_caps() release caps we no longer want.
    if (in->put_open_ref(f->mode)) {
      _flush(in, new C_Client_FlushComplete(this, in));
      check_caps(in, 0);
    }
  } else {
    // Snapshot inode: just drop the snap pin taken in _create_fh().
    ceph_assert(in->snap_cap_refs > 0);
    in->snap_cap_refs--;
  }

  _release_filelocks(f);

  // Finally, read any async err (i.e. from flushes)
  int err = f->take_async_err();
  if (err != 0) {
    ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
		  << cpp_strerror(err) << dendl;
  } else {
    ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
  }

  _put_fh(f);

  return err;
}
9603
9604void Client::_put_fh(Fh *f)
9605{
9606 int left = f->put();
9607 if (!left) {
9608 delete f;
9609 }
9610}
9611
// Core open path (client_lock held by caller).  For snapshot inodes any
// write-ish flag is rejected with -CEPHFS_EROFS.  If we already hold the
// caps the open mode wants (and no O_TRUNC), the open is satisfied locally;
// otherwise a CEPH_MDS_OP_OPEN request is sent.  On success a new Fh is
// created via _create_fh() when `fhp` is non-NULL.
int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
		  const UserPerm& perms)
{
  if (in->snapid != CEPH_NOSNAP &&
      (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
    return -CEPHFS_EROFS;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);
  int want = ceph_caps_for_mode(cmode);
  int result = 0;

  in->get_open_ref(cmode);  // make note of pending open, since it effects _wanted_ caps.

  if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
    // Caps already cover this open mode; no MDS round trip needed.
    // update wanted?
    check_caps(in, CHECK_CAPS_NODELAY);
  } else {

    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
    filepath path;
    in->make_nosnap_relative_path(path);
    req->set_filepath(path);
    req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
    req->head.args.open.mode = mode;
    req->head.args.open.pool = -1;
    if (cct->_conf->client_debug_getattr_caps)
      req->head.args.open.mask = DEBUG_GETATTR_CAPS;
    else
      req->head.args.open.mask = 0;
    req->head.args.open.old_size = in->size;   // for O_TRUNC
    req->set_inode(in);
    result = make_request(req, perms);

    /*
     * NFS expects that delegations will be broken on a conflicting open,
     * not just when there is actual conflicting access to the file. SMB leases
     * and oplocks also have similar semantics.
     *
     * Ensure that clients that have delegations enabled will wait on minimal
     * caps during open, just to ensure that other clients holding delegations
     * return theirs first.
     */
    if (deleg_timeout && result == 0) {
      int need = 0, have;

      if (cmode & CEPH_FILE_MODE_WR)
	need |= CEPH_CAP_FILE_WR;
      if (cmode & CEPH_FILE_MODE_RD)
	need |= CEPH_CAP_FILE_RD;

      // Temporary stack Fh used only to wait for the minimal caps.
      Fh fh(in, flags, cmode, fd_gen, perms);
      result = get_caps(&fh, need, want, &have, -1);
      if (result < 0) {
	ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
	  " . Denying open: " <<
	  cpp_strerror(result) << dendl;
      } else {
	put_cap_ref(in, need);
      }
    }
  }

  // success?
  if (result >= 0) {
    if (fhp)
      *fhp = _create_fh(in, flags, cmode, perms);
  } else {
    // Undo the open ref taken above on failure.
    in->put_open_ref(cmode);
  }

  trim_cache();

  return result;
}
9692
// Re-establish caps for an inode after e.g. session recovery.  If we still
// hold any caps and either want no write caps or already have an auth cap,
// a check_caps() round suffices; otherwise re-open the file on the MDS with
// flags derived from the wanted caps.
int Client::_renew_caps(Inode *in)
{
  int wanted = in->caps_file_wanted();
  if (in->is_any_caps() &&
      ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
    check_caps(in, CHECK_CAPS_NODELAY);
    return 0;
  }

  // Translate the wanted cap bits back into POSIX open flags.
  int flags = 0;
  if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
    flags = O_RDWR;
  else if (wanted & CEPH_CAP_FILE_RD)
    flags = O_RDONLY;
  else if (wanted & CEPH_CAP_FILE_WR)
    flags = O_WRONLY;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->head.args.open.flags = flags;
  req->head.args.open.pool = -1;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->set_inode(in);

  // duplicate in case Cap goes away; not sure if that race is a concern?
  const UserPerm *pperm = in->get_best_perms();
  UserPerm perms;
  if (pperm != NULL)
    perms = *pperm;
  int ret = make_request(req, perms);
  return ret;
}
9730
// Close an fd (client_lock held by caller): release the handle, remove the
// fd from the fd map, and return the fd number to the allocator.  Returns
// any error surfaced by _release_fh() (e.g. async flush errors).
int Client::_close(int fd)
{
  ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
  tout(cct) << "close" << std::endl;
  tout(cct) << fd << std::endl;

  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -CEPHFS_EBADF;
  int err = _release_fh(fh);
  fd_map.erase(fd);
  put_fd(fd);   // make the fd number reusable
  ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
  return err;
}
9746
b3b6e05e
TL
9747int Client::close(int fd) {
9748 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9749 if (!mref_reader.is_state_satisfied())
9750 return -CEPHFS_ENOTCONN;
9751
9752 std::scoped_lock lock(client_lock);
9753 return _close(fd);
9754}
7c673cae
FG
9755
9756// ------------
9757// read, write
9758
// Public lseek(): validates mount state and the fd, rejects O_PATH handles
// (on Linux), then defers to _lseek() under the client lock.
loff_t Client::lseek(int fd, loff_t offset, int whence)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "lseek" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << offset << std::endl;
  tout(cct) << whence << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH descriptors do not permit I/O-style operations.
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  return _lseek(f, offset, whence);
}
9780
// Compute and store the new file position for `whence`/`offset`.
// SEEK_END, SEEK_DATA and SEEK_HOLE need an up-to-date file size, so a
// CEPH_STAT_CAP_SIZE getattr is done first for those.  SEEK_DATA/SEEK_HOLE
// are only handled when the platform defines them, and files are treated
// as fully-data (no holes): DATA returns `offset`, HOLE returns EOF.
// Returns the new position, or a negative error code.
loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
{
  Inode *in = f->inode.get();
  bool whence_check = false;
  loff_t pos = -1;

  // First pass: decide whether this whence needs a fresh size.
  switch (whence) {
  case SEEK_END:
    whence_check = true;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    whence_check = true;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    whence_check = true;
    break;
#endif
  }

  if (whence_check) {
    int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
  }

  // Second pass: compute the target position.
  switch (whence) {
  case SEEK_SET:
    pos = offset;
    break;

  case SEEK_CUR:
    pos = f->pos + offset;
    break;

  case SEEK_END:
    pos = in->size + offset;
    break;

#ifdef SEEK_DATA
  case SEEK_DATA:
    // Past-EOF (or negative) offsets yield ENXIO, per lseek(2).
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -CEPHFS_ENXIO;
    pos = offset;
    break;
#endif

#ifdef SEEK_HOLE
  case SEEK_HOLE:
    if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
      return -CEPHFS_ENXIO;
    pos = in->size;
    break;
#endif

  default:
    ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
    return -CEPHFS_EINVAL;
  }

  // A computed negative position (e.g. SEEK_CUR beyond start) is invalid.
  if (pos < 0) {
    return -CEPHFS_EINVAL;
  } else {
    f->pos = pos;
  }

  ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
  return f->pos;
}
9854
9855
// Acquire the per-handle position lock.  Waiters queue in FIFO order on
// f->pos_waiters; a waiter proceeds only when the lock is free AND it is
// at the front of the queue.  client_lock must be held; it is temporarily
// adopted by a unique_lock for the condvar wait and released back (via
// l.release()) so ownership stays with the caller.
void Client::lock_fh_pos(Fh *f)
{
  ldout(cct, 10) << __func__ << " " << f << dendl;

  if (f->pos_locked || !f->pos_waiters.empty()) {
    ceph::condition_variable cond;
    f->pos_waiters.push_back(&cond);
    ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
    // Adopt the already-held client_lock for the duration of the wait.
    std::unique_lock l{client_lock, std::adopt_lock};
    cond.wait(l, [f, me=&cond] {
      return !f->pos_locked && f->pos_waiters.front() == me;
    });
    l.release();  // give ownership back to the caller without unlocking
    ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
    ceph_assert(f->pos_waiters.front() == &cond);
    f->pos_waiters.pop_front();
  }

  f->pos_locked = true;
}
9876
// Release the per-handle position lock and wake the oldest waiter (FIFO
// hand-off matching lock_fh_pos()).  client_lock must be held.
void Client::unlock_fh_pos(Fh *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 10) << __func__ << " " << f << dendl;
  f->pos_locked = false;
  if (!f->pos_waiters.empty()) {
    // only wake up the oldest waiter
    auto cond = f->pos_waiters.front();
    cond->notify_one();
  }
}
9889
// Migrate MDS-held inline data out to the first RADOS object of the file.
// Two mutations are issued: (1) create the object if absent, (2) guarded by
// a cmpxattr on "inline_version" (only proceed if the stored version is
// older), write the inline bytes at offset 0 and record the new version.
// `onfinish` completes when the second mutation finishes (immediately with
// 0 if there is no inline data).  Always returns 0.
int Client::uninline_data(Inode *in, Context *onfinish)
{
  if (!in->inline_data.length()) {
    onfinish->complete(0);
    return 0;
  }

  // Object name of the file's first stripe object: "<ino-hex>.00000000".
  char oid_buf[32];
  snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
  object_t oid = oid_buf;

  ObjectOperation create_ops;
  create_ops.create(false);   // create-if-absent (no exclusive flag)

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   create_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   NULL);

  bufferlist inline_version_bl;
  encode(in->inline_version, inline_version_bl);

  ObjectOperation uninline_ops;
  // Guard: only uninline if our version is newer than what the object has.
  uninline_ops.cmpxattr("inline_version",
			CEPH_OSD_CMPXATTR_OP_GT,
			CEPH_OSD_CMPXATTR_MODE_U64,
			inline_version_bl);
  bufferlist inline_data = in->inline_data;
  uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
  uninline_ops.setxattr("inline_version", stringify(in->inline_version));

  objecter->mutate(oid,
		   OSDMap::file_to_object_locator(in->layout),
		   uninline_ops,
		   in->snaprealm->get_snap_context(),
		   ceph::real_clock::now(),
		   0,
		   onfinish);

  return 0;
}
9934
9935//
9936
9937// blocking osd interface
9938
// Public read(): read up to `size` bytes at `offset` into `buf`.  The size
// is clamped to INT_MAX since the return type is int.  The data is copied
// into the user buffer AFTER dropping client_lock, so the (potentially
// large) memcpy does not block other client operations.  Returns bytes
// read or a negative error code.
int Client::read(int fd, char *buf, loff_t size, loff_t offset)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "read" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  std::unique_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  bufferlist bl;
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _read(f, offset, size, &bl);
  ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
  if (r >= 0) {
    // Copy out without holding the client lock.
    lock.unlock();
    bl.begin().copy(bl.length(), buf);
    r = bl.length();
  }
  return r;
}
9970
9971int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
9972{
9973 if (iovcnt < 0)
f67539c2 9974 return -CEPHFS_EINVAL;
7c673cae
FG
9975 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
9976}
9977
// Core read path (client_lock held).  Handles: implicit-position reads
// (offset < 0 uses and advances f->pos under the fh position lock), inline
// data (served from memory or migrated out via uninline_data when the
// CACHE cap is absent), cached reads through the object cacher, and
// synchronous reads with an EOF-recheck/retry loop.  Uses goto labels:
// `retry` re-acquires caps after a short read, `success` finalizes
// position/latency accounting, `done` performs common cleanup (uninline
// wait, cap ref release, fh pos unlock).  Returns bytes read or a negative
// error code.
int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  int want, have = 0;
  bool movepos = false;
  std::unique_ptr<C_SaferCond> onuninline;
  int64_t rc = 0;
  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();
  utime_t lat;
  utime_t start = ceph_clock_now();

  if ((f->mode & CEPH_FILE_MODE_RD) == 0)
    return -CEPHFS_EBADF;
  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  // offset < 0 means "read at current position"; serialize pos updates.
  if (offset < 0) {
    lock_fh_pos(f);
    offset = f->pos;
    movepos = true;
  }
  loff_t start_pos = offset;

  // inline_version == 0 means we don't yet know the inline state; fetch it.
  if (in->inline_version == 0) {
    auto r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0) {
      rc = r;
      goto done;
    }
    ceph_assert(in->inline_version > 0);
  }

retry:
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_CACHE;
  {
    auto r = get_caps(f, CEPH_CAP_FILE_RD, want, &have, -1);
    if (r < 0) {
      rc = r;
      goto done;
    }
  }
  // O_DIRECT bypasses the cache even if we hold cache caps.
  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (!(have & CEPH_CAP_FILE_CACHE)) {
      // Can't serve inline data without the cache cap; push it to RADOS.
      onuninline.reset(new C_SaferCond("Client::_read_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // Serve directly from the in-memory inline data, zero-filling the
      // gap between inline length and EOF.
      uint32_t len = in->inline_data.length();
      uint64_t endoff = offset + size;
      if (endoff > in->size)
	endoff = in->size;

      if (offset < len) {
	if (endoff <= len) {
	  bl->substr_of(in->inline_data, offset, endoff - offset);
	} else {
	  bl->substr_of(in->inline_data, offset, len - offset);
	  bl->append_zero(endoff - len);
	}
	rc = endoff - offset;
      } else if ((uint64_t)offset < endoff) {
	bl->append_zero(endoff - offset);
	rc = endoff - offset;
      } else {
	rc = 0;
      }
      goto success;
    }
  }

  if (!conf->client_debug_force_sync_read &&
      conf->client_oc &&
      (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {

    // O_RSYNC: flush dirty data in range before reading through the cache.
    if (f->flags & O_RSYNC) {
      _flush_range(in, offset, size);
    }
    rc = _read_async(f, offset, size, bl);
    if (rc < 0)
      goto done;
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    bool checkeof = false;
    rc = _read_sync(f, offset, size, bl, &checkeof);
    if (rc < 0)
      goto done;
    if (checkeof) {
      // Short read: drop caps, re-verify the size, and retry if the file
      // turns out to extend past what we read.
      offset += rc;
      size -= rc;

      put_cap_ref(in, CEPH_CAP_FILE_RD);
      have = 0;
      // reverify size
      {
	auto r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
	if (r < 0) {
	  rc = r;
	  goto done;
	}
      }

      // eof?  short read.
      if ((uint64_t)offset < in->size)
	goto retry;
    }
  }

success:
  ceph_assert(rc >= 0);
  update_read_io_size(bl->length());
  if (movepos) {
    // adjust fd pos
    f->pos = start_pos + rc;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_read, lat);

done:
  // done!

  if (onuninline) {
    // Wait for the uninline mutation outside the client lock.
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();
    if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      rc = ret;
  }
  if (have) {
    put_cap_ref(in, CEPH_CAP_FILE_RD);
  }
  if (movepos) {
    unlock_fh_pos(f);
  }
  return rc;
}
10128
// Readahead completion context: pins the file handle and marks one
// readahead request pending for its lifetime.
Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
    client(c), f(f) {
  f->get();
  f->readahead.inc_pending();
}
10134
// Undo the ctor: clear the pending marker and drop the Fh pin.
Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}
10139
// Completion callback for a background readahead: release the cap refs
// taken when the readahead was issued and account the bytes read.
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
  if (r > 0) {
    client->update_read_io_size(r);
  }
}
10147
// Read through the object cacher (client_lock held).  The requested range
// is trimmed to the current file size.  If the cacher cannot satisfy the
// read immediately (returns 0), we block on a C_SaferCond with the client
// lock dropped.  Afterwards a follow-on readahead may be issued; its cap
// refs are released by C_Readahead::finish().  Returns bytes read or a
// negative error code.
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
			      off, len, bl, 0, &onfinish);
  if (r == 0) {
    // Cache miss: wait for the OSD read with the client lock dropped.
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
    update_read_io_size(bl->length());
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
		     << " (caller wants " << off << "~" << len << ")" << dendl;
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
				       readahead_extent.first, readahead_extent.second,
				       NULL, 0, onfinish2);
      if (r2 == 0) {
	// Readahead in flight: hold cap refs until C_Readahead::finish().
	ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
	get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
	ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
	delete onfinish2;
      }
    }
  }

  return r;
}
10205
// Synchronous (uncached) read via the Filer (client_lock held).  Loops
// issuing read_trunc() for the remaining range; the wait_and_copy lambda
// (run with client_lock dropped) waits for each OSD reply, appends the
// data, and zero-fills up to the known EOF on a short read.  On a short
// read that may indicate a stale size, *checkeof is set so the caller can
// re-verify and retry.  Returns total bytes read or a negative error.
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
		       bool *checkeof)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // 0 success, 1 continue and < 0 error happen.
  // NOTE: runs without client_lock held; mutates read/pos/left/bl captured
  // by reference.
  auto wait_and_copy = [&](C_SaferCond &onfinish, bufferlist &tbl, int wanted) {
    int r = onfinish.wait();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -CEPHFS_ENOENT)
      r = 0;
    if (r < 0)
      return r;

    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
	// zero up to known EOF
	int64_t some = in->size - pos;
	if (some > left)
	  some = left;
	auto z = buffer::ptr_node::create(some);
	z->zero();
	bl->push_back(std::move(z));
	read += some;
	pos += some;
	left -= some;
	if (left == 0)
	  return 0;
      }

      *checkeof = true;
      return 0;
    }
    return 1;
  };

  while (left > 0) {
    C_SaferCond onfinish("Client::_read_sync flock");
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
		      pos, left, &tbl, 0,
		      in->truncate_size, in->truncate_seq,
		      &onfinish);
    // Wait for the OSD read without holding the client lock.
    client_lock.unlock();
    int r = wait_and_copy(onfinish, tbl, wanted);
    client_lock.lock();
    if (!r)
      return read;
    if (r < 0)
      return r;
  }
  return read;
}
10278
// Public write(): write `size` bytes from `buf` at `offset`.  Size is
// clamped to INT_MAX because the return type is int.  Returns bytes
// written or a negative error code.
int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "write" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << size << std::endl;
  tout(cct) << offset << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *fh = get_filehandle(fd);
  if (!fh)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH descriptors cannot be written.
  if (fh->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  /* We can't return bytes written larger than INT_MAX, clamp size to that */
  size = std::min(size, (loff_t)INT_MAX);
  int r = _write(fh, offset, size, buf, NULL, false);
  ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
  return r;
}
10304
10305int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
10306{
10307 if (iovcnt < 0)
f67539c2 10308 return -CEPHFS_EINVAL;
7c673cae
FG
10309 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
10310}
10311
// Shared implementation for preadv/pwritev (client_lock held).  Sums the
// iovec lengths, optionally clamps the total to INT_MAX (for APIs that
// return 32-bit sizes), then dispatches to _write() or _read().  On the
// read path the data is scattered into the iovecs with client_lock
// dropped, stopping early if the bufferlist holds fewer bytes than the
// iovecs can take.  Returns bytes transferred or a negative error code.
int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
				       unsigned iovcnt, int64_t offset,
				       bool write, bool clamp_to_int)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }

  /*
   * Some of the API functions take 64-bit size values, but only return
   * 32-bit signed integers. Clamp the I/O sizes in those functions so that
   * we don't do I/Os larger than the values we can return.
   */
  if (clamp_to_int) {
    totallen = std::min(totallen, (loff_t)INT_MAX);
  }
  if (write) {
    int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int64_t r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // Scatter into the user's iovecs without holding the client lock.
    client_lock.unlock();
    auto iter = bl.cbegin();
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist
       * does not have enough data to fill in the iov
       */
      const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
      iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
      resid -= round_size;
      /* iter is self-updating */
    }
    client_lock.lock();
    return r;
  }
}
10362
11fdf7f2
TL
10363int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
10364{
f67539c2
TL
10365 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10366 if (!mref_reader.is_state_satisfied())
10367 return -CEPHFS_ENOTCONN;
10368
11fdf7f2
TL
10369 tout(cct) << fd << std::endl;
10370 tout(cct) << offset << std::endl;
10371
20effc67 10372 std::scoped_lock cl(client_lock);
11fdf7f2
TL
10373 Fh *fh = get_filehandle(fd);
10374 if (!fh)
f67539c2 10375 return -CEPHFS_EBADF;
20effc67 10376 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
11fdf7f2
TL
10377}
10378
10379int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
10380 const struct iovec *iov, int iovcnt)
7c673cae 10381{
f67539c2
TL
10382 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10383
f64942e4
AA
10384 uint64_t fpos = 0;
10385
7c673cae 10386 if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
f67539c2 10387 return -CEPHFS_EFBIG;
7c673cae
FG
10388
10389 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
10390 Inode *in = f->inode.get();
10391
10392 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
f67539c2 10393 return -CEPHFS_ENOSPC;
7c673cae
FG
10394 }
10395
11fdf7f2 10396 ceph_assert(in->snapid == CEPH_NOSNAP);
7c673cae
FG
10397
10398 // was Fh opened as writeable?
10399 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
f67539c2 10400 return -CEPHFS_EBADF;
7c673cae 10401
7c673cae
FG
10402 // use/adjust fd pos?
10403 if (offset < 0) {
10404 lock_fh_pos(f);
10405 /*
10406 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
10407 * change out from under us.
10408 */
10409 if (f->flags & O_APPEND) {
9f95a23c 10410 auto r = _lseek(f, 0, SEEK_END);
7c673cae
FG
10411 if (r < 0) {
10412 unlock_fh_pos(f);
10413 return r;
10414 }
10415 }
10416 offset = f->pos;
f64942e4 10417 fpos = offset+size;
7c673cae
FG
10418 unlock_fh_pos(f);
10419 }
10420
11fdf7f2
TL
10421 // check quota
10422 uint64_t endoff = offset + size;
10423 if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
10424 f->actor_perms)) {
f67539c2 10425 return -CEPHFS_EDQUOT;
11fdf7f2
TL
10426 }
10427
7c673cae
FG
10428 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
10429
10430 ldout(cct, 10) << "cur file size is " << in->size << dendl;
10431
10432 // time it.
10433 utime_t start = ceph_clock_now();
10434
10435 if (in->inline_version == 0) {
10436 int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
10437 if (r < 0)
10438 return r;
11fdf7f2 10439 ceph_assert(in->inline_version > 0);
7c673cae
FG
10440 }
10441
10442 // copy into fresh buffer (since our write may be resub, async)
10443 bufferlist bl;
10444 if (buf) {
10445 if (size > 0)
10446 bl.append(buf, size);
10447 } else if (iov){
10448 for (int i = 0; i < iovcnt; i++) {
10449 if (iov[i].iov_len > 0) {
10450 bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
10451 }
10452 }
10453 }
10454
10455 utime_t lat;
10456 uint64_t totalwritten;
11fdf7f2
TL
10457 int want, have;
10458 if (f->mode & CEPH_FILE_MODE_LAZY)
10459 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
10460 else
10461 want = CEPH_CAP_FILE_BUFFER;
f6b5b4d7 10462 int r = get_caps(f, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
7c673cae
FG
10463 if (r < 0)
10464 return r;
10465
10466 /* clear the setuid/setgid bits, if any */
181888fb 10467 if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
7c673cae
FG
10468 struct ceph_statx stx = { 0 };
10469
10470 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
10471 r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
10472 if (r < 0)
10473 return r;
10474 } else {
10475 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
10476 }
10477
10478 if (f->flags & O_DIRECT)
11fdf7f2 10479 have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);
7c673cae
FG
10480
10481 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
10482
11fdf7f2
TL
10483 std::unique_ptr<C_SaferCond> onuninline = nullptr;
10484
7c673cae
FG
10485 if (in->inline_version < CEPH_INLINE_NONE) {
10486 if (endoff > cct->_conf->client_max_inline_size ||
10487 endoff > CEPH_INLINE_MAX_SIZE ||
10488 !(have & CEPH_CAP_FILE_BUFFER)) {
11fdf7f2
TL
10489 onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
10490 uninline_data(in, onuninline.get());
7c673cae
FG
10491 } else {
10492 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10493
10494 uint32_t len = in->inline_data.length();
10495
10496 if (endoff < len)
9f95a23c 10497 in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX
7c673cae
FG
10498
10499 if (offset < len)
10500 in->inline_data.splice(offset, len - offset);
10501 else if (offset > len)
10502 in->inline_data.append_zero(offset - len);
10503
10504 in->inline_data.append(bl);
10505 in->inline_version++;
10506
10507 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10508
10509 goto success;
10510 }
10511 }
10512
11fdf7f2
TL
10513 if (cct->_conf->client_oc &&
10514 (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
7c673cae
FG
10515 // do buffered write
10516 if (!in->oset.dirty_or_tx)
10517 get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
10518
10519 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10520
10521 // async, caching, non-blocking.
10522 r = objectcacher->file_write(&in->oset, &in->layout,
10523 in->snaprealm->get_snap_context(),
10524 offset, size, bl, ceph::real_clock::now(),
10525 0);
10526 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
10527
10528 if (r < 0)
10529 goto done;
10530
10531 // flush cached write if O_SYNC is set on file fh
10532 // O_DSYNC == O_SYNC on linux < 2.6.33
10533 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
10534 if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
10535 _flush_range(in, offset, size);
10536 }
10537 } else {
10538 if (f->flags & O_DIRECT)
10539 _flush_range(in, offset, size);
10540
10541 // simple, non-atomic sync write
11fdf7f2 10542 C_SaferCond onfinish("Client::_write flock");
f67539c2 10543 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
7c673cae
FG
10544
10545 filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
10546 offset, size, bl, ceph::real_clock::now(), 0,
10547 in->truncate_size, in->truncate_seq,
11fdf7f2 10548 &onfinish);
9f95a23c 10549 client_lock.unlock();
f6b5b4d7 10550 r = onfinish.wait();
9f95a23c 10551 client_lock.lock();
f67539c2 10552 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
f6b5b4d7
TL
10553 if (r < 0)
10554 goto done;
7c673cae
FG
10555 }
10556
10557 // if we get here, write was successful, update client metadata
10558success:
a4b75251 10559 update_write_io_size(size);
7c673cae
FG
10560 // time
10561 lat = ceph_clock_now();
10562 lat -= start;
10563 logger->tinc(l_c_wrlat, lat);
10564
f64942e4
AA
10565 if (fpos) {
10566 lock_fh_pos(f);
10567 f->pos = fpos;
10568 unlock_fh_pos(f);
10569 }
7c673cae 10570 totalwritten = size;
11fdf7f2 10571 r = (int64_t)totalwritten;
7c673cae
FG
10572
10573 // extend file?
10574 if (totalwritten + offset > in->size) {
10575 in->size = totalwritten + offset;
28e407b8 10576 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae 10577
11fdf7f2 10578 if (is_quota_bytes_approaching(in, f->actor_perms)) {
7c673cae 10579 check_caps(in, CHECK_CAPS_NODELAY);
31f18b77
FG
10580 } else if (is_max_size_approaching(in)) {
10581 check_caps(in, 0);
7c673cae
FG
10582 }
10583
10584 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
10585 } else {
10586 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
10587 }
10588
10589 // mtime
91327a77 10590 in->mtime = in->ctime = ceph_clock_now();
7c673cae 10591 in->change_attr++;
28e407b8 10592 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
10593
10594done:
10595
11fdf7f2 10596 if (nullptr != onuninline) {
9f95a23c 10597 client_lock.unlock();
11fdf7f2 10598 int uninline_ret = onuninline->wait();
9f95a23c 10599 client_lock.lock();
7c673cae 10600
f67539c2 10601 if (uninline_ret >= 0 || uninline_ret == -CEPHFS_ECANCELED) {
7c673cae
FG
10602 in->inline_data.clear();
10603 in->inline_version = CEPH_INLINE_NONE;
28e407b8 10604 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
10605 check_caps(in, 0);
10606 } else
10607 r = uninline_ret;
10608 }
10609
10610 put_cap_ref(in, CEPH_CAP_FILE_WR);
10611 return r;
10612}
10613
10614int Client::_flush(Fh *f)
10615{
10616 Inode *in = f->inode.get();
10617 int err = f->take_async_err();
10618 if (err != 0) {
10619 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
10620 << cpp_strerror(err) << dendl;
10621 } else {
10622 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
10623 }
10624
10625 return err;
10626}
10627
int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
{
  // Path-based truncate, implemented as a size-only setattrx().  Only
  // stx_size needs to be populated: the CEPH_SETATTR_SIZE mask tells
  // setattrx() to ignore every other (uninitialized) field of stx.
  struct ceph_statx stx;
  stx.stx_size = length;
  return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
}
10634
int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
{
  // Truncate (or extend) an open file descriptor to 'length' bytes.
  // Returns 0 on success or a negative CEPHFS_* error code.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << length << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles permit no data operations.
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  // Truncation requires a handle opened for write.
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -CEPHFS_EBADF;
  // Only st_size is consumed by _setattr (mask is CEPH_SETATTR_SIZE);
  // the remaining fields of 'attr' are deliberately left unset.
  struct stat attr;
  attr.st_size = length;
  return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
}
10659
int Client::fsync(int fd, bool syncdataonly)
{
  // Flush dirty data (and, unless syncdataonly, metadata/caps) for an open
  // fd.  Any asynchronous write error recorded on the handle is folded
  // into (or cleared alongside) the return value.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "fsync" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << syncdataonly << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles permit no data operations.
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  int r = _fsync(f, syncdataonly);
  if (r == 0) {
    // The IOs in this fsync were okay, but maybe something happened
    // in the background that we should be reporting?
    r = f->take_async_err();
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
                  << ") = 0, async_err = " << r << dendl;
  } else {
    // Assume that an error we encountered during fsync, even reported
    // synchronously, would also have applied the error to the Fh, and we
    // should clear it here to avoid returning the same error again on next
    // call.
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
                  << r << dendl;
    f->take_async_err();
  }
  return r;
}
10696
int Client::_fsync(Inode *in, bool syncdataonly)
{
  // Core fsync on an inode: flush buffered data through the object cacher
  // (when enabled), then optionally flush dirty caps and wait for unsafe
  // MDS requests to become safe.  Caller must hold client_lock; it is
  // dropped while waiting on the objecter flush.
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  if (!syncdataonly && in->dirty_caps) {
    // Kick a synchronous cap flush; remember the tid so we can wait on it
    // below once the data flush has completed.
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  if (!syncdataonly && !in->unsafe_ops.empty()) {
    flush_mdlog_sync(in);

    // Requests complete in order, so waiting for the last unsafe op
    // covers all earlier ones too.
    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    // Drop client_lock while blocked so cache writeback can make progress.
    client_lock.unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
                     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
                  << cpp_strerror(-r) << dendl;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_fsync, lat);

  return r;
}
10765
int Client::_fsync(Fh *f, bool syncdataonly)
{
  // Handle-based fsync: logs, then delegates to the inode-based variant.
  ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
  return _fsync(f->inode.get(), syncdataonly);
}
10771
10772int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
10773{
f67539c2
TL
10774 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10775 if (!mref_reader.is_state_satisfied())
10776 return -CEPHFS_ENOTCONN;
10777
7c673cae
FG
10778 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
10779 tout(cct) << fd << std::endl;
10780
f67539c2 10781 std::scoped_lock lock(client_lock);
7c673cae
FG
10782 Fh *f = get_filehandle(fd);
10783 if (!f)
f67539c2 10784 return -CEPHFS_EBADF;
7c673cae
FG
10785 int r = _getattr(f->inode, mask, perms);
10786 if (r < 0)
10787 return r;
10788 fill_stat(f->inode, stbuf, NULL);
1adf2230 10789 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
7c673cae
FG
10790 return r;
10791}
10792
int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
                   unsigned int want, unsigned int flags)
{
  // statx on an open file descriptor.  'want'/'flags' are translated into a
  // getattr mask; when the mask is empty the cached inode is used as-is.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
  tout(cct) << fd << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;

  unsigned mask = statx_to_mask(flags, want);

  int r = 0;
  if (mask) {
    r = _getattr(f->inode, mask, perms);
    if (r < 0) {
      ldout(cct, 3) << "fstatx exit on error!" << dendl;
      return r;
    }
  }

  fill_statx(f->inode, mask, stx);
  ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
  return r;
}
10823
b3b6e05e
TL
10824int Client::statxat(int dirfd, const char *relpath,
10825 struct ceph_statx *stx, const UserPerm& perms,
10826 unsigned int want, unsigned int flags) {
10827 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10828 if (!mref_reader.is_state_satisfied()) {
10829 return -CEPHFS_ENOTCONN;
10830 }
10831
10832 tout(cct) << __func__ << " flags " << hex << flags << " want " << want << dec << std::endl;
10833 tout(cct) << dirfd << std::endl;
10834 tout(cct) << relpath << std::endl;
10835
10836 unsigned mask = statx_to_mask(flags, want);
10837
10838 InodeRef dirinode;
10839 std::scoped_lock lock(client_lock);
10840 int r = get_fd_inode(dirfd, &dirinode);
10841 if (r < 0) {
10842 return r;
10843 }
10844
10845 InodeRef in;
10846 filepath path(relpath);
10847 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask, dirinode);
10848 if (r < 0) {
10849 return r;
10850 }
10851 r = _getattr(in, mask, perms);
10852 if (r < 0) {
10853 ldout(cct, 3) << __func__ << " exit on error!" << dendl;
10854 return r;
10855 }
10856
10857 fill_statx(in, mask, stx);
10858 ldout(cct, 3) << __func__ << " dirfd" << dirfd << ", r= " << r << dendl;
10859 return r;
10860}
10861
7c673cae
FG
10862// not written yet, but i want to link!
10863
int Client::chdir(const char *relpath, std::string &new_cwd,
                  const UserPerm& perms)
{
  // Change the client's working directory; on success, writes the new
  // absolute path into new_cwd.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "chdir" << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;

  // Only directories can become the cwd.
  if (!(in.get()->is_dir()))
    return -CEPHFS_ENOTDIR;

  // swap() transfers the reference without touching refcounts.
  if (cwd != in)
    cwd.swap(in);
  ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;

  _getcwd(new_cwd, perms);
  return 0;
}
10892
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  // Build the absolute path of cwd by walking parent dentries up to the
  // mount root, issuing LOOKUPNAME requests for any link we don't have
  // cached.  On an unlinked cwd/ancestor, 'dir' is left unmodified.
  filepath path;
  ldout(cct, 10) << __func__ << " " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root.get()) {
    ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked

    // A cwd or ancestor is unlinked
    if (in->dentries.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
        break;

      // start over
      path = filepath();
      in = cwd.get();
      continue;
    }
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
10932
b5b8bbf5
FG
void Client::getcwd(string& dir, const UserPerm& perms)
{
  // Public wrapper around _getcwd(): requires an active mount and takes
  // the client lock.  Returns silently (dir untouched) when not mounted.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  std::scoped_lock l(client_lock);

  _getcwd(dir, perms);
}
10943
7c673cae
FG
int Client::statfs(const char *path, struct statvfs *stbuf,
                   const UserPerm& perms)
{
  // Fill *stbuf with filesystem statistics.  Space figures come either from
  // a byte quota on the mount's quota root (when client_quota_df is set) or
  // from cluster-wide RADOS pool stats.  client_lock is dropped while
  // waiting for the objecter reply.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  unsigned long int total_files_on_fs;

  ceph_statfs stats;
  C_SaferCond cond;

  std::unique_lock lock(client_lock);
  // With a single data pool we can report that pool's stats precisely;
  // otherwise fall back to whole-cluster statistics.
  const vector<int64_t> &data_pools = mdsmap->get_data_pools();
  if (data_pools.size() == 1) {
    objecter->get_fs_stats(stats, data_pools[0], &cond);
  } else {
    objecter->get_fs_stats(stats, std::optional<int64_t>(), &cond);
  }

  lock.unlock();
  int rval = cond.wait();
  lock.lock();

  ceph_assert(root);
  total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;

  if (rval < 0) {
    ldout(cct, 1) << "underlying call to statfs returned error: "
                  << cpp_strerror(rval)
                  << dendl;
    return rval;
  }

  memset(stbuf, 0, sizeof(*stbuf));

  /*
   * we're going to set a block size of 4MB so we can represent larger
   * FSes without overflowing. Additionally convert the space
   * measurements from KB to bytes while making them in terms of
   * blocks.  We use 4MB only because it is big enough, and because it
   * actually *is* the (ceph) default block size.
   */
  const int CEPH_BLOCK_SHIFT = 22;
  stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_files = total_files_on_fs;
  // Free/available inode counts are not meaningful for CephFS.
  stbuf->f_ffree = -1;
  stbuf->f_favail = -1;
  stbuf->f_fsid = -1;       // ??
  stbuf->f_flag = 0;        // ??
  stbuf->f_namemax = NAME_MAX;

  // Usually quota_root will == root_ancestor, but if the mount root has no
  // quota but we can see a parent of it that does have a quota, we'll
  // respect that one instead.
  ceph_assert(root != nullptr);
  InodeRef quota_root = root->quota.is_enable() ? root : get_quota_root(root.get(), perms);

  // get_quota_root should always give us something
  // because client quotas are always enabled
  ceph_assert(quota_root != nullptr);

  if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {

    // Skip the getattr if any sessions are stale, as we don't want to
    // block `df` if this client has e.g. been evicted, or if the MDS cluster
    // is unhealthy.
    if (!_any_stale_sessions()) {
      int r = _getattr(quota_root, 0, perms, true);
      if (r != 0) {
        // Ignore return value: error getting latest inode metadata is not a good
        // reason to break "df".
        lderr(cct) << "Error in getattr on quota root 0x"
                   << std::hex << quota_root->ino << std::dec
                   << " statfs result may be outdated" << dendl;
      }
    }

    // Special case: if there is a size quota set on the Inode acting
    // as the root for this client mount, then report the quota status
    // as the filesystem statistics.
    const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
    const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
    // It is possible for a quota to be exceeded: arithmetic here must
    // handle case where used > total.
    const fsblkcnt_t free = total > used ? total - used : 0;

    stbuf->f_blocks = total;
    stbuf->f_bfree = free;
    stbuf->f_bavail = free;
  } else {
    // General case: report the cluster statistics returned from RADOS. Because
    // multiple pools may be used without one filesystem namespace via
    // layouts, this is the most correct thing we can do.
    // Pool stats are in KB; shift by (22 - 10) to express them in 4MB blocks.
    stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  }

  return rval;
}
11047
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
                         struct flock *fl, uint64_t owner, bool removing)
{
  // Perform a GETFILELOCK/SETFILELOCK round trip with the MDS for an
  // fcntl- or flock-style lock, then mirror a successful SETFILELOCK into
  // the local (inode and handle) lock tables.  'sleep' requests a blocking
  // lock; 'removing' is set when releasing locks at handle close, in which
  // case the handle-side tables are not updated.
  ldout(cct, 10) << __func__ << " ino " << in->ino
                 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
                 << " type " << fl->l_type << " owner " << owner
                 << " " << fl->l_start << "~" << fl->l_len << dendl;

  // Lock state was lost (e.g. after session eviction); refuse new ops.
  if (in->flags & I_ERROR_FILELOCK)
    return -CEPHFS_EIO;

  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -CEPHFS_EIO;

  // Only a blocking SETFILELOCK that actually acquires may wait.
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // Decode the conflicting (or free) lock description into *fl.
      ceph_filelock filelock;
      auto p = bl.cbegin();
      decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
        fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
        fl->l_type = F_WRLCK;
      else
        fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // Update the inode-wide lock table (lazily allocated).
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
        if (!in->fcntl_locks)
          in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
        lock_state = in->fcntl_locks.get();
      } else if (lock_type == CEPH_LOCK_FLOCK) {
        if (!in->flock_locks)
          in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
        lock_state = in->flock_locks.get();
      } else {
        ceph_abort();
        return -CEPHFS_EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      if (!removing) {
        // Also track the lock on the file handle so it can be released
        // when the handle is closed.
        if (lock_type == CEPH_LOCK_FCNTL) {
          if (!fh->fcntl_locks)
            fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
          lock_state = fh->fcntl_locks.get();
        } else {
          if (!fh->flock_locks)
            fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
          lock_state = fh->flock_locks.get();
        }
        _update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
11161
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Cancel a pending blocking file-lock request: mark the original request
  // aborted, and if it already reached an MDS, send a companion
  // *_INTR/UNLOCK request so the MDS drops the queued lock attempt.
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-CEPHFS_EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -CEPHFS_EINVAL;
  }

  // Clone the original lock arguments, switching rule to the INTR variant
  // and type to UNLOCK.
  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
11194
11195void Client::_encode_filelocks(Inode *in, bufferlist& bl)
11196{
11197 if (!in->fcntl_locks && !in->flock_locks)
11198 return;
11199
11200 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
11fdf7f2 11201 encode(nr_fcntl_locks, bl);
7c673cae 11202 if (nr_fcntl_locks) {
11fdf7f2 11203 auto &lock_state = in->fcntl_locks;
20effc67 11204 for(auto p = lock_state->held_locks.begin();
7c673cae
FG
11205 p != lock_state->held_locks.end();
11206 ++p)
11fdf7f2 11207 encode(p->second, bl);
7c673cae
FG
11208 }
11209
11210 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
11fdf7f2 11211 encode(nr_flock_locks, bl);
7c673cae 11212 if (nr_flock_locks) {
11fdf7f2 11213 auto &lock_state = in->flock_locks;
20effc67 11214 for(auto p = lock_state->held_locks.begin();
7c673cae
FG
11215 p != lock_state->held_locks.end();
11216 ++p)
11fdf7f2 11217 encode(p->second, bl);
7c673cae
FG
11218 }
11219
11fdf7f2 11220 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
7c673cae
FG
11221 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
11222}
11223
void Client::_release_filelocks(Fh *fh)
{
  // Drop all locks recorded on this file handle.  Normally each lock is
  // released via an UNLOCK request to the MDS; in the I_ERROR_FILELOCK
  // state (lock state already lost) they are only removed locally.
  if (!fh->fcntl_locks && !fh->flock_locks)
    return;

  Inode *in = fh->inode.get();
  ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;

  list<ceph_filelock> activated_locks;

  list<pair<int, ceph_filelock> > to_release;

  if (fh->fcntl_locks) {
    auto &lock_state = fh->fcntl_locks;
    // Post-increment before possible removal keeps the iterator valid.
    for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
      auto q = p++;
      if (in->flags & I_ERROR_FILELOCK) {
        lock_state->remove_lock(q->second, activated_locks);
      } else {
        to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, q->second));
      }
    }
    lock_state.reset();
  }
  if (fh->flock_locks) {
    auto &lock_state = fh->flock_locks;
    for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
      auto q = p++;
      if (in->flags & I_ERROR_FILELOCK) {
        lock_state->remove_lock(q->second, activated_locks);
      } else {
        to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, q->second));
      }
    }
    lock_state.reset();
  }

  // Clear the error flag once no locks remain anywhere on the inode.
  if ((in->flags & I_ERROR_FILELOCK) && !in->has_any_filelocks())
    in->flags &= ~I_ERROR_FILELOCK;

  if (to_release.empty())
    return;

  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_whence = SEEK_SET;
  fl.l_type = F_UNLCK;

  // Send an UNLOCK for each held lock; removing=true skips re-recording
  // the (now gone) handle-side state.
  for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
       p != to_release.end();
       ++p) {
    fl.l_start = p->second.start;
    fl.l_len = p->second.length;
    fl.l_pid = p->second.pid;
    _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
                 p->second.owner, true);
  }
}
11282
11283void Client::_update_lock_state(struct flock *fl, uint64_t owner,
11284 ceph_lock_state_t *lock_state)
11285{
11286 int lock_cmd;
11287 if (F_RDLCK == fl->l_type)
11288 lock_cmd = CEPH_LOCK_SHARED;
11289 else if (F_WRLCK == fl->l_type)
11290 lock_cmd = CEPH_LOCK_EXCL;
11291 else
11292 lock_cmd = CEPH_LOCK_UNLOCK;;
11293
11294 ceph_filelock filelock;
11295 filelock.start = fl->l_start;
11296 filelock.length = fl->l_len;
11297 filelock.client = 0;
11298 // see comment in _do_filelock()
11299 filelock.owner = owner | (1ULL << 63);
11300 filelock.pid = fl->l_pid;
11301 filelock.type = lock_cmd;
11302
11303 if (filelock.type == CEPH_LOCK_UNLOCK) {
11304 list<ceph_filelock> activated_locks;
11305 lock_state->remove_lock(filelock, activated_locks);
11306 } else {
11307 bool r = lock_state->add_lock(filelock, false, false, NULL);
11fdf7f2 11308 ceph_assert(r);
7c673cae
FG
11309 }
11310}
11311
11312int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
11313{
11314 Inode *in = fh->inode.get();
11315 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
11316 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
11317 return ret;
11318}
11319
int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
{
  // F_SETLK/F_SETLKW: set or clear an fcntl-style lock; 'sleep' selects
  // the blocking variant.
  Inode *in = fh->inode.get();
  ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
  int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
  ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
  return ret;
}
11328
int Client::_flock(Fh *fh, int cmd, uint64_t owner)
{
  // flock(2)-style whole-file lock, translated into a SETFILELOCK on the
  // FLOCK rule (start/len zero = entire file).
  Inode *in = fh->inode.get();
  ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;

  // LOCK_NB means "don't block"; strip it before decoding the operation.
  int sleep = !(cmd & LOCK_NB);
  cmd &= ~LOCK_NB;

  int type;
  switch (cmd) {
  case LOCK_SH:
    type = F_RDLCK;
    break;
  case LOCK_EX:
    type = F_WRLCK;
    break;
  case LOCK_UN:
    type = F_UNLCK;
    break;
  default:
    return -CEPHFS_EINVAL;
  }

  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_type = type;
  fl.l_whence = SEEK_SET;

  int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
  ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
  return ret;
}
11361
f67539c2
TL
int Client::get_snap_info(const char *path, const UserPerm &perms, SnapInfo *snap_info) {
  // Look up a snapshot path and return its id and metadata.
  // Fails with -CEPHFS_EINVAL if the path resolves to a non-snapped inode.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  std::scoped_lock lock(client_lock);
  InodeRef in;
  int r = Client::path_walk(path, &in, perms, true);
  if (r < 0) {
    return r;
  }

  if (in->snapid == CEPH_NOSNAP) {
    return -CEPHFS_EINVAL;
  }

  snap_info->id = in->snapid;
  snap_info->metadata = in->snap_metadata;
  return 0;
}
11383
int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
{
  /* Since the only thing this does is wrap a call to statfs, and
     statfs takes a lock, it doesn't seem we have a need to split it
     out. */
  // Note: 'in' is intentionally unused; statfs() reports mount-wide stats.
  return statfs(0, stbuf, perms);
}
11391
void Client::_ll_register_callbacks(struct ceph_client_callback_args *args)
{
  // Install the caller-supplied callback set and start the matching
  // finisher/invalidator threads for each callback that was provided.
  // A null 'args' is a no-op.
  if (!args)
    return;

  ldout(cct, 10) << __func__ << " cb " << args->handle
                 << " invalidate_ino_cb " << args->ino_cb
                 << " invalidate_dentry_cb " << args->dentry_cb
                 << " switch_interrupt_cb " << args->switch_intr_cb
                 << " remount_cb " << args->remount_cb
                 << dendl;
  callback_handle = args->handle;
  if (args->ino_cb) {
    ino_invalidate_cb = args->ino_cb;
    async_ino_invalidator.start();
  }
  if (args->dentry_cb) {
    dentry_invalidate_cb = args->dentry_cb;
    async_dentry_invalidator.start();
  }
  if (args->switch_intr_cb) {
    switch_interrupt_cb = args->switch_intr_cb;
    interrupt_finisher.start();
  }
  if (args->remount_cb) {
    remount_cb = args->remount_cb;
    remount_finisher.start();
  }
  if (args->ino_release_cb) {
    ino_release_cb = args->ino_release_cb;
    async_ino_releasor.start();
  }
  if (args->umask_cb)
    umask_cb = args->umask_cb;
}
11427
20effc67
TL
11428// This is deprecated, use ll_register_callbacks2() instead.
11429void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
11430{
11431 ceph_assert(!is_mounting() && !is_mounted() && !is_unmounting());
11432
11433 _ll_register_callbacks(args);
11434}
11435
11436int Client::ll_register_callbacks2(struct ceph_client_callback_args *args)
11437{
11438 if (is_mounting() || is_mounted() || is_unmounting())
11439 return -CEPHFS_EBUSY;
11440
11441 _ll_register_callbacks(args);
11442 return 0;
11443}
11444
1d09f67e 11445std::pair<int, bool> Client::test_dentry_handling(bool can_invalidate)
7c673cae 11446{
1d09f67e 11447 std::pair <int, bool> r(0, false);
7c673cae 11448
f67539c2
TL
11449 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
11450 if (!iref_reader.is_state_satisfied())
1d09f67e 11451 return std::make_pair(-CEPHFS_ENOTCONN, false);
f67539c2 11452
7c673cae
FG
11453 can_invalidate_dentries = can_invalidate;
11454
11455 if (can_invalidate_dentries) {
11fdf7f2 11456 ceph_assert(dentry_invalidate_cb);
7c673cae 11457 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
11fdf7f2
TL
11458 } else {
11459 ceph_assert(remount_cb);
7c673cae 11460 ldout(cct, 1) << "using remount_cb" << dendl;
91327a77 11461 r = _do_remount(false);
b32b8144 11462 }
11fdf7f2 11463
7c673cae
FG
11464 return r;
11465}
11466
11467int Client::_sync_fs()
11468{
f67539c2
TL
11469 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
11470
11fdf7f2 11471 ldout(cct, 10) << __func__ << dendl;
7c673cae
FG
11472
11473 // flush file data
11fdf7f2
TL
11474 std::unique_ptr<C_SaferCond> cond = nullptr;
11475 if (cct->_conf->client_oc) {
11476 cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
11477 objectcacher->flush_all(cond.get());
11478 }
7c673cae
FG
11479
11480 // flush caps
11481 flush_caps_sync();
11482 ceph_tid_t flush_tid = last_flush_tid;
11483
11484 // wait for unsafe mds requests
11485 wait_unsafe_requests();
11486
11487 wait_sync_caps(flush_tid);
11488
11fdf7f2 11489 if (nullptr != cond) {
9f95a23c 11490 client_lock.unlock();
11fdf7f2
TL
11491 ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
11492 cond->wait();
11493 ldout(cct, 15) << __func__ << " flush finished" << dendl;
9f95a23c 11494 client_lock.lock();
7c673cae
FG
11495 }
11496
11497 return 0;
11498}
11499
11500int Client::sync_fs()
11501{
f67539c2
TL
11502 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11503 if (!mref_reader.is_state_satisfied())
11504 return -CEPHFS_ENOTCONN;
181888fb 11505
f67539c2 11506 std::scoped_lock l(client_lock);
181888fb 11507
7c673cae
FG
11508 return _sync_fs();
11509}
11510
11511int64_t Client::drop_caches()
11512{
f67539c2 11513 std::scoped_lock l(client_lock);
7c673cae
FG
11514 return objectcacher->release_all();
11515}
11516
11fdf7f2
TL
11517int Client::_lazyio(Fh *fh, int enable)
11518{
11519 Inode *in = fh->inode.get();
11520 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
11521
11522 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
11523 return 0;
11524
11525 int orig_mode = fh->mode;
11526 if (enable) {
11527 fh->mode |= CEPH_FILE_MODE_LAZY;
11528 in->get_open_ref(fh->mode);
11529 in->put_open_ref(orig_mode);
11530 check_caps(in, CHECK_CAPS_NODELAY);
11531 } else {
11532 fh->mode &= ~CEPH_FILE_MODE_LAZY;
11533 in->get_open_ref(fh->mode);
11534 in->put_open_ref(orig_mode);
11535 check_caps(in, 0);
11536 }
11537
11538 return 0;
11539}
11540
11541int Client::lazyio(int fd, int enable)
11542{
f67539c2 11543 std::scoped_lock l(client_lock);
11fdf7f2
TL
11544 Fh *f = get_filehandle(fd);
11545 if (!f)
f67539c2 11546 return -CEPHFS_EBADF;
11fdf7f2
TL
11547
11548 return _lazyio(f, enable);
11549}
11550
11551int Client::ll_lazyio(Fh *fh, int enable)
11552{
11fdf7f2
TL
11553 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
11554 tout(cct) << __func__ << std::endl;
11555
f67539c2 11556 std::scoped_lock lock(client_lock);
11fdf7f2
TL
11557 return _lazyio(fh, enable);
11558}
7c673cae 11559
92f5a8d4 11560int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
7c673cae 11561{
f67539c2 11562 std::scoped_lock l(client_lock);
92f5a8d4 11563 ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
7c673cae
FG
11564 << ", " << offset << ", " << count << ")" << dendl;
11565
11566 Fh *f = get_filehandle(fd);
11567 if (!f)
f67539c2 11568 return -CEPHFS_EBADF;
7c673cae
FG
11569
11570 // for now
11571 _fsync(f, true);
11572
11573 return 0;
11574}
11575
11576int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
11577{
f67539c2 11578 std::scoped_lock l(client_lock);
7c673cae
FG
11579 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
11580 << ", " << offset << ", " << count << ")" << dendl;
11581
11582 Fh *f = get_filehandle(fd);
11583 if (!f)
f67539c2 11584 return -CEPHFS_EBADF;
7c673cae
FG
11585 Inode *in = f->inode.get();
11586
11587 _fsync(f, true);
92f5a8d4
TL
11588 if (_release(in)) {
11589 int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
11590 if (r < 0)
11591 return r;
11592 }
7c673cae
FG
11593 return 0;
11594}
11595
11596
11597// =============================
11598// snaps
11599
f67539c2
TL
11600int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm,
11601 mode_t mode, const std::map<std::string, std::string> &metadata)
7c673cae 11602{
f67539c2
TL
11603 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11604 if (!mref_reader.is_state_satisfied())
11605 return -CEPHFS_ENOTCONN;
181888fb 11606
f67539c2 11607 std::scoped_lock l(client_lock);
181888fb 11608
7c673cae
FG
11609 filepath path(relpath);
11610 InodeRef in;
11611 int r = path_walk(path, &in, perm);
11612 if (r < 0)
11613 return r;
11614 if (cct->_conf->client_permissions) {
11615 r = may_create(in.get(), perm);
11616 if (r < 0)
11617 return r;
11618 }
11619 Inode *snapdir = open_snapdir(in.get());
f67539c2 11620 return _mkdir(snapdir, name, mode, perm, nullptr, metadata);
7c673cae 11621}
181888fb 11622
f67539c2 11623int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms, bool check_perms)
7c673cae 11624{
f67539c2
TL
11625 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11626 if (!mref_reader.is_state_satisfied())
11627 return -CEPHFS_ENOTCONN;
181888fb 11628
f67539c2 11629 std::scoped_lock l(client_lock);
181888fb 11630
7c673cae
FG
11631 filepath path(relpath);
11632 InodeRef in;
11633 int r = path_walk(path, &in, perms);
11634 if (r < 0)
11635 return r;
f67539c2 11636 Inode *snapdir = open_snapdir(in.get());
7c673cae 11637 if (cct->_conf->client_permissions) {
f67539c2 11638 r = may_delete(snapdir, check_perms ? name : NULL, perms);
7c673cae
FG
11639 if (r < 0)
11640 return r;
11641 }
7c673cae
FG
11642 return _rmdir(snapdir, name, perms);
11643}
11644
11645// =============================
11646// expose caps
11647
f67539c2
TL
11648int Client::get_caps_issued(int fd)
11649{
11650 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11651 if (!mref_reader.is_state_satisfied())
11652 return -CEPHFS_ENOTCONN;
7c673cae 11653
f67539c2 11654 std::scoped_lock lock(client_lock);
181888fb 11655
7c673cae
FG
11656 Fh *f = get_filehandle(fd);
11657 if (!f)
f67539c2 11658 return -CEPHFS_EBADF;
7c673cae
FG
11659
11660 return f->inode->caps_issued();
11661}
11662
11663int Client::get_caps_issued(const char *path, const UserPerm& perms)
11664{
f67539c2
TL
11665 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11666 if (!mref_reader.is_state_satisfied())
11667 return -CEPHFS_ENOTCONN;
181888fb 11668
f67539c2 11669 std::scoped_lock lock(client_lock);
181888fb 11670
7c673cae
FG
11671 filepath p(path);
11672 InodeRef in;
11673 int r = path_walk(p, &in, perms, true);
11674 if (r < 0)
11675 return r;
11676 return in->caps_issued();
11677}
11678
11679// =========================================
11680// low level
11681
11682Inode *Client::open_snapdir(Inode *diri)
11683{
11684 Inode *in;
11685 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
11686 if (!inode_map.count(vino)) {
11687 in = new Inode(this, vino, &diri->layout);
11688
11689 in->ino = diri->ino;
11690 in->snapid = CEPH_SNAPDIR;
11691 in->mode = diri->mode;
11692 in->uid = diri->uid;
11693 in->gid = diri->gid;
494da23a 11694 in->nlink = 1;
7c673cae
FG
11695 in->mtime = diri->mtime;
11696 in->ctime = diri->ctime;
11697 in->btime = diri->btime;
f6b5b4d7 11698 in->atime = diri->atime;
7c673cae
FG
11699 in->size = diri->size;
11700 in->change_attr = diri->change_attr;
11701
11702 in->dirfragtree.clear();
11703 in->snapdir_parent = diri;
11704 diri->flags |= I_SNAPDIR_OPEN;
11705 inode_map[vino] = in;
11706 if (use_faked_inos())
11707 _assign_faked_ino(in);
11708 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
11709 } else {
11710 in = inode_map[vino];
11711 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
11712 }
11713 return in;
11714}
11715
11716int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
11717 Inode **out, const UserPerm& perms)
11718{
f67539c2
TL
11719 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11720 if (!mref_reader.is_state_satisfied())
11721 return -CEPHFS_ENOTCONN;
11722
31f18b77 11723 vinodeno_t vparent = _get_vino(parent);
11fdf7f2
TL
11724 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
11725 tout(cct) << __func__ << std::endl;
7c673cae
FG
11726 tout(cct) << name << std::endl;
11727
f67539c2 11728 std::scoped_lock lock(client_lock);
181888fb 11729
7c673cae 11730 int r = 0;
11fdf7f2
TL
11731 if (!fuse_default_permissions) {
11732 if (strcmp(name, ".") && strcmp(name, "..")) {
11733 r = may_lookup(parent, perms);
11734 if (r < 0)
11735 return r;
11736 }
7c673cae
FG
11737 }
11738
11739 string dname(name);
11740 InodeRef in;
11741
11742 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
11743 if (r < 0) {
11744 attr->st_ino = 0;
11745 goto out;
11746 }
11747
11fdf7f2 11748 ceph_assert(in);
7c673cae
FG
11749 fill_stat(in, attr);
11750 _ll_get(in.get());
11751
11752 out:
11fdf7f2 11753 ldout(cct, 3) << __func__ << " " << vparent << " " << name
7c673cae
FG
11754 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11755 tout(cct) << attr->st_ino << std::endl;
11756 *out = in.get();
11757 return r;
11758}
11759
f67539c2
TL
11760int Client::ll_lookup_vino(
11761 vinodeno_t vino,
1adf2230
AA
11762 const UserPerm& perms,
11763 Inode **inode)
11764{
81eedcae 11765 ceph_assert(inode != NULL);
f67539c2
TL
11766 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11767 if (!mref_reader.is_state_satisfied())
11768 return -CEPHFS_ENOTCONN;
81eedcae 11769
b3b6e05e
TL
11770 if (is_reserved_vino(vino))
11771 return -CEPHFS_ESTALE;
11772
f67539c2
TL
11773 std::scoped_lock lock(client_lock);
11774 ldout(cct, 3) << __func__ << " " << vino << dendl;
1adf2230 11775
f67539c2
TL
11776 // Check the cache first
11777 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11778 if (p != inode_map.end()) {
11779 *inode = p->second;
11780 _ll_get(*inode);
81eedcae
TL
11781 return 0;
11782 }
11783
f67539c2 11784 uint64_t snapid = vino.snapid;
81eedcae 11785
f67539c2
TL
11786 // for snapdir, find the non-snapped dir inode
11787 if (snapid == CEPH_SNAPDIR)
11788 vino.snapid = CEPH_NOSNAP;
11789
11790 int r = _lookup_vino(vino, perms, inode);
11791 if (r)
1adf2230 11792 return r;
f67539c2 11793 ceph_assert(*inode != NULL);
81eedcae 11794
f67539c2
TL
11795 if (snapid == CEPH_SNAPDIR) {
11796 Inode *tmp = *inode;
1adf2230 11797
f67539c2
TL
11798 // open the snapdir and put the inode ref
11799 *inode = open_snapdir(tmp);
11800 _ll_forget(tmp, 1);
11801 _ll_get(*inode);
1adf2230 11802 }
1adf2230
AA
11803 return 0;
11804}
11805
f67539c2
TL
11806int Client::ll_lookup_inode(
11807 struct inodeno_t ino,
11808 const UserPerm& perms,
11809 Inode **inode)
11810{
11811 vinodeno_t vino(ino, CEPH_NOSNAP);
11812 return ll_lookup_vino(vino, perms, inode);
11813}
11814
7c673cae
FG
11815int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
11816 struct ceph_statx *stx, unsigned want, unsigned flags,
11817 const UserPerm& perms)
11818{
f67539c2
TL
11819 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11820 if (!mref_reader.is_state_satisfied())
11821 return -CEPHFS_ENOTCONN;
11822
31f18b77 11823 vinodeno_t vparent = _get_vino(parent);
11fdf7f2 11824 ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
7c673cae
FG
11825 tout(cct) << "ll_lookupx" << std::endl;
11826 tout(cct) << name << std::endl;
11827
f67539c2 11828 std::scoped_lock lock(client_lock);
181888fb 11829
7c673cae 11830 int r = 0;
11fdf7f2 11831 if (!fuse_default_permissions) {
7c673cae
FG
11832 r = may_lookup(parent, perms);
11833 if (r < 0)
11834 return r;
11835 }
11836
11837 string dname(name);
11838 InodeRef in;
11839
11840 unsigned mask = statx_to_mask(flags, want);
11841 r = _lookup(parent, dname, mask, &in, perms);
11842 if (r < 0) {
11843 stx->stx_ino = 0;
11844 stx->stx_mask = 0;
11845 } else {
11fdf7f2 11846 ceph_assert(in);
7c673cae
FG
11847 fill_statx(in, mask, stx);
11848 _ll_get(in.get());
11849 }
11850
11fdf7f2 11851 ldout(cct, 3) << __func__ << " " << vparent << " " << name
7c673cae
FG
11852 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11853 tout(cct) << stx->stx_ino << std::endl;
11854 *out = in.get();
11855 return r;
11856}
11857
11858int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
11859 unsigned int want, unsigned int flags, const UserPerm& perms)
11860{
f67539c2
TL
11861 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11862 if (!mref_reader.is_state_satisfied())
11863 return -CEPHFS_ENOTCONN;
181888fb 11864
7c673cae
FG
11865 filepath fp(name, 0);
11866 InodeRef in;
11867 int rc;
11868 unsigned mask = statx_to_mask(flags, want);
11869
11fdf7f2
TL
11870 ldout(cct, 3) << __func__ << " " << name << dendl;
11871 tout(cct) << __func__ << std::endl;
7c673cae
FG
11872 tout(cct) << name << std::endl;
11873
f67539c2 11874 std::scoped_lock lock(client_lock);
7c673cae
FG
11875 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
11876 if (rc < 0) {
11877 /* zero out mask, just in case... */
11878 stx->stx_mask = 0;
11879 stx->stx_ino = 0;
11880 *out = NULL;
11881 return rc;
11882 } else {
11fdf7f2 11883 ceph_assert(in);
7c673cae
FG
11884 fill_statx(in, mask, stx);
11885 _ll_get(in.get());
11886 *out = in.get();
11887 return 0;
11888 }
11889}
11890
11891void Client::_ll_get(Inode *in)
11892{
11893 if (in->ll_ref == 0) {
b3b6e05e 11894 in->iget();
11fdf7f2
TL
11895 if (in->is_dir() && !in->dentries.empty()) {
11896 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
7c673cae
FG
11897 in->get_first_parent()->get(); // pin dentry
11898 }
11fdf7f2
TL
11899 if (in->snapid != CEPH_NOSNAP)
11900 ll_snap_ref[in->snapid]++;
7c673cae
FG
11901 }
11902 in->ll_get();
11fdf7f2 11903 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
7c673cae
FG
11904}
11905
494da23a 11906int Client::_ll_put(Inode *in, uint64_t num)
7c673cae
FG
11907{
11908 in->ll_put(num);
11fdf7f2 11909 ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
7c673cae 11910 if (in->ll_ref == 0) {
11fdf7f2
TL
11911 if (in->is_dir() && !in->dentries.empty()) {
11912 ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
7c673cae
FG
11913 in->get_first_parent()->put(); // unpin dentry
11914 }
11fdf7f2
TL
11915 if (in->snapid != CEPH_NOSNAP) {
11916 auto p = ll_snap_ref.find(in->snapid);
11917 ceph_assert(p != ll_snap_ref.end());
11918 ceph_assert(p->second > 0);
11919 if (--p->second == 0)
11920 ll_snap_ref.erase(p);
11921 }
7c673cae
FG
11922 put_inode(in);
11923 return 0;
11924 } else {
11925 return in->ll_ref;
11926 }
11927}
11928
11929void Client::_ll_drop_pins()
11930{
11fdf7f2 11931 ldout(cct, 10) << __func__ << dendl;
1adf2230 11932 std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
7c673cae
FG
11933 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
11934 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
11935 it != inode_map.end();
11936 it = next) {
11937 Inode *in = it->second;
11938 next = it;
11939 ++next;
1adf2230
AA
11940 if (in->ll_ref){
11941 to_be_put.insert(in);
7c673cae 11942 _ll_put(in, in->ll_ref);
1adf2230 11943 }
7c673cae
FG
11944 }
11945}
11946
494da23a 11947bool Client::_ll_forget(Inode *in, uint64_t count)
7c673cae 11948{
11fdf7f2 11949 inodeno_t ino = in->ino;
7c673cae 11950
11fdf7f2
TL
11951 ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
11952 tout(cct) << __func__ << std::endl;
7c673cae
FG
11953 tout(cct) << ino.val << std::endl;
11954 tout(cct) << count << std::endl;
11955
181888fb 11956 // Ignore forget if we're no longer mounted
f67539c2
TL
11957 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11958 if (!mref_reader.is_state_satisfied())
181888fb
FG
11959 return true;
11960
7c673cae
FG
11961 if (ino == 1) return true; // ignore forget on root.
11962
11963 bool last = false;
11964 if (in->ll_ref < count) {
11965 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
11966 << ", which only has ll_ref=" << in->ll_ref << dendl;
11967 _ll_put(in, in->ll_ref);
11968 last = true;
11969 } else {
11970 if (_ll_put(in, count) == 0)
11971 last = true;
11972 }
11973
11974 return last;
11975}
11976
494da23a 11977bool Client::ll_forget(Inode *in, uint64_t count)
1adf2230 11978{
f67539c2 11979 std::scoped_lock lock(client_lock);
1adf2230
AA
11980 return _ll_forget(in, count);
11981}
11982
7c673cae
FG
11983bool Client::ll_put(Inode *in)
11984{
11985 /* ll_forget already takes the lock */
11986 return ll_forget(in, 1);
11987}
11988
11fdf7f2
TL
11989int Client::ll_get_snap_ref(snapid_t snap)
11990{
f67539c2 11991 std::scoped_lock lock(client_lock);
11fdf7f2
TL
11992 auto p = ll_snap_ref.find(snap);
11993 if (p != ll_snap_ref.end())
11994 return p->second;
11995 return 0;
11996}
11997
7c673cae
FG
11998snapid_t Client::ll_get_snapid(Inode *in)
11999{
f67539c2 12000 std::scoped_lock lock(client_lock);
7c673cae
FG
12001 return in->snapid;
12002}
12003
12004Inode *Client::ll_get_inode(ino_t ino)
12005{
f67539c2
TL
12006 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12007 if (!mref_reader.is_state_satisfied())
181888fb
FG
12008 return NULL;
12009
f67539c2
TL
12010 std::scoped_lock lock(client_lock);
12011
7c673cae
FG
12012 vinodeno_t vino = _map_faked_ino(ino);
12013 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
12014 if (p == inode_map.end())
12015 return NULL;
12016 Inode *in = p->second;
12017 _ll_get(in);
12018 return in;
12019}
12020
12021Inode *Client::ll_get_inode(vinodeno_t vino)
12022{
f67539c2
TL
12023 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12024 if (!mref_reader.is_state_satisfied())
181888fb
FG
12025 return NULL;
12026
b3b6e05e
TL
12027 if (is_reserved_vino(vino))
12028 return NULL;
12029
f67539c2
TL
12030 std::scoped_lock lock(client_lock);
12031
7c673cae
FG
12032 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
12033 if (p == inode_map.end())
12034 return NULL;
12035 Inode *in = p->second;
12036 _ll_get(in);
12037 return in;
12038}
12039
12040int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
12041{
12042 vinodeno_t vino = _get_vino(in);
12043
11fdf7f2
TL
12044 ldout(cct, 8) << __func__ << " " << vino << dendl;
12045 tout(cct) << __func__ << std::endl;
7c673cae
FG
12046 tout(cct) << vino.ino.val << std::endl;
12047
12048 if (vino.snapid < CEPH_NOSNAP)
12049 return 0;
12050 else
12051 return _getattr(in, caps, perms);
12052}
12053
12054int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
12055{
f67539c2
TL
12056 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12057 if (!mref_reader.is_state_satisfied())
12058 return -CEPHFS_ENOTCONN;
7c673cae 12059
f67539c2 12060 std::scoped_lock lock(client_lock);
181888fb 12061
7c673cae
FG
12062 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
12063
12064 if (res == 0)
12065 fill_stat(in, attr);
11fdf7f2 12066 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
12067 return res;
12068}
12069
12070int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
12071 unsigned int flags, const UserPerm& perms)
12072{
f67539c2
TL
12073 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12074 if (!mref_reader.is_state_satisfied())
12075 return -CEPHFS_ENOTCONN;
7c673cae 12076
f67539c2 12077 std::scoped_lock lock(client_lock);
181888fb 12078
7c673cae
FG
12079 int res = 0;
12080 unsigned mask = statx_to_mask(flags, want);
12081
94b18763 12082 if (mask && !in->caps_issued_mask(mask, true))
7c673cae
FG
12083 res = _ll_getattr(in, mask, perms);
12084
12085 if (res == 0)
12086 fill_statx(in, mask, stx);
11fdf7f2 12087 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
12088 return res;
12089}
12090
12091int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
12092 const UserPerm& perms, InodeRef *inp)
12093{
12094 vinodeno_t vino = _get_vino(in);
12095
11fdf7f2 12096 ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
7c673cae 12097 << dendl;
11fdf7f2 12098 tout(cct) << __func__ << std::endl;
7c673cae
FG
12099 tout(cct) << vino.ino.val << std::endl;
12100 tout(cct) << stx->stx_mode << std::endl;
12101 tout(cct) << stx->stx_uid << std::endl;
12102 tout(cct) << stx->stx_gid << std::endl;
12103 tout(cct) << stx->stx_size << std::endl;
12104 tout(cct) << stx->stx_mtime << std::endl;
12105 tout(cct) << stx->stx_atime << std::endl;
12106 tout(cct) << stx->stx_btime << std::endl;
12107 tout(cct) << mask << std::endl;
12108
11fdf7f2 12109 if (!fuse_default_permissions) {
7c673cae
FG
12110 int res = may_setattr(in, stx, mask, perms);
12111 if (res < 0)
12112 return res;
12113 }
12114
12115 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
12116
12117 return __setattrx(in, stx, mask, perms, inp);
12118}
12119
12120int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
12121 const UserPerm& perms)
12122{
f67539c2
TL
12123 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12124 if (!mref_reader.is_state_satisfied())
12125 return -CEPHFS_ENOTCONN;
181888fb 12126
f67539c2 12127 std::scoped_lock lock(client_lock);
181888fb 12128
7c673cae
FG
12129 InodeRef target(in);
12130 int res = _ll_setattrx(in, stx, mask, perms, &target);
12131 if (res == 0) {
11fdf7f2 12132 ceph_assert(in == target.get());
7c673cae
FG
12133 fill_statx(in, in->caps_issued(), stx);
12134 }
12135
11fdf7f2 12136 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
12137 return res;
12138}
12139
12140int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
12141 const UserPerm& perms)
12142{
12143 struct ceph_statx stx;
12144 stat_to_statx(attr, &stx);
12145
f67539c2
TL
12146 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12147 if (!mref_reader.is_state_satisfied())
12148 return -CEPHFS_ENOTCONN;
181888fb 12149
f67539c2 12150 std::scoped_lock lock(client_lock);
181888fb 12151
7c673cae
FG
12152 InodeRef target(in);
12153 int res = _ll_setattrx(in, &stx, mask, perms, &target);
12154 if (res == 0) {
11fdf7f2 12155 ceph_assert(in == target.get());
7c673cae
FG
12156 fill_stat(in, attr);
12157 }
12158
11fdf7f2 12159 ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
7c673cae
FG
12160 return res;
12161}
12162
12163
12164// ----------
12165// xattrs
12166
12167int Client::getxattr(const char *path, const char *name, void *value, size_t size,
12168 const UserPerm& perms)
12169{
f67539c2
TL
12170 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12171 if (!mref_reader.is_state_satisfied())
12172 return -CEPHFS_ENOTCONN;
181888fb 12173
f67539c2 12174 std::scoped_lock lock(client_lock);
181888fb 12175
7c673cae
FG
12176 InodeRef in;
12177 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12178 if (r < 0)
12179 return r;
12180 return _getxattr(in, name, value, size, perms);
12181}
12182
12183int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
12184 const UserPerm& perms)
12185{
f67539c2
TL
12186 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12187 if (!mref_reader.is_state_satisfied())
12188 return -CEPHFS_ENOTCONN;
181888fb 12189
f67539c2 12190 std::scoped_lock lock(client_lock);
181888fb 12191
7c673cae
FG
12192 InodeRef in;
12193 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12194 if (r < 0)
12195 return r;
12196 return _getxattr(in, name, value, size, perms);
12197}
12198
12199int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
12200 const UserPerm& perms)
12201{
f67539c2
TL
12202 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12203 if (!mref_reader.is_state_satisfied())
12204 return -CEPHFS_ENOTCONN;
181888fb 12205
f67539c2 12206 std::scoped_lock lock(client_lock);
181888fb 12207
7c673cae
FG
12208 Fh *f = get_filehandle(fd);
12209 if (!f)
f67539c2 12210 return -CEPHFS_EBADF;
7c673cae
FG
12211 return _getxattr(f->inode, name, value, size, perms);
12212}
12213
12214int Client::listxattr(const char *path, char *list, size_t size,
12215 const UserPerm& perms)
12216{
f67539c2
TL
12217 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12218 if (!mref_reader.is_state_satisfied())
12219 return -CEPHFS_ENOTCONN;
181888fb 12220
f67539c2 12221 std::scoped_lock lock(client_lock);
181888fb 12222
7c673cae
FG
12223 InodeRef in;
12224 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
12225 if (r < 0)
12226 return r;
12227 return Client::_listxattr(in.get(), list, size, perms);
12228}
12229
12230int Client::llistxattr(const char *path, char *list, size_t size,
12231 const UserPerm& perms)
12232{
f67539c2
TL
12233 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12234 if (!mref_reader.is_state_satisfied())
12235 return -CEPHFS_ENOTCONN;
181888fb 12236
f67539c2 12237 std::scoped_lock lock(client_lock);
181888fb 12238
7c673cae
FG
12239 InodeRef in;
12240 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
12241 if (r < 0)
12242 return r;
12243 return Client::_listxattr(in.get(), list, size, perms);
12244}
12245
12246int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
12247{
f67539c2
TL
12248 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12249 if (!mref_reader.is_state_satisfied())
12250 return -CEPHFS_ENOTCONN;
181888fb 12251
f67539c2 12252 std::scoped_lock lock(client_lock);
181888fb 12253
7c673cae
FG
12254 Fh *f = get_filehandle(fd);
12255 if (!f)
f67539c2 12256 return -CEPHFS_EBADF;
7c673cae
FG
12257 return Client::_listxattr(f->inode.get(), list, size, perms);
12258}
12259
12260int Client::removexattr(const char *path, const char *name,
12261 const UserPerm& perms)
12262{
f67539c2
TL
12263 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12264 if (!mref_reader.is_state_satisfied())
12265 return -CEPHFS_ENOTCONN;
181888fb 12266
f67539c2 12267 std::scoped_lock lock(client_lock);
181888fb 12268
7c673cae
FG
12269 InodeRef in;
12270 int r = Client::path_walk(path, &in, perms, true);
12271 if (r < 0)
12272 return r;
12273 return _removexattr(in, name, perms);
12274}
12275
12276int Client::lremovexattr(const char *path, const char *name,
12277 const UserPerm& perms)
12278{
f67539c2
TL
12279 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12280 if (!mref_reader.is_state_satisfied())
12281 return -CEPHFS_ENOTCONN;
181888fb 12282
f67539c2 12283 std::scoped_lock lock(client_lock);
181888fb 12284
7c673cae
FG
12285 InodeRef in;
12286 int r = Client::path_walk(path, &in, perms, false);
12287 if (r < 0)
12288 return r;
12289 return _removexattr(in, name, perms);
12290}
12291
12292int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
12293{
f67539c2
TL
12294 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12295 if (!mref_reader.is_state_satisfied())
12296 return -CEPHFS_ENOTCONN;
181888fb 12297
f67539c2 12298 std::scoped_lock lock(client_lock);
181888fb 12299
7c673cae
FG
12300 Fh *f = get_filehandle(fd);
12301 if (!f)
f67539c2 12302 return -CEPHFS_EBADF;
7c673cae
FG
12303 return _removexattr(f->inode, name, perms);
12304}
12305
12306int Client::setxattr(const char *path, const char *name, const void *value,
12307 size_t size, int flags, const UserPerm& perms)
12308{
f67539c2
TL
12309 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12310 if (!mref_reader.is_state_satisfied())
12311 return -CEPHFS_ENOTCONN;
12312
7c673cae
FG
12313 _setxattr_maybe_wait_for_osdmap(name, value, size);
12314
f67539c2 12315 std::scoped_lock lock(client_lock);
181888fb 12316
7c673cae
FG
12317 InodeRef in;
12318 int r = Client::path_walk(path, &in, perms, true);
12319 if (r < 0)
12320 return r;
12321 return _setxattr(in, name, value, size, flags, perms);
12322}
12323
12324int Client::lsetxattr(const char *path, const char *name, const void *value,
12325 size_t size, int flags, const UserPerm& perms)
12326{
f67539c2
TL
12327 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12328 if (!mref_reader.is_state_satisfied())
12329 return -CEPHFS_ENOTCONN;
7c673cae 12330
f67539c2 12331 _setxattr_maybe_wait_for_osdmap(name, value, size);
181888fb 12332
f67539c2 12333 std::scoped_lock lock(client_lock);
181888fb 12334
7c673cae
FG
12335 InodeRef in;
12336 int r = Client::path_walk(path, &in, perms, false);
12337 if (r < 0)
12338 return r;
12339 return _setxattr(in, name, value, size, flags, perms);
12340}
12341
12342int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
12343 int flags, const UserPerm& perms)
12344{
f67539c2
TL
12345 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12346 if (!mref_reader.is_state_satisfied())
12347 return -CEPHFS_ENOTCONN;
7c673cae 12348
f67539c2 12349 _setxattr_maybe_wait_for_osdmap(name, value, size);
181888fb 12350
f67539c2 12351 std::scoped_lock lock(client_lock);
181888fb 12352
7c673cae
FG
12353 Fh *f = get_filehandle(fd);
12354 if (!f)
f67539c2 12355 return -CEPHFS_EBADF;
7c673cae
FG
12356 return _setxattr(f->inode, name, value, size, flags, perms);
12357}
12358
// Core getxattr implementation (no permission checks, caller holds
// client_lock). Resolution order: virtual "ceph.*" xattrs from the
// vxattr tables, then a generic "ceph.*" MDS-side lookup, then the
// inode's real xattr map. Returns the value length (or the would-be
// length when size==0), or a negative CEPHFS error.
int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
		      const UserPerm& perms)
{
  int r;
  const VXattr *vxattr = nullptr;

  vxattr = _match_vxattr(in, name);
  if (vxattr) {
    r = -CEPHFS_ENODATA;

    // Do a force getattr to get the latest quota before returning
    // a value to userspace.
    int flags = 0;
    if (vxattr->flags & VXATTR_RSTAT) {
      flags |= CEPH_STAT_RSTAT;
    }
    if (vxattr->flags & VXATTR_DIRSTAT) {
      flags |= CEPH_CAP_FILE_SHARED;
    }
    r = _getattr(in, flags | CEPH_STAT_CAP_XATTR, perms, true);
    if (r != 0) {
      // Error from getattr!
      return r;
    }

    // call pointer-to-member function
    // Format the vxattr into a local buffer first; a vxattr whose
    // exists_cb says "absent" reports ENODATA instead.
    char buf[256];
    if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
      r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
    } else {
      r = -CEPHFS_ENODATA;
    }

    // size == 0 is the "probe for length" convention; only copy out
    // when the caller supplied a large-enough buffer.
    if (size != 0) {
      if (r > (int)size) {
	r = -CEPHFS_ERANGE;
      } else if (r > 0) {
	memcpy(value, buf, r);
      }
    }
    goto out;
  }

  // Unmatched "ceph." names are forwarded to the MDS-side vxattr
  // handler rather than treated as regular xattrs.
  if (!strncmp(name, "ceph.", 5)) {
    r = _getvxattr(in, perms, name, size, value, MDS_RANK_NONE);
    goto out;
  }

  // "system.*" (POSIX ACL) names are unsupported when ACLs are off.
  if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
    r = -CEPHFS_EOPNOTSUPP;
    goto out;
  }

  // Refresh the xattr map from the MDS only if we have never seen it.
  r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r == 0) {
    string n(name);
    r = -CEPHFS_ENODATA;
    if (in->xattrs.count(n)) {
      r = in->xattrs[n].length();
      if (r > 0 && size != 0) {
	if (size >= (unsigned)r)
	  memcpy(value, in->xattrs[n].c_str(), r);
	else
	  r = -CEPHFS_ERANGE;
      }
    }
  }
 out:
  ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
  return r;
}
12430
12431int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
12432 const UserPerm& perms)
12433{
12434 if (cct->_conf->client_permissions) {
12435 int r = xattr_permission(in.get(), name, MAY_READ, perms);
12436 if (r < 0)
12437 return r;
12438 }
12439 return _getxattr(in.get(), name, value, size, perms);
12440}
12441
12442int Client::ll_getxattr(Inode *in, const char *name, void *value,
12443 size_t size, const UserPerm& perms)
12444{
f67539c2
TL
12445 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12446 if (!mref_reader.is_state_satisfied())
12447 return -CEPHFS_ENOTCONN;
181888fb 12448
7c673cae
FG
12449 vinodeno_t vino = _get_vino(in);
12450
11fdf7f2
TL
12451 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
12452 tout(cct) << __func__ << std::endl;
7c673cae
FG
12453 tout(cct) << vino.ino.val << std::endl;
12454 tout(cct) << name << std::endl;
12455
f67539c2 12456 std::scoped_lock lock(client_lock);
11fdf7f2 12457 if (!fuse_default_permissions) {
7c673cae
FG
12458 int r = xattr_permission(in, name, MAY_READ, perms);
12459 if (r < 0)
12460 return r;
12461 }
12462
12463 return _getxattr(in, name, value, size, perms);
12464}
12465
12466int Client::_listxattr(Inode *in, char *name, size_t size,
12467 const UserPerm& perms)
12468{
81eedcae 12469 bool len_only = (size == 0);
7c673cae 12470 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
81eedcae
TL
12471 if (r != 0) {
12472 goto out;
12473 }
7c673cae 12474
81eedcae 12475 r = 0;
f67539c2
TL
12476 for ([[maybe_unused]] const auto &[xattr_name, xattr_value_bl] : in->xattrs) {
12477 if (xattr_name.rfind("ceph.", 0) == 0) {
12478 continue;
12479 }
12480
12481 size_t this_len = xattr_name.length() + 1;
81eedcae
TL
12482 r += this_len;
12483 if (len_only)
12484 continue;
7c673cae 12485
81eedcae 12486 if (this_len > size) {
f67539c2 12487 r = -CEPHFS_ERANGE;
81eedcae
TL
12488 goto out;
12489 }
12490
f67539c2 12491 memcpy(name, xattr_name.c_str(), this_len);
81eedcae
TL
12492 name += this_len;
12493 size -= this_len;
12494 }
81eedcae 12495out:
11fdf7f2 12496 ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
7c673cae
FG
12497 return r;
12498}
12499
12500int Client::ll_listxattr(Inode *in, char *names, size_t size,
12501 const UserPerm& perms)
12502{
f67539c2
TL
12503 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12504 if (!mref_reader.is_state_satisfied())
12505 return -CEPHFS_ENOTCONN;
181888fb 12506
7c673cae
FG
12507 vinodeno_t vino = _get_vino(in);
12508
11fdf7f2
TL
12509 ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
12510 tout(cct) << __func__ << std::endl;
7c673cae
FG
12511 tout(cct) << vino.ino.val << std::endl;
12512 tout(cct) << size << std::endl;
12513
f67539c2 12514 std::scoped_lock lock(client_lock);
7c673cae
FG
12515 return _listxattr(in, names, size, perms);
12516}
12517
// Build and send the CEPH_MDS_OP_SETXATTR request. value == NULL is
// the internal "remove" convention (see _setxattr's ACL handling);
// XATTR_CREATE / XATTR_REPLACE map onto the wire flags. Returns the
// MDS reply code.
int Client::_do_setxattr(Inode *in, const char *name, const void *value,
			 size_t size, int flags, const UserPerm& perms)
{

  int xattr_flags = 0;
  if (!value)
    xattr_flags |= CEPH_XATTR_REMOVE;
  if (flags & XATTR_CREATE)
    xattr_flags |= CEPH_XATTR_CREATE;
  if (flags & XATTR_REPLACE)
    xattr_flags |= CEPH_XATTR_REPLACE;

  // Ownership of req passes to make_request(), which also releases it.
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_string2(name);
  req->set_inode(in);
  req->head.args.setxattr.flags = xattr_flags;

  // The attribute value travels in the request's data payload.
  bufferlist bl;
  ceph_assert(value || size == 0);
  bl.append((const char*)value, size);
  req->set_data(bl);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
    res << dendl;
  return res;
}
12550
// Validate and dispatch a setxattr. Handles POSIX ACL xattrs
// specially (mode equivalence, default-ACL checks), rejects
// unsupported namespaces and read-only vxattrs, and verifies that a
// quota vxattr actually produced a snaprealm. Returns 0 or a
// negative CEPHFS error.
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  // Snapshots are read-only.
  if (in->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }

  // Normalize: zero-length set uses an empty value; NULL with a
  // nonzero size is a caller bug.
  if (size == 0) {
    value = "";
  } else if (value == NULL) {
      return -CEPHFS_EINVAL;
  }

  // "system.*" names are POSIX ACLs only when ACL support is enabled.
  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  // Same namespace whitelist the kernel client supports.
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -CEPHFS_EOPNOTSUPP;

  bool check_realm = false;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      mode_t new_mode = in->mode;
      if (value) {
	// An ACL equivalent to the mode bits is dropped (value=NULL ->
	// CEPH_XATTR_REMOVE in _do_setxattr) and applied as a chmod.
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      if (value) {
	// Default ACLs only make sense on directories.
	if (!S_ISDIR(in->mode))
	  return -CEPHFS_EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -CEPHFS_EINVAL;
	// An empty/trivial default ACL is stored as a removal.
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -CEPHFS_EOPNOTSUPP;
    }
  } else {
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr) {
      if (vxattr->readonly)
	return -CEPHFS_EOPNOTSUPP;
      // Setting a quota should make the MDS attach a snaprealm to
      // this inode; verified after the request below.
      if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
	check_realm = true;
    }
  }

  int ret = _do_setxattr(in, name, value, size, flags, perms);
  if (ret >= 0 && check_realm) {
    // check if snaprealm was created for quota inode
    if (in->quota.is_enable() &&
	!(in->snaprealm && in->snaprealm->ino == in->ino))
      ret = -CEPHFS_EOPNOTSUPP;
  }

  return ret;
}
12631
12632int Client::_setxattr(InodeRef &in, const char *name, const void *value,
12633 size_t size, int flags, const UserPerm& perms)
12634{
12635 if (cct->_conf->client_permissions) {
12636 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12637 if (r < 0)
12638 return r;
12639 }
12640 return _setxattr(in.get(), name, value, size, flags, perms);
12641}
12642
// For a "layout" or "layout.pool" xattr value, extract the requested
// pool (by id or name) and verify it exists in the given osdmap.
// Returns 0 if no pool was named or it exists, -CEPHFS_EINVAL on a
// malformed layout string, -CEPHFS_ENOENT for an unknown pool.
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    // Full layout string: parse "key=value ..." pairs and pick "pool".
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p;    // create instance of parser
    std::map<string, string> m;             // map to receive results
    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
      return -CEPHFS_EINVAL;
    }
    if (begin != end)
      return -CEPHFS_EINVAL;
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    // The pool may be given numerically; fall back to a name lookup
    // when it does not parse as an unsigned integer.
    int64_t pool;
    try {
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -CEPHFS_ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -CEPHFS_ENOENT;
      }
    }
  }

  return 0;
}
12682
// If the xattr being set names a data pool that our cached osdmap
// does not know, block until the latest osdmap arrives so the MDS
// request carries a recent epoch. Must be called WITHOUT client_lock
// held, since the wait can block.
void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
{
  // For setting pool of layout, MetaRequest need osdmap epoch.
  // There is a race which create a new data pool but client and mds both don't have.
  // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
  ldout(cct, 15) << __func__ << ": name = " << name << dendl;
  if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
      strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
    // "layout" / "layout.pool" suffix of the vxattr name.
    string rest(strstr(name, "layout"));
    string v((const char*)value, size);
    int r = objecter->with_osdmap([&](const OSDMap& o) {
      return _setxattr_check_data_pool(rest, v, &o);
    });

    // ENOENT may just mean our osdmap is stale; fetch the latest and
    // let the MDS make the final call.
    if (r == -CEPHFS_ENOENT) {
      bs::error_code ec;
      ldout(cct, 20) << __func__ << ": waiting for latest osdmap" << dendl;
      objecter->wait_for_latest_osdmap(ca::use_blocked[ec]);
      ldout(cct, 20) << __func__ << ": got latest osdmap: " << ec << dendl;
    }
  }
}
12705
12706int Client::ll_setxattr(Inode *in, const char *name, const void *value,
12707 size_t size, int flags, const UserPerm& perms)
12708{
f67539c2
TL
12709 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12710 if (!mref_reader.is_state_satisfied())
12711 return -CEPHFS_ENOTCONN;
7c673cae 12712
f67539c2 12713 _setxattr_maybe_wait_for_osdmap(name, value, size);
181888fb 12714
7c673cae
FG
12715 vinodeno_t vino = _get_vino(in);
12716
11fdf7f2
TL
12717 ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
12718 tout(cct) << __func__ << std::endl;
7c673cae
FG
12719 tout(cct) << vino.ino.val << std::endl;
12720 tout(cct) << name << std::endl;
12721
f67539c2 12722 std::scoped_lock lock(client_lock);
11fdf7f2 12723 if (!fuse_default_permissions) {
7c673cae
FG
12724 int r = xattr_permission(in, name, MAY_WRITE, perms);
12725 if (r < 0)
12726 return r;
12727 }
12728 return _setxattr(in, name, value, size, flags, perms);
12729}
12730
// Remove an extended attribute via CEPH_MDS_OP_RMXATTR. Rejects
// snapshots, names outside the supported namespaces, and read-only
// virtual xattrs. Returns the MDS reply code.
int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  // Snapshots are read-only.
  if (in->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }

  // same xattrs supported by kernel client
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "system.", 7) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5))
    return -CEPHFS_EOPNOTSUPP;

  // Read-only virtual attributes cannot be removed.
  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr && vxattr->readonly)
    return -CEPHFS_EOPNOTSUPP;

  // Ownership of req passes to make_request(), which releases it.
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_filepath2(name);   // attribute name travels as the second path
  req->set_inode(in);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
  return res;
}
12762
12763int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
12764{
12765 if (cct->_conf->client_permissions) {
12766 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12767 if (r < 0)
12768 return r;
12769 }
12770 return _removexattr(in.get(), name, perms);
12771}
12772
12773int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
12774{
f67539c2
TL
12775 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12776 if (!mref_reader.is_state_satisfied())
12777 return -CEPHFS_ENOTCONN;
181888fb 12778
7c673cae
FG
12779 vinodeno_t vino = _get_vino(in);
12780
12781 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
12782 tout(cct) << "ll_removexattr" << std::endl;
12783 tout(cct) << vino.ino.val << std::endl;
12784 tout(cct) << name << std::endl;
12785
f67539c2 12786 std::scoped_lock lock(client_lock);
11fdf7f2 12787 if (!fuse_default_permissions) {
7c673cae
FG
12788 int r = xattr_permission(in, name, MAY_WRITE, perms);
12789 if (r < 0)
12790 return r;
12791 }
12792
12793 return _removexattr(in, name, perms);
12794}
12795
// "ceph.quota" is reported only when a quota is set AND (for live
// inodes) the MDS has attached the matching snaprealm; snapshot
// inodes skip the realm check.
bool Client::_vxattrcb_quota_exists(Inode *in)
{
  return in->quota.is_enable() &&
	 (in->snapid != CEPH_NOSNAP ||
	  (in->snaprealm && in->snaprealm->ino == in->ino));
}
// Full "ceph.quota" value: both limits in one string.
size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
{
  return snprintf(val, size,
                  "max_bytes=%lld max_files=%lld",
                  (long long int)in->quota.max_bytes,
                  (long long int)in->quota.max_files);
}
// "ceph.quota.max_bytes"
size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
}
// "ceph.quota.max_files"
size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
}
12817
// Layout vxattrs exist only when the inode's layout differs from a
// default-constructed file_layout_t.
bool Client::_vxattrcb_layout_exists(Inode *in)
{
  return in->layout != file_layout_t();
}
12822size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
12823{
12824 int r = snprintf(val, size,
11fdf7f2 12825 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
7c673cae
FG
12826 (unsigned long long)in->layout.stripe_unit,
12827 (unsigned long long)in->layout.stripe_count,
12828 (unsigned long long)in->layout.object_size);
12829 objecter->with_osdmap([&](const OSDMap& o) {
12830 if (o.have_pg_pool(in->layout.pool_id))
12831 r += snprintf(val + r, size - r, "%s",
12832 o.get_pool_name(in->layout.pool_id).c_str());
12833 else
12834 r += snprintf(val + r, size - r, "%" PRIu64,
12835 (uint64_t)in->layout.pool_id);
12836 });
12837 if (in->layout.pool_ns.length())
12838 r += snprintf(val + r, size - r, " pool_namespace=%s",
12839 in->layout.pool_ns.c_str());
12840 return r;
12841}
// Individual "ceph.*.layout.*" field formatters. Each returns the
// snprintf would-be length so _getxattr can detect ERANGE.
size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
}
size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
}
size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
}
// Pool is rendered by name when the cached osdmap knows it,
// otherwise by numeric id.
size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
{
  size_t r;
  objecter->with_osdmap([&](const OSDMap& o) {
    if (o.have_pg_pool(in->layout.pool_id))
      r = snprintf(val, size, "%s", o.get_pool_name(
	    in->layout.pool_id).c_str());
    else
      r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
  });
  return r;
}
size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
}
// "ceph.dir.*" statistics formatters: immediate-children counts come
// from dirstat, recursive counts from rstat.
size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
}
size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
}
size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
}
size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
}
size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
}
size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
}
size_t Client::_vxattrcb_dir_rsnaps(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsnaps);
}
size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
}
// Recursive ctime as "sec.nanosec" with zero-padded nanoseconds.
size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
                  (long)in->rstat.rctime.nsec());
}
// dir_pin uses -CEPHFS_ENODATA as the "not pinned" sentinel.
bool Client::_vxattrcb_dir_pin_exists(Inode *in)
{
  return in->dir_pin != -CEPHFS_ENODATA;
}
size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%ld", (long)in->dir_pin);
}
7c673cae 12915
81eedcae
TL
// "ceph.snap.btime" exists only for inodes with a snapshot birth time.
bool Client::_vxattrcb_snap_btime_exists(Inode *in)
{
  return !in->snap_btime.is_zero();
}

size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu.%09lu",
      (long long unsigned)in->snap_btime.sec(),
      (long unsigned)in->snap_btime.nsec());
}

// "ceph.caps": the caps currently issued to this client, as both the
// symbolic string and the raw hex mask.
size_t Client::_vxattrcb_caps(Inode *in, char *val, size_t size)
{
  int issued;

  in->caps_issued(&issued);
  return snprintf(val, size, "%s/0x%x", ccap_string(issued).c_str(), issued);
}

bool Client::_vxattrcb_mirror_info_exists(Inode *in)
{
  // checking one of the xattrs would suffice
  return in->xattrs.count("ceph.mirror.info.cluster_id") != 0;
}

// NOTE(review): xattrs["..."] is the mutating operator[] — if
// "ceph.mirror.info.fs_id" were absent this would insert an empty
// entry; presumably both keys are always set together. Confirm.
size_t Client::_vxattrcb_mirror_info(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "cluster_id=%.*s fs_id=%.*s",
                  in->xattrs["ceph.mirror.info.cluster_id"].length(),
                  in->xattrs["ceph.mirror.info.cluster_id"].c_str(),
                  in->xattrs["ceph.mirror.info.fs_id"].length(),
                  in->xattrs["ceph.mirror.info.fs_id"].c_str());
}

// "ceph.cluster_fsid": the fsid of the cluster we are connected to.
size_t Client::_vxattrcb_cluster_fsid(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", monclient->get_fsid().to_string().c_str());
}

// "ceph.client_id": this client's entity name, e.g. "client.1234".
size_t Client::_vxattrcb_client_id(Inode *in, char *val, size_t size)
{
  auto name = messenger->get_myname();
  return snprintf(val, size, "%s%" PRId64, name.type_str(), name.num());
}
12961
7c673cae
FG
// Helpers to build VXattr table entries. The GNU designated-
// initializer syntax (name:, getxattr_cb:, ...) matches the VXattr
// field order declared in Client.h.
#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2

// Read-only stat-style vxattr; _flags selects which getattr refresh
// (VXATTR_RSTAT / VXATTR_DIRSTAT) _getxattr performs first.
#define XATTR_NAME_CEPH(_type, _name, _flags)                 \
{                                                              \
  name: CEPH_XATTR_NAME(_type, _name),                         \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,     \
  readonly: true,                                              \
  exists_cb: NULL,                                             \
  flags: _flags,                                               \
}
// Writable layout field; hidden unless the inode has a non-default
// layout (_vxattrcb_layout_exists).
#define XATTR_LAYOUT_FIELD(_type, _name, _field)               \
{                                                              \
  name: CEPH_XATTR_NAME2(_type, _name, _field),                \
  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,    \
  readonly: false,                                             \
  exists_cb: &Client::_vxattrcb_layout_exists,                 \
  flags: 0,                                                    \
}
// Writable quota field; hidden unless a quota is set
// (_vxattrcb_quota_exists).
#define XATTR_QUOTA_FIELD(_type, _name)                        \
{                                                              \
  name: CEPH_XATTR_NAME(_type, _name),                         \
  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,     \
  readonly: false,                                             \
  exists_cb: &Client::_vxattrcb_quota_exists,                  \
  flags: 0,                                                    \
}
12989
// Virtual xattrs visible on directories. Scanned linearly by
// _match_vxattr; the empty-name entry terminates the table.
const Client::VXattr Client::_dir_vxattrs[] = {
  {
    name: "ceph.dir.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  // FIXME
  // Delete the following dir layout field definitions for release "S"
  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
  XATTR_LAYOUT_FIELD(dir, layout, object_size),
  XATTR_LAYOUT_FIELD(dir, layout, pool),
  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
  // Directory statistics (immediate and recursive).
  XATTR_NAME_CEPH(dir, entries, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, files, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, subdirs, VXATTR_DIRSTAT),
  XATTR_NAME_CEPH(dir, rentries, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rfiles, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rsubdirs, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rsnaps, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT),
  XATTR_NAME_CEPH(dir, rctime, VXATTR_RSTAT),
  {
    name: "ceph.quota",
    getxattr_cb: &Client::_vxattrcb_quota,
    readonly: false,
    exists_cb: &Client::_vxattrcb_quota_exists,
    flags: 0,
  },
  XATTR_QUOTA_FIELD(quota, max_bytes),
  XATTR_QUOTA_FIELD(quota, max_files),
  // FIXME
  // Delete the following dir pin field definitions for release "S"
  {
    name: "ceph.dir.pin",
    getxattr_cb: &Client::_vxattrcb_dir_pin,
    readonly: false,
    exists_cb: &Client::_vxattrcb_dir_pin_exists,
    flags: 0,
  },
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  {
    name: "ceph.mirror.info",
    getxattr_cb: &Client::_vxattrcb_mirror_info,
    readonly: false,
    exists_cb: &Client::_vxattrcb_mirror_info_exists,
    flags: 0,
  },
  {
    name: "ceph.caps",
    getxattr_cb: &Client::_vxattrcb_caps,
    readonly: true,
    exists_cb: NULL,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
13055
// Virtual xattrs visible on regular files; same structure and
// terminator convention as _dir_vxattrs.
const Client::VXattr Client::_file_vxattrs[] = {
  {
    name: "ceph.file.layout",
    getxattr_cb: &Client::_vxattrcb_layout,
    readonly: false,
    exists_cb: &Client::_vxattrcb_layout_exists,
    flags: 0,
  },
  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
  XATTR_LAYOUT_FIELD(file, layout, object_size),
  XATTR_LAYOUT_FIELD(file, layout, pool),
  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
  {
    name: "ceph.snap.btime",
    getxattr_cb: &Client::_vxattrcb_snap_btime,
    readonly: true,
    exists_cb: &Client::_vxattrcb_snap_btime_exists,
    flags: 0,
  },
  {
    name: "ceph.caps",
    getxattr_cb: &Client::_vxattrcb_caps,
    readonly: true,
    exists_cb: NULL,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
13085
adb31ebb
TL
// Virtual xattrs available on every inode type; consulted by
// _match_vxattr after the per-type table.
const Client::VXattr Client::_common_vxattrs[] = {
  {
    name: "ceph.cluster_fsid",
    getxattr_cb: &Client::_vxattrcb_cluster_fsid,
    readonly: true,
    exists_cb: nullptr,
    flags: 0,
  },
  {
    name: "ceph.client_id",
    getxattr_cb: &Client::_vxattrcb_client_id,
    readonly: true,
    exists_cb: nullptr,
    flags: 0,
  },
  { name: "" } /* Required table terminator */
};
13103
7c673cae
FG
13104const Client::VXattr *Client::_get_vxattrs(Inode *in)
13105{
13106 if (in->is_dir())
13107 return _dir_vxattrs;
13108 else if (in->is_file())
13109 return _file_vxattrs;
13110 return NULL;
13111}
13112
13113const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
13114{
13115 if (strncmp(name, "ceph.", 5) == 0) {
13116 const VXattr *vxattr = _get_vxattrs(in);
13117 if (vxattr) {
13118 while (!vxattr->name.empty()) {
13119 if (vxattr->name == name)
13120 return vxattr;
13121 vxattr++;
13122 }
13123 }
adb31ebb
TL
13124
13125 // for common vxattrs
13126 vxattr = _common_vxattrs;
13127 while (!vxattr->name.empty()) {
13128 if (vxattr->name == name)
13129 return vxattr;
13130 vxattr++;
13131 }
7c673cae 13132 }
adb31ebb 13133
7c673cae
FG
13134 return NULL;
13135}
13136
7c673cae
FG
// Low-level readlink: copies the symlink target into buf (up to
// buflen bytes) and returns the byte count or a negative error.
// Touches the inode's dentries to keep them warm in the LRU.
int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << "ll_readlink " << vino << dendl;
  tout(cct) << "ll_readlink" << std::endl;
  tout(cct) << vino.ino.val << std::endl;

  std::scoped_lock lock(client_lock);
  // Refresh LRU position of every dentry pointing at this symlink.
  for (auto dn : in->dentries) {
    touch_dn(dn);
  }

  int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
  ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
  return r;
}
13158
// Create a filesystem node (device, fifo, socket, ...) via
// CEPH_MDS_OP_MKNOD. On success *inp holds the new inode. Returns 0
// or a negative CEPHFS error. Caller holds client_lock.
int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
		   const UserPerm& perms, InodeRef *inp)
{
  ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
		<< mode << dec << ", " << rdev << ", uid " << perms.uid()
		<< ", gid " << perms.gid() << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  // Snapshot directories are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -CEPHFS_EDQUOT;
  }

  // Ownership of req passes to make_request() on success; the `fail`
  // path must drop it explicitly with put_request().
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  req->head.args.mknod.rdev = rdev;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Inherit default ACLs from the parent; this may also adjust mode.
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.mknod.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp);

  trim_cache();

  ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
13212
// Low-level mknod: creates the node, fills *attr with its stat, takes
// an ll reference on the new inode, and returns it via *out.
int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
		     dev_t rdev, struct stat *attr, Inode **out,
		     const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknod" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  std::scoped_lock lock(client_lock);
  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    // Pin the inode for the ll interface before handing it out.
    fill_stat(in, attr);
    _ll_get(in.get());
  }
  // NOTE(review): on failure attr is not filled in, yet attr->st_ino
  // is still traced below — presumably callers pass a zeroed stat;
  // confirm.
  tout(cct) << attr->st_ino << std::endl;
  ldout(cct, 3) << "ll_mknod " << vparent << " " << name
	    << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
13249
// statx-flavored variant of ll_mknod: fills a ceph_statx selected by
// want/flags instead of a struct stat.
int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
		      dev_t rdev, Inode **out,
		      struct ceph_statx *stx, unsigned want, unsigned flags,
		      const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  // Translate the caller's statx want/flags into a caps mask.
  unsigned caps = statx_to_mask(flags, want);

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
  tout(cct) << "ll_mknodx" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << rdev << std::endl;

  std::scoped_lock lock(client_lock);

  if (!fuse_default_permissions) {
    int r = may_create(parent, perms);
    if (r < 0)
      return r;
  }

  InodeRef in;
  int r = _mknod(parent, name, mode, rdev, perms, &in);
  if (r == 0) {
    // Pin the inode for the ll interface before handing it out.
    fill_statx(in, caps, stx);
    _ll_get(in.get());
  }
  // NOTE(review): stx is only filled on success, but stx_ino is
  // traced unconditionally — presumably callers pass a zeroed statx;
  // confirm.
  tout(cct) << stx->stx_ino << std::endl;
  ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
	    << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  *out = in.get();
  return r;
}
13290
// Create (and optionally open) a regular file via CEPH_MDS_OP_CREATE.
// Layout parameters (stripe_unit/count, object_size, data_pool) are
// forwarded to the MDS; *created reports whether the file was newly
// made; if fhp is non-NULL the file is also opened and *fhp receives
// the handle. Caller holds client_lock.
int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
		    InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
		    int object_size, const char *data_pool, bool *created,
		    const UserPerm& perms, std::string alternate_name)
{
  ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
    mode << dec << ")" << dendl;

  if (strlen(name) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;
  // Snapshot directories are read-only.
  if (dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perms)) {
    return -CEPHFS_EDQUOT;
  }

  // use normalized flags to generate cmode
  int cflags = ceph_flags_sys2wire(flags);
  if (cct->_conf.get_val<bool>("client_force_lazyio"))
    cflags |= CEPH_O_LAZY;

  int cmode = ceph_flags_to_mode(cflags);

  // Resolve the requested data pool name against the cached osdmap;
  // the wire field is 32-bit, hence the range check.
  int64_t pool_id = -1;
  if (data_pool && *data_pool) {
    pool_id = objecter->with_osdmap(
      std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
    if (pool_id < 0)
      return -CEPHFS_EINVAL;
    if (pool_id > 0xffffffffll)
      return -CEPHFS_ERANGE;  // bummer!
  }

  // Ownership of req passes to make_request() once issued; the `fail`
  // path must drop it explicitly with put_request().
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);

  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_alternate_name(std::move(alternate_name));
  req->set_inode(dir);
  req->head.args.open.flags = cflags | CEPH_O_CREAT;

  req->head.args.open.stripe_unit = stripe_unit;
  req->head.args.open.stripe_count = stripe_count;
  req->head.args.open.object_size = object_size;
  if (cct->_conf->client_debug_getattr_caps)
    req->head.args.open.mask = DEBUG_GETATTR_CAPS;
  else
    req->head.args.open.mask = 0;
  req->head.args.open.pool = pool_id;
  req->dentry_drop = CEPH_CAP_FILE_SHARED;
  req->dentry_unless = CEPH_CAP_FILE_EXCL;

  // Inherit default ACLs from the parent; this may also adjust mode.
  mode |= S_IFREG;
  bufferlist xattrs_bl;
  int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
  if (res < 0)
    goto fail;
  req->head.args.open.mode = mode;
  if (xattrs_bl.length() > 0)
    req->set_data(xattrs_bl);

  Dentry *de;
  res = get_or_create(dir, name, &de);
  if (res < 0)
    goto fail;
  req->set_dentry(de);

  res = make_request(req, perms, inp, created);
  if (res < 0) {
    goto reply_error;
  }

  /* If the caller passed a value in fhp, do the open */
  if(fhp) {
    (*inp)->get_open_ref(cmode);
    *fhp = _create_fh(inp->get(), flags, cmode, perms);
  }

 reply_error:
  trim_cache();

  ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
		<< " layout " << stripe_unit
		<< ' ' << stripe_count
		<< ' ' << object_size
		<<") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
13386
7c673cae 13387int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
f67539c2
TL
13388 InodeRef *inp, const std::map<std::string, std::string> &metadata,
13389 std::string alternate_name)
7c673cae 13390{
1adf2230 13391 ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
7c673cae
FG
13392 << mode << dec << ", uid " << perm.uid()
13393 << ", gid " << perm.gid() << ")" << dendl;
13394
13395 if (strlen(name) > NAME_MAX)
f67539c2 13396 return -CEPHFS_ENAMETOOLONG;
7c673cae
FG
13397
13398 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
f67539c2 13399 return -CEPHFS_EROFS;
7c673cae
FG
13400 }
13401 if (is_quota_files_exceeded(dir, perm)) {
f67539c2 13402 return -CEPHFS_EDQUOT;
7c673cae 13403 }
f67539c2
TL
13404
13405 bool is_snap_op = dir->snapid == CEPH_SNAPDIR;
13406 MetaRequest *req = new MetaRequest(is_snap_op ?
7c673cae
FG
13407 CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);
13408
13409 filepath path;
13410 dir->make_nosnap_relative_path(path);
13411 path.push_dentry(name);
13412 req->set_filepath(path);
13413 req->set_inode(dir);
13414 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13415 req->dentry_unless = CEPH_CAP_FILE_EXCL;
f67539c2 13416 req->set_alternate_name(std::move(alternate_name));
7c673cae
FG
13417
13418 mode |= S_IFDIR;
f67539c2
TL
13419 bufferlist bl;
13420 int res = _posix_acl_create(dir, &mode, bl, perm);
7c673cae
FG
13421 if (res < 0)
13422 goto fail;
13423 req->head.args.mkdir.mode = mode;
f67539c2
TL
13424 if (is_snap_op) {
13425 SnapPayload payload;
13426 // clear the bufferlist that may have been populated by the call
13427 // to _posix_acl_create(). MDS mksnap does not make use of it.
13428 // So, reuse it to pass metadata payload.
13429 bl.clear();
13430 payload.metadata = metadata;
13431 encode(payload, bl);
13432 }
13433 if (bl.length() > 0) {
13434 req->set_data(bl);
13435 }
7c673cae
FG
13436
13437 Dentry *de;
13438 res = get_or_create(dir, name, &de);
13439 if (res < 0)
13440 goto fail;
13441 req->set_dentry(de);
13442
13443 ldout(cct, 10) << "_mkdir: making request" << dendl;
13444 res = make_request(req, perm, inp);
13445 ldout(cct, 10) << "_mkdir result is " << res << dendl;
13446
13447 trim_cache();
13448
1adf2230 13449 ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
7c673cae
FG
13450 return res;
13451
13452 fail:
13453 put_request(req);
13454 return res;
13455}
13456
13457int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
13458 struct stat *attr, Inode **out, const UserPerm& perm)
13459{
f67539c2
TL
13460 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13461 if (!mref_reader.is_state_satisfied())
13462 return -CEPHFS_ENOTCONN;
181888fb 13463
7c673cae
FG
13464 vinodeno_t vparent = _get_vino(parent);
13465
13466 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
13467 tout(cct) << "ll_mkdir" << std::endl;
13468 tout(cct) << vparent.ino.val << std::endl;
13469 tout(cct) << name << std::endl;
13470 tout(cct) << mode << std::endl;
13471
f67539c2
TL
13472 std::scoped_lock lock(client_lock);
13473
11fdf7f2 13474 if (!fuse_default_permissions) {
7c673cae
FG
13475 int r = may_create(parent, perm);
13476 if (r < 0)
13477 return r;
13478 }
13479
13480 InodeRef in;
13481 int r = _mkdir(parent, name, mode, perm, &in);
13482 if (r == 0) {
13483 fill_stat(in, attr);
13484 _ll_get(in.get());
13485 }
13486 tout(cct) << attr->st_ino << std::endl;
13487 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
13488 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13489 *out = in.get();
13490 return r;
13491}
13492
13493int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
13494 struct ceph_statx *stx, unsigned want, unsigned flags,
13495 const UserPerm& perms)
13496{
f67539c2
TL
13497 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13498 if (!mref_reader.is_state_satisfied())
13499 return -CEPHFS_ENOTCONN;
181888fb 13500
7c673cae
FG
13501 vinodeno_t vparent = _get_vino(parent);
13502
13503 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
13504 tout(cct) << "ll_mkdirx" << std::endl;
13505 tout(cct) << vparent.ino.val << std::endl;
13506 tout(cct) << name << std::endl;
13507 tout(cct) << mode << std::endl;
13508
f67539c2
TL
13509 std::scoped_lock lock(client_lock);
13510
11fdf7f2 13511 if (!fuse_default_permissions) {
7c673cae
FG
13512 int r = may_create(parent, perms);
13513 if (r < 0)
13514 return r;
13515 }
13516
13517 InodeRef in;
13518 int r = _mkdir(parent, name, mode, perms, &in);
13519 if (r == 0) {
13520 fill_statx(in, statx_to_mask(flags, want), stx);
13521 _ll_get(in.get());
13522 } else {
13523 stx->stx_ino = 0;
13524 stx->stx_mask = 0;
13525 }
13526 tout(cct) << stx->stx_ino << std::endl;
13527 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
13528 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13529 *out = in.get();
13530 return r;
13531}
13532
13533int Client::_symlink(Inode *dir, const char *name, const char *target,
f67539c2 13534 const UserPerm& perms, std::string alternate_name, InodeRef *inp)
7c673cae 13535{
1adf2230 13536 ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
7c673cae
FG
13537 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
13538 << dendl;
13539
13540 if (strlen(name) > NAME_MAX)
f67539c2 13541 return -CEPHFS_ENAMETOOLONG;
7c673cae
FG
13542
13543 if (dir->snapid != CEPH_NOSNAP) {
f67539c2 13544 return -CEPHFS_EROFS;
7c673cae
FG
13545 }
13546 if (is_quota_files_exceeded(dir, perms)) {
f67539c2 13547 return -CEPHFS_EDQUOT;
7c673cae
FG
13548 }
13549
13550 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
13551
13552 filepath path;
13553 dir->make_nosnap_relative_path(path);
13554 path.push_dentry(name);
13555 req->set_filepath(path);
f67539c2 13556 req->set_alternate_name(std::move(alternate_name));
7c673cae
FG
13557 req->set_inode(dir);
13558 req->set_string2(target);
13559 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13560 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13561
13562 Dentry *de;
13563 int res = get_or_create(dir, name, &de);
13564 if (res < 0)
13565 goto fail;
13566 req->set_dentry(de);
13567
13568 res = make_request(req, perms, inp);
13569
13570 trim_cache();
1adf2230 13571 ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
7c673cae
FG
13572 res << dendl;
13573 return res;
13574
13575 fail:
13576 put_request(req);
13577 return res;
13578}
13579
13580int Client::ll_symlink(Inode *parent, const char *name, const char *value,
13581 struct stat *attr, Inode **out, const UserPerm& perms)
13582{
f67539c2
TL
13583 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13584 if (!mref_reader.is_state_satisfied())
13585 return -CEPHFS_ENOTCONN;
181888fb 13586
7c673cae
FG
13587 vinodeno_t vparent = _get_vino(parent);
13588
13589 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
13590 << dendl;
13591 tout(cct) << "ll_symlink" << std::endl;
13592 tout(cct) << vparent.ino.val << std::endl;
13593 tout(cct) << name << std::endl;
13594 tout(cct) << value << std::endl;
13595
f67539c2
TL
13596 std::scoped_lock lock(client_lock);
13597
11fdf7f2 13598 if (!fuse_default_permissions) {
7c673cae
FG
13599 int r = may_create(parent, perms);
13600 if (r < 0)
13601 return r;
13602 }
13603
13604 InodeRef in;
f67539c2 13605 int r = _symlink(parent, name, value, perms, "", &in);
7c673cae
FG
13606 if (r == 0) {
13607 fill_stat(in, attr);
13608 _ll_get(in.get());
13609 }
13610 tout(cct) << attr->st_ino << std::endl;
13611 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
13612 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13613 *out = in.get();
13614 return r;
13615}
13616
13617int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
13618 Inode **out, struct ceph_statx *stx, unsigned want,
13619 unsigned flags, const UserPerm& perms)
13620{
f67539c2
TL
13621 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13622 if (!mref_reader.is_state_satisfied())
13623 return -CEPHFS_ENOTCONN;
181888fb 13624
7c673cae
FG
13625 vinodeno_t vparent = _get_vino(parent);
13626
13627 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
13628 << dendl;
13629 tout(cct) << "ll_symlinkx" << std::endl;
13630 tout(cct) << vparent.ino.val << std::endl;
13631 tout(cct) << name << std::endl;
13632 tout(cct) << value << std::endl;
13633
f67539c2
TL
13634 std::scoped_lock lock(client_lock);
13635
11fdf7f2 13636 if (!fuse_default_permissions) {
7c673cae
FG
13637 int r = may_create(parent, perms);
13638 if (r < 0)
13639 return r;
13640 }
13641
13642 InodeRef in;
f67539c2 13643 int r = _symlink(parent, name, value, perms, "", &in);
7c673cae
FG
13644 if (r == 0) {
13645 fill_statx(in, statx_to_mask(flags, want), stx);
13646 _ll_get(in.get());
13647 }
13648 tout(cct) << stx->stx_ino << std::endl;
13649 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
13650 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13651 *out = in.get();
13652 return r;
13653}
13654
13655int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
13656{
1adf2230 13657 ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
7c673cae
FG
13658 << " uid " << perm.uid() << " gid " << perm.gid()
13659 << ")" << dendl;
13660
13661 if (dir->snapid != CEPH_NOSNAP) {
f67539c2 13662 return -CEPHFS_EROFS;
7c673cae
FG
13663 }
13664
13665 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
13666
13667 filepath path;
13668 dir->make_nosnap_relative_path(path);
13669 path.push_dentry(name);
13670 req->set_filepath(path);
13671
13672 InodeRef otherin;
b32b8144 13673 Inode *in;
7c673cae 13674 Dentry *de;
b32b8144 13675
7c673cae
FG
13676 int res = get_or_create(dir, name, &de);
13677 if (res < 0)
13678 goto fail;
13679 req->set_dentry(de);
13680 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13681 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13682
13683 res = _lookup(dir, name, 0, &otherin, perm);
13684 if (res < 0)
13685 goto fail;
b32b8144
FG
13686
13687 in = otherin.get();
13688 req->set_other_inode(in);
13689 in->break_all_delegs();
7c673cae
FG
13690 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13691
13692 req->set_inode(dir);
13693
13694 res = make_request(req, perm);
13695
13696 trim_cache();
1adf2230 13697 ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
7c673cae
FG
13698 return res;
13699
13700 fail:
13701 put_request(req);
13702 return res;
13703}
13704
13705int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
13706{
f67539c2
TL
13707 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13708 if (!mref_reader.is_state_satisfied())
13709 return -CEPHFS_ENOTCONN;
181888fb 13710
7c673cae
FG
13711 vinodeno_t vino = _get_vino(in);
13712
13713 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
13714 tout(cct) << "ll_unlink" << std::endl;
13715 tout(cct) << vino.ino.val << std::endl;
13716 tout(cct) << name << std::endl;
13717
f67539c2
TL
13718 std::scoped_lock lock(client_lock);
13719
11fdf7f2 13720 if (!fuse_default_permissions) {
7c673cae
FG
13721 int r = may_delete(in, name, perm);
13722 if (r < 0)
13723 return r;
13724 }
13725 return _unlink(in, name, perm);
13726}
13727
13728int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
13729{
1adf2230 13730 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
7c673cae
FG
13731 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
13732
13733 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
f67539c2 13734 return -CEPHFS_EROFS;
7c673cae 13735 }
b32b8144
FG
13736
13737 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
13738 MetaRequest *req = new MetaRequest(op);
7c673cae
FG
13739 filepath path;
13740 dir->make_nosnap_relative_path(path);
13741 path.push_dentry(name);
13742 req->set_filepath(path);
11fdf7f2 13743 req->set_inode(dir);
7c673cae
FG
13744
13745 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13746 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13747 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
13748
13749 InodeRef in;
13750
13751 Dentry *de;
13752 int res = get_or_create(dir, name, &de);
13753 if (res < 0)
13754 goto fail;
b32b8144
FG
13755 if (op == CEPH_MDS_OP_RMDIR)
13756 req->set_dentry(de);
13757 else
13758 de->get();
13759
7c673cae
FG
13760 res = _lookup(dir, name, 0, &in, perms);
13761 if (res < 0)
13762 goto fail;
11fdf7f2
TL
13763
13764 if (op == CEPH_MDS_OP_RMSNAP) {
7c673cae 13765 unlink(de, true, true);
b32b8144 13766 de->put();
7c673cae 13767 }
11fdf7f2 13768 req->set_other_inode(in.get());
7c673cae
FG
13769
13770 res = make_request(req, perms);
13771
13772 trim_cache();
1adf2230 13773 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
7c673cae
FG
13774 return res;
13775
13776 fail:
13777 put_request(req);
13778 return res;
13779}
13780
13781int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
13782{
f67539c2
TL
13783 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13784 if (!mref_reader.is_state_satisfied())
13785 return -CEPHFS_ENOTCONN;
181888fb 13786
7c673cae
FG
13787 vinodeno_t vino = _get_vino(in);
13788
13789 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
13790 tout(cct) << "ll_rmdir" << std::endl;
13791 tout(cct) << vino.ino.val << std::endl;
13792 tout(cct) << name << std::endl;
13793
f67539c2
TL
13794 std::scoped_lock lock(client_lock);
13795
11fdf7f2 13796 if (!fuse_default_permissions) {
7c673cae
FG
13797 int r = may_delete(in, name, perms);
13798 if (r < 0)
13799 return r;
13800 }
13801
13802 return _rmdir(in, name, perms);
13803}
13804
f67539c2 13805int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm, std::string alternate_name)
7c673cae 13806{
1adf2230 13807 ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
7c673cae
FG
13808 << todir->ino << " " << toname
13809 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
13810 << dendl;
13811
13812 if (fromdir->snapid != todir->snapid)
f67539c2 13813 return -CEPHFS_EXDEV;
7c673cae
FG
13814
13815 int op = CEPH_MDS_OP_RENAME;
13816 if (fromdir->snapid != CEPH_NOSNAP) {
13817 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
13818 op = CEPH_MDS_OP_RENAMESNAP;
13819 else
f67539c2
TL
13820 return -CEPHFS_EROFS;
13821 }
13822 if (fromdir != todir) {
13823 Inode *fromdir_root =
13824 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
13825 Inode *todir_root =
13826 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
13827 if (fromdir_root != todir_root) {
13828 return -CEPHFS_EXDEV;
13829 }
7c673cae 13830 }
7c673cae
FG
13831
13832 InodeRef target;
13833 MetaRequest *req = new MetaRequest(op);
13834
13835 filepath from;
13836 fromdir->make_nosnap_relative_path(from);
13837 from.push_dentry(fromname);
13838 filepath to;
13839 todir->make_nosnap_relative_path(to);
13840 to.push_dentry(toname);
13841 req->set_filepath(to);
13842 req->set_filepath2(from);
f67539c2 13843 req->set_alternate_name(std::move(alternate_name));
7c673cae
FG
13844
13845 Dentry *oldde;
13846 int res = get_or_create(fromdir, fromname, &oldde);
13847 if (res < 0)
13848 goto fail;
13849 Dentry *de;
13850 res = get_or_create(todir, toname, &de);
13851 if (res < 0)
13852 goto fail;
13853
13854 if (op == CEPH_MDS_OP_RENAME) {
13855 req->set_old_dentry(oldde);
13856 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
13857 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
13858
13859 req->set_dentry(de);
13860 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13861 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13862
13863 InodeRef oldin, otherin;
f67539c2 13864 res = _lookup(fromdir, fromname, 0, &oldin, perm);
7c673cae
FG
13865 if (res < 0)
13866 goto fail;
b32b8144
FG
13867
13868 Inode *oldinode = oldin.get();
13869 oldinode->break_all_delegs();
13870 req->set_old_inode(oldinode);
7c673cae
FG
13871 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
13872
13873 res = _lookup(todir, toname, 0, &otherin, perm);
b32b8144
FG
13874 switch (res) {
13875 case 0:
13876 {
13877 Inode *in = otherin.get();
13878 req->set_other_inode(in);
13879 in->break_all_delegs();
13880 }
7c673cae 13881 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
b32b8144 13882 break;
f67539c2 13883 case -CEPHFS_ENOENT:
b32b8144
FG
13884 break;
13885 default:
13886 goto fail;
7c673cae
FG
13887 }
13888
13889 req->set_inode(todir);
13890 } else {
13891 // renamesnap reply contains no tracedn, so we need to invalidate
13892 // dentry manually
13893 unlink(oldde, true, true);
13894 unlink(de, true, true);
11fdf7f2
TL
13895
13896 req->set_inode(todir);
7c673cae
FG
13897 }
13898
13899 res = make_request(req, perm, &target);
13900 ldout(cct, 10) << "rename result is " << res << dendl;
13901
13902 // renamed item from our cache
13903
13904 trim_cache();
1adf2230 13905 ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
7c673cae
FG
13906 return res;
13907
13908 fail:
13909 put_request(req);
13910 return res;
13911}
13912
13913int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
13914 const char *newname, const UserPerm& perm)
13915{
f67539c2
TL
13916 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13917 if (!mref_reader.is_state_satisfied())
13918 return -CEPHFS_ENOTCONN;
181888fb 13919
7c673cae
FG
13920 vinodeno_t vparent = _get_vino(parent);
13921 vinodeno_t vnewparent = _get_vino(newparent);
13922
13923 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
13924 << vnewparent << " " << newname << dendl;
13925 tout(cct) << "ll_rename" << std::endl;
13926 tout(cct) << vparent.ino.val << std::endl;
13927 tout(cct) << name << std::endl;
13928 tout(cct) << vnewparent.ino.val << std::endl;
13929 tout(cct) << newname << std::endl;
13930
f67539c2
TL
13931 std::scoped_lock lock(client_lock);
13932
11fdf7f2 13933 if (!fuse_default_permissions) {
7c673cae
FG
13934 int r = may_delete(parent, name, perm);
13935 if (r < 0)
13936 return r;
13937 r = may_delete(newparent, newname, perm);
f67539c2 13938 if (r < 0 && r != -CEPHFS_ENOENT)
7c673cae
FG
13939 return r;
13940 }
13941
f67539c2 13942 return _rename(parent, name, newparent, newname, perm, "");
7c673cae
FG
13943}
13944
f67539c2 13945int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, std::string alternate_name, InodeRef *inp)
7c673cae 13946{
1adf2230 13947 ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
7c673cae
FG
13948 << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;
13949
13950 if (strlen(newname) > NAME_MAX)
f67539c2 13951 return -CEPHFS_ENAMETOOLONG;
7c673cae
FG
13952
13953 if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
f67539c2 13954 return -CEPHFS_EROFS;
7c673cae
FG
13955 }
13956 if (is_quota_files_exceeded(dir, perm)) {
f67539c2 13957 return -CEPHFS_EDQUOT;
7c673cae
FG
13958 }
13959
b32b8144 13960 in->break_all_delegs();
7c673cae
FG
13961 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);
13962
13963 filepath path(newname, dir->ino);
13964 req->set_filepath(path);
f67539c2 13965 req->set_alternate_name(std::move(alternate_name));
7c673cae
FG
13966 filepath existing(in->ino);
13967 req->set_filepath2(existing);
13968
13969 req->set_inode(dir);
13970 req->inode_drop = CEPH_CAP_FILE_SHARED;
13971 req->inode_unless = CEPH_CAP_FILE_EXCL;
13972
13973 Dentry *de;
13974 int res = get_or_create(dir, newname, &de);
13975 if (res < 0)
13976 goto fail;
13977 req->set_dentry(de);
13978
13979 res = make_request(req, perm, inp);
13980 ldout(cct, 10) << "link result is " << res << dendl;
13981
13982 trim_cache();
1adf2230 13983 ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
7c673cae
FG
13984 return res;
13985
13986 fail:
13987 put_request(req);
13988 return res;
13989}
13990
13991int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
13992 const UserPerm& perm)
13993{
f67539c2
TL
13994 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13995 if (!mref_reader.is_state_satisfied())
13996 return -CEPHFS_ENOTCONN;
181888fb 13997
7c673cae
FG
13998 vinodeno_t vino = _get_vino(in);
13999 vinodeno_t vnewparent = _get_vino(newparent);
14000
31f18b77 14001 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
7c673cae
FG
14002 newname << dendl;
14003 tout(cct) << "ll_link" << std::endl;
14004 tout(cct) << vino.ino.val << std::endl;
14005 tout(cct) << vnewparent << std::endl;
14006 tout(cct) << newname << std::endl;
14007
7c673cae
FG
14008 InodeRef target;
14009
f67539c2
TL
14010 std::scoped_lock lock(client_lock);
14011
11fdf7f2 14012 if (!fuse_default_permissions) {
7c673cae 14013 if (S_ISDIR(in->mode))
f67539c2 14014 return -CEPHFS_EPERM;
7c673cae 14015
11fdf7f2 14016 int r = may_hardlink(in, perm);
7c673cae
FG
14017 if (r < 0)
14018 return r;
14019
14020 r = may_create(newparent, perm);
14021 if (r < 0)
14022 return r;
14023 }
14024
f67539c2 14025 return _link(in, newparent, newname, perm, "", &target);
7c673cae
FG
14026}
14027
14028int Client::ll_num_osds(void)
14029{
f67539c2 14030 std::scoped_lock lock(client_lock);
7c673cae
FG
14031 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
14032}
14033
14034int Client::ll_osdaddr(int osd, uint32_t *addr)
14035{
f67539c2 14036 std::scoped_lock lock(client_lock);
181888fb 14037
7c673cae
FG
14038 entity_addr_t g;
14039 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
14040 if (!o.exists(osd))
14041 return false;
11fdf7f2 14042 g = o.get_addrs(osd).front();
7c673cae
FG
14043 return true;
14044 });
14045 if (!exists)
14046 return -1;
14047 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
14048 *addr = ntohl(nb_addr);
14049 return 0;
14050}
181888fb 14051
7c673cae
FG
14052uint32_t Client::ll_stripe_unit(Inode *in)
14053{
f67539c2 14054 std::scoped_lock lock(client_lock);
7c673cae
FG
14055 return in->layout.stripe_unit;
14056}
14057
14058uint64_t Client::ll_snap_seq(Inode *in)
14059{
f67539c2 14060 std::scoped_lock lock(client_lock);
7c673cae
FG
14061 return in->snaprealm->seq;
14062}
14063
14064int Client::ll_file_layout(Inode *in, file_layout_t *layout)
14065{
f67539c2 14066 std::scoped_lock lock(client_lock);
7c673cae
FG
14067 *layout = in->layout;
14068 return 0;
14069}
14070
14071int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
14072{
14073 return ll_file_layout(fh->inode.get(), layout);
14074}
14075
14076/* Currently we cannot take advantage of redundancy in reads, since we
14077 would have to go through all possible placement groups (a
14078 potentially quite large number determined by a hash), and use CRUSH
14079 to calculate the appropriate set of OSDs for each placement group,
14080 then index into that. An array with one entry per OSD is much more
14081 tractable and works for demonstration purposes. */
14082
14083int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
14084 file_layout_t* layout)
14085{
f67539c2 14086 std::scoped_lock lock(client_lock);
181888fb 14087
28e407b8 14088 inodeno_t ino = in->ino;
7c673cae
FG
14089 uint32_t object_size = layout->object_size;
14090 uint32_t su = layout->stripe_unit;
14091 uint32_t stripe_count = layout->stripe_count;
14092 uint64_t stripes_per_object = object_size / su;
11fdf7f2 14093 uint64_t stripeno = 0, stripepos = 0;
7c673cae 14094
11fdf7f2
TL
14095 if(stripe_count) {
14096 stripeno = blockno / stripe_count; // which horizontal stripe (Y)
14097 stripepos = blockno % stripe_count; // which object in the object set (X)
14098 }
7c673cae
FG
14099 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
14100 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
14101
14102 object_t oid = file_object_t(ino, objectno);
14103 return objecter->with_osdmap([&](const OSDMap& o) {
14104 ceph_object_layout olayout =
14105 o.file_to_object_layout(oid, *layout);
14106 pg_t pg = (pg_t)olayout.ol_pgid;
14107 vector<int> osds;
14108 int primary;
14109 o.pg_to_acting_osds(pg, &osds, &primary);
14110 return primary;
14111 });
14112}
14113
14114/* Return the offset of the block, internal to the object */
14115
14116uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
14117{
f67539c2 14118 std::scoped_lock lock(client_lock);
7c673cae
FG
14119 file_layout_t *layout=&(in->layout);
14120 uint32_t object_size = layout->object_size;
14121 uint32_t su = layout->stripe_unit;
14122 uint64_t stripes_per_object = object_size / su;
14123
14124 return (blockno % stripes_per_object) * su;
14125}
14126
14127int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
14128 const UserPerm& perms)
14129{
f67539c2
TL
14130 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14131 if (!mref_reader.is_state_satisfied())
14132 return -CEPHFS_ENOTCONN;
181888fb 14133
7c673cae
FG
14134 vinodeno_t vino = _get_vino(in);
14135
14136 ldout(cct, 3) << "ll_opendir " << vino << dendl;
14137 tout(cct) << "ll_opendir" << std::endl;
14138 tout(cct) << vino.ino.val << std::endl;
14139
f67539c2
TL
14140 std::scoped_lock lock(client_lock);
14141
11fdf7f2 14142 if (!fuse_default_permissions) {
7c673cae
FG
14143 int r = may_open(in, flags, perms);
14144 if (r < 0)
14145 return r;
14146 }
14147
14148 int r = _opendir(in, dirpp, perms);
f67539c2 14149 tout(cct) << (uintptr_t)*dirpp << std::endl;
7c673cae
FG
14150
14151 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
14152 << dendl;
14153 return r;
14154}
14155
14156int Client::ll_releasedir(dir_result_t *dirp)
14157{
f67539c2
TL
14158 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14159 if (!mref_reader.is_state_satisfied())
14160 return -CEPHFS_ENOTCONN;
14161
7c673cae
FG
14162 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
14163 tout(cct) << "ll_releasedir" << std::endl;
f67539c2 14164 tout(cct) << (uintptr_t)dirp << std::endl;
181888fb 14165
f67539c2 14166 std::scoped_lock lock(client_lock);
181888fb 14167
7c673cae
FG
14168 _closedir(dirp);
14169 return 0;
14170}
14171
14172int Client::ll_fsyncdir(dir_result_t *dirp)
14173{
f67539c2
TL
14174 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14175 if (!mref_reader.is_state_satisfied())
14176 return -CEPHFS_ENOTCONN;
14177
7c673cae
FG
14178 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
14179 tout(cct) << "ll_fsyncdir" << std::endl;
f67539c2 14180 tout(cct) << (uintptr_t)dirp << std::endl;
181888fb 14181
f67539c2 14182 std::scoped_lock lock(client_lock);
7c673cae
FG
14183 return _fsync(dirp->inode.get(), false);
14184}
14185
14186int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
14187{
11fdf7f2 14188 ceph_assert(!(flags & O_CREAT));
7c673cae 14189
f67539c2
TL
14190 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14191 if (!mref_reader.is_state_satisfied())
14192 return -CEPHFS_ENOTCONN;
181888fb 14193
7c673cae
FG
14194 vinodeno_t vino = _get_vino(in);
14195
14196 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
14197 tout(cct) << "ll_open" << std::endl;
14198 tout(cct) << vino.ino.val << std::endl;
14199 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
14200
f67539c2
TL
14201 std::scoped_lock lock(client_lock);
14202
7c673cae 14203 int r;
11fdf7f2 14204 if (!fuse_default_permissions) {
7c673cae
FG
14205 r = may_open(in, flags, perms);
14206 if (r < 0)
14207 goto out;
14208 }
14209
14210 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
14211
14212 out:
14213 Fh *fhptr = fhp ? *fhp : NULL;
14214 if (fhptr) {
14215 ll_unclosed_fh_set.insert(fhptr);
14216 }
f67539c2 14217 tout(cct) << (uintptr_t)fhptr << std::endl;
7c673cae
FG
14218 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
14219 " = " << r << " (" << fhptr << ")" << dendl;
14220 return r;
14221}
14222
14223int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
14224 int flags, InodeRef *in, int caps, Fh **fhp,
14225 const UserPerm& perms)
14226{
14227 *fhp = NULL;
14228
14229 vinodeno_t vparent = _get_vino(parent);
14230
1adf2230 14231 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
7c673cae
FG
14232 mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
14233 << ", gid " << perms.gid() << dendl;
14234 tout(cct) << "ll_create" << std::endl;
14235 tout(cct) << vparent.ino.val << std::endl;
14236 tout(cct) << name << std::endl;
14237 tout(cct) << mode << std::endl;
14238 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
14239
14240 bool created = false;
14241 int r = _lookup(parent, name, caps, in, perms);
14242
14243 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
f67539c2 14244 return -CEPHFS_EEXIST;
7c673cae 14245
f67539c2 14246 if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
11fdf7f2 14247 if (!fuse_default_permissions) {
7c673cae
FG
14248 r = may_create(parent, perms);
14249 if (r < 0)
14250 goto out;
14251 }
14252 r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
f67539c2 14253 perms, "");
7c673cae
FG
14254 if (r < 0)
14255 goto out;
14256 }
14257
14258 if (r < 0)
14259 goto out;
14260
11fdf7f2 14261 ceph_assert(*in);
7c673cae
FG
14262
14263 ldout(cct, 20) << "_ll_create created = " << created << dendl;
14264 if (!created) {
11fdf7f2 14265 if (!fuse_default_permissions) {
7c673cae
FG
14266 r = may_open(in->get(), flags, perms);
14267 if (r < 0) {
14268 if (*fhp) {
14269 int release_r = _release_fh(*fhp);
11fdf7f2 14270 ceph_assert(release_r == 0); // during create, no async data ops should have happened
7c673cae
FG
14271 }
14272 goto out;
14273 }
14274 }
14275 if (*fhp == NULL) {
14276 r = _open(in->get(), flags, mode, fhp, perms);
14277 if (r < 0)
14278 goto out;
14279 }
14280 }
14281
14282out:
14283 if (*fhp) {
14284 ll_unclosed_fh_set.insert(*fhp);
14285 }
14286
14287 ino_t ino = 0;
14288 if (r >= 0) {
14289 Inode *inode = in->get();
14290 if (use_faked_inos())
14291 ino = inode->faked_ino;
14292 else
14293 ino = inode->ino;
14294 }
14295
f67539c2 14296 tout(cct) << (uintptr_t)*fhp << std::endl;
7c673cae 14297 tout(cct) << ino << std::endl;
1adf2230 14298 ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
7c673cae
FG
14299 mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
14300 *fhp << " " << hex << ino << dec << ")" << dendl;
14301
14302 return r;
14303}
14304
14305int Client::ll_create(Inode *parent, const char *name, mode_t mode,
14306 int flags, struct stat *attr, Inode **outp, Fh **fhp,
14307 const UserPerm& perms)
14308{
f67539c2
TL
14309 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14310 if (!mref_reader.is_state_satisfied())
14311 return -CEPHFS_ENOTCONN;
7c673cae 14312
f67539c2
TL
14313 std::scoped_lock lock(client_lock);
14314 InodeRef in;
181888fb 14315
7c673cae
FG
14316 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
14317 fhp, perms);
14318 if (r >= 0) {
11fdf7f2 14319 ceph_assert(in);
7c673cae
FG
14320
14321 // passing an Inode in outp requires an additional ref
14322 if (outp) {
14323 _ll_get(in.get());
14324 *outp = in.get();
14325 }
14326 fill_stat(in, attr);
14327 } else {
14328 attr->st_ino = 0;
14329 }
14330
14331 return r;
14332}
14333
14334int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
14335 int oflags, Inode **outp, Fh **fhp,
14336 struct ceph_statx *stx, unsigned want, unsigned lflags,
14337 const UserPerm& perms)
14338{
14339 unsigned caps = statx_to_mask(lflags, want);
f67539c2
TL
14340 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14341 if (!mref_reader.is_state_satisfied())
14342 return -CEPHFS_ENOTCONN;
7c673cae 14343
f67539c2
TL
14344 std::scoped_lock lock(client_lock);
14345 InodeRef in;
7c673cae
FG
14346
14347 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
14348 if (r >= 0) {
11fdf7f2 14349 ceph_assert(in);
7c673cae
FG
14350
14351 // passing an Inode in outp requires an additional ref
14352 if (outp) {
14353 _ll_get(in.get());
14354 *outp = in.get();
14355 }
14356 fill_statx(in, caps, stx);
14357 } else {
14358 stx->stx_ino = 0;
14359 stx->stx_mask = 0;
14360 }
14361
14362 return r;
14363}
14364
14365loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
14366{
f67539c2
TL
14367 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14368 if (!mref_reader.is_state_satisfied())
14369 return -CEPHFS_ENOTCONN;
14370
7c673cae
FG
14371 tout(cct) << "ll_lseek" << std::endl;
14372 tout(cct) << offset << std::endl;
14373 tout(cct) << whence << std::endl;
14374
f67539c2 14375 std::scoped_lock lock(client_lock);
7c673cae
FG
14376 return _lseek(fh, offset, whence);
14377}
14378
14379int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
14380{
f67539c2
TL
14381 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14382 if (!mref_reader.is_state_satisfied())
14383 return -CEPHFS_ENOTCONN;
14384
7c673cae
FG
14385 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
14386 tout(cct) << "ll_read" << std::endl;
f67539c2 14387 tout(cct) << (uintptr_t)fh << std::endl;
7c673cae
FG
14388 tout(cct) << off << std::endl;
14389 tout(cct) << len << std::endl;
14390
11fdf7f2
TL
14391 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14392 len = std::min(len, (loff_t)INT_MAX);
f67539c2
TL
14393 std::scoped_lock lock(client_lock);
14394
f6b5b4d7
TL
14395 int r = _read(fh, off, len, bl);
14396 ldout(cct, 3) << "ll_read " << fh << " " << off << "~" << len << " = " << r
14397 << dendl;
14398 return r;
7c673cae
FG
14399}
14400
14401int Client::ll_read_block(Inode *in, uint64_t blockid,
14402 char *buf,
14403 uint64_t offset,
14404 uint64_t length,
14405 file_layout_t* layout)
14406{
f67539c2
TL
14407 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14408 if (!mref_reader.is_state_satisfied())
14409 return -CEPHFS_ENOTCONN;
181888fb 14410
b32b8144 14411 vinodeno_t vino = _get_vino(in);
7c673cae
FG
14412 object_t oid = file_object_t(vino.ino, blockid);
14413 C_SaferCond onfinish;
14414 bufferlist bl;
14415
14416 objecter->read(oid,
14417 object_locator_t(layout->pool_id),
14418 offset,
14419 length,
14420 vino.snapid,
14421 &bl,
14422 CEPH_OSD_FLAG_READ,
14423 &onfinish);
14424
7c673cae 14425 int r = onfinish.wait();
7c673cae 14426 if (r >= 0) {
9f95a23c 14427 bl.begin().copy(bl.length(), buf);
7c673cae
FG
14428 r = bl.length();
14429 }
14430
14431 return r;
14432}
14433
14434/* It appears that the OSD doesn't return success unless the entire
14435 buffer was written, return the write length on success. */
14436
14437int Client::ll_write_block(Inode *in, uint64_t blockid,
14438 char* buf, uint64_t offset,
14439 uint64_t length, file_layout_t* layout,
14440 uint64_t snapseq, uint32_t sync)
14441{
7c673cae 14442 vinodeno_t vino = ll_get_vino(in);
7c673cae 14443 int r = 0;
11fdf7f2 14444 std::unique_ptr<C_SaferCond> onsafe = nullptr;
f67539c2
TL
14445
14446 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14447 if (!mref_reader.is_state_satisfied())
14448 return -CEPHFS_ENOTCONN;
14449
7c673cae 14450 if (length == 0) {
f67539c2 14451 return -CEPHFS_EINVAL;
7c673cae
FG
14452 }
14453 if (true || sync) {
14454 /* if write is stable, the epilogue is waiting on
14455 * flock */
11fdf7f2 14456 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
7c673cae
FG
14457 }
14458 object_t oid = file_object_t(vino.ino, blockid);
14459 SnapContext fakesnap;
11fdf7f2
TL
14460 ceph::bufferlist bl;
14461 if (length > 0) {
14462 bl.push_back(buffer::copy(buf, length));
14463 }
7c673cae
FG
14464
14465 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
14466 << dendl;
14467
14468 fakesnap.seq = snapseq;
14469
14470 /* lock just in time */
7c673cae
FG
14471 objecter->write(oid,
14472 object_locator_t(layout->pool_id),
14473 offset,
14474 length,
14475 fakesnap,
14476 bl,
14477 ceph::real_clock::now(),
14478 0,
11fdf7f2 14479 onsafe.get());
7c673cae 14480
11fdf7f2
TL
14481 if (nullptr != onsafe) {
14482 r = onsafe->wait();
7c673cae
FG
14483 }
14484
14485 if (r < 0) {
14486 return r;
14487 } else {
14488 return length;
14489 }
14490}
14491
14492int Client::ll_commit_blocks(Inode *in,
14493 uint64_t offset,
14494 uint64_t length)
14495{
7c673cae
FG
14496 /*
14497 BarrierContext *bctx;
b32b8144 14498 vinodeno_t vino = _get_vino(in);
7c673cae
FG
14499 uint64_t ino = vino.ino;
14500
14501 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
14502 << offset << " to " << length << dendl;
14503
14504 if (length == 0) {
f67539c2 14505 return -CEPHFS_EINVAL;
7c673cae
FG
14506 }
14507
f67539c2 14508 std::scoped_lock lock(client_lock);
7c673cae
FG
14509 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
14510 if (p != barriers.end()) {
14511 barrier_interval civ(offset, offset + length);
14512 p->second->commit_barrier(civ);
14513 }
14514 */
14515 return 0;
14516}
14517
14518int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
14519{
7c673cae
FG
14520 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
14521 "~" << len << dendl;
14522 tout(cct) << "ll_write" << std::endl;
f67539c2 14523 tout(cct) << (uintptr_t)fh << std::endl;
7c673cae
FG
14524 tout(cct) << off << std::endl;
14525 tout(cct) << len << std::endl;
14526
f67539c2
TL
14527 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14528 if (!mref_reader.is_state_satisfied())
14529 return -CEPHFS_ENOTCONN;
181888fb 14530
11fdf7f2
TL
14531 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14532 len = std::min(len, (loff_t)INT_MAX);
f67539c2
TL
14533 std::scoped_lock lock(client_lock);
14534
7c673cae
FG
14535 int r = _write(fh, off, len, data, NULL, 0);
14536 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
14537 << dendl;
14538 return r;
14539}
14540
11fdf7f2
TL
14541int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14542{
f67539c2
TL
14543 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14544 if (!mref_reader.is_state_satisfied())
14545 return -CEPHFS_ENOTCONN;
14546
20effc67
TL
14547 std::scoped_lock cl(client_lock);
14548 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
11fdf7f2
TL
14549}
14550
14551int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14552{
f67539c2
TL
14553 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14554 if (!mref_reader.is_state_satisfied())
14555 return -CEPHFS_ENOTCONN;
14556
20effc67
TL
14557 std::scoped_lock cl(client_lock);
14558 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
11fdf7f2
TL
14559}
14560
7c673cae
FG
14561int Client::ll_flush(Fh *fh)
14562{
f67539c2
TL
14563 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14564 if (!mref_reader.is_state_satisfied())
14565 return -CEPHFS_ENOTCONN;
14566
7c673cae
FG
14567 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
14568 tout(cct) << "ll_flush" << std::endl;
f67539c2 14569 tout(cct) << (uintptr_t)fh << std::endl;
181888fb 14570
f67539c2 14571 std::scoped_lock lock(client_lock);
7c673cae
FG
14572 return _flush(fh);
14573}
14574
14575int Client::ll_fsync(Fh *fh, bool syncdataonly)
14576{
f67539c2
TL
14577 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14578 if (!mref_reader.is_state_satisfied())
14579 return -CEPHFS_ENOTCONN;
14580
7c673cae
FG
14581 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
14582 tout(cct) << "ll_fsync" << std::endl;
f67539c2 14583 tout(cct) << (uintptr_t)fh << std::endl;
181888fb 14584
f67539c2 14585 std::scoped_lock lock(client_lock);
7c673cae
FG
14586 int r = _fsync(fh, syncdataonly);
14587 if (r) {
14588 // If we're returning an error, clear it from the FH
14589 fh->take_async_err();
14590 }
14591 return r;
14592}
14593
28e407b8
AA
14594int Client::ll_sync_inode(Inode *in, bool syncdataonly)
14595{
f67539c2
TL
14596 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14597 if (!mref_reader.is_state_satisfied())
14598 return -CEPHFS_ENOTCONN;
14599
28e407b8
AA
14600 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
14601 tout(cct) << "ll_sync_inode" << std::endl;
f67539c2 14602 tout(cct) << (uintptr_t)in << std::endl;
28e407b8 14603
f67539c2 14604 std::scoped_lock lock(client_lock);
28e407b8
AA
14605 return _fsync(in, syncdataonly);
14606}
14607
7c673cae
FG
14608int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
14609{
f67539c2
TL
14610 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
14611
7c673cae 14612 if (offset < 0 || length <= 0)
f67539c2 14613 return -CEPHFS_EINVAL;
7c673cae
FG
14614
14615 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
f67539c2 14616 return -CEPHFS_EOPNOTSUPP;
7c673cae
FG
14617
14618 if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
f67539c2 14619 return -CEPHFS_EOPNOTSUPP;
7c673cae
FG
14620
14621 Inode *in = fh->inode.get();
14622
14623 if (objecter->osdmap_pool_full(in->layout.pool_id) &&
14624 !(mode & FALLOC_FL_PUNCH_HOLE)) {
f67539c2 14625 return -CEPHFS_ENOSPC;
7c673cae
FG
14626 }
14627
14628 if (in->snapid != CEPH_NOSNAP)
f67539c2 14629 return -CEPHFS_EROFS;
7c673cae
FG
14630
14631 if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
f67539c2 14632 return -CEPHFS_EBADF;
7c673cae
FG
14633
14634 uint64_t size = offset + length;
14635 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
14636 size > in->size &&
11fdf7f2 14637 is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
f67539c2 14638 return -CEPHFS_EDQUOT;
7c673cae
FG
14639 }
14640
14641 int have;
f6b5b4d7 14642 int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
7c673cae
FG
14643 if (r < 0)
14644 return r;
14645
11fdf7f2 14646 std::unique_ptr<C_SaferCond> onuninline = nullptr;
7c673cae
FG
14647 if (mode & FALLOC_FL_PUNCH_HOLE) {
14648 if (in->inline_version < CEPH_INLINE_NONE &&
14649 (have & CEPH_CAP_FILE_BUFFER)) {
14650 bufferlist bl;
9f95a23c 14651 auto inline_iter = in->inline_data.cbegin();
7c673cae
FG
14652 int len = in->inline_data.length();
14653 if (offset < len) {
14654 if (offset > 0)
9f95a23c 14655 inline_iter.copy(offset, bl);
7c673cae
FG
14656 int size = length;
14657 if (offset + size > len)
14658 size = len - offset;
14659 if (size > 0)
14660 bl.append_zero(size);
9f95a23c
TL
14661 if (offset + size < len) {
14662 inline_iter += size;
14663 inline_iter.copy(len - offset - size, bl);
14664 }
7c673cae
FG
14665 in->inline_data = bl;
14666 in->inline_version++;
14667 }
91327a77 14668 in->mtime = in->ctime = ceph_clock_now();
7c673cae 14669 in->change_attr++;
28e407b8 14670 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
14671 } else {
14672 if (in->inline_version < CEPH_INLINE_NONE) {
11fdf7f2
TL
14673 onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
14674 uninline_data(in, onuninline.get());
7c673cae
FG
14675 }
14676
11fdf7f2 14677 C_SaferCond onfinish("Client::_punch_hole flock");
7c673cae 14678
7c673cae
FG
14679 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
14680
14681 _invalidate_inode_cache(in, offset, length);
14682 filer->zero(in->ino, &in->layout,
14683 in->snaprealm->get_snap_context(),
14684 offset, length,
14685 ceph::real_clock::now(),
11fdf7f2 14686 0, true, &onfinish);
91327a77 14687 in->mtime = in->ctime = ceph_clock_now();
7c673cae 14688 in->change_attr++;
28e407b8 14689 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae 14690
9f95a23c 14691 client_lock.unlock();
11fdf7f2 14692 onfinish.wait();
9f95a23c 14693 client_lock.lock();
f67539c2 14694 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
7c673cae
FG
14695 }
14696 } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
14697 uint64_t size = offset + length;
14698 if (size > in->size) {
14699 in->size = size;
91327a77 14700 in->mtime = in->ctime = ceph_clock_now();
7c673cae 14701 in->change_attr++;
28e407b8 14702 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae 14703
11fdf7f2 14704 if (is_quota_bytes_approaching(in, fh->actor_perms)) {
7c673cae 14705 check_caps(in, CHECK_CAPS_NODELAY);
31f18b77
FG
14706 } else if (is_max_size_approaching(in)) {
14707 check_caps(in, 0);
7c673cae
FG
14708 }
14709 }
14710 }
14711
11fdf7f2 14712 if (nullptr != onuninline) {
9f95a23c 14713 client_lock.unlock();
11fdf7f2 14714 int ret = onuninline->wait();
9f95a23c 14715 client_lock.lock();
7c673cae 14716
f67539c2 14717 if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
7c673cae
FG
14718 in->inline_data.clear();
14719 in->inline_version = CEPH_INLINE_NONE;
28e407b8 14720 in->mark_caps_dirty(CEPH_CAP_FILE_WR);
7c673cae
FG
14721 check_caps(in, 0);
14722 } else
11fdf7f2 14723 r = ret;
7c673cae
FG
14724 }
14725
14726 put_cap_ref(in, CEPH_CAP_FILE_WR);
14727 return r;
14728}
7c673cae 14729
11fdf7f2 14730int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
7c673cae 14731{
f67539c2
TL
14732 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14733 if (!mref_reader.is_state_satisfied())
14734 return -CEPHFS_ENOTCONN;
14735
11fdf7f2
TL
14736 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
14737 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
f67539c2 14738 tout(cct) << (uintptr_t)fh << std::endl;
181888fb 14739
f67539c2 14740 std::scoped_lock lock(client_lock);
7c673cae
FG
14741 return _fallocate(fh, mode, offset, length);
14742}
14743
14744int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
14745{
f67539c2
TL
14746 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14747 if (!mref_reader.is_state_satisfied())
14748 return -CEPHFS_ENOTCONN;
7c673cae 14749
f67539c2 14750 tout(cct) << __func__ << " " << " " << fd << mode << " " << offset << " " << length << std::endl;
181888fb 14751
f67539c2 14752 std::scoped_lock lock(client_lock);
7c673cae
FG
14753 Fh *fh = get_filehandle(fd);
14754 if (!fh)
f67539c2 14755 return -CEPHFS_EBADF;
7c673cae
FG
14756#if defined(__linux__) && defined(O_PATH)
14757 if (fh->flags & O_PATH)
f67539c2 14758 return -CEPHFS_EBADF;
7c673cae
FG
14759#endif
14760 return _fallocate(fh, mode, offset, length);
14761}
14762
14763int Client::ll_release(Fh *fh)
14764{
f67539c2
TL
14765 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14766 if (!mref_reader.is_state_satisfied())
14767 return -CEPHFS_ENOTCONN;
91327a77 14768
11fdf7f2 14769 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
7c673cae 14770 dendl;
11fdf7f2 14771 tout(cct) << __func__ << " (fh)" << std::endl;
f67539c2
TL
14772 tout(cct) << (uintptr_t)fh << std::endl;
14773
14774 std::scoped_lock lock(client_lock);
7c673cae
FG
14775
14776 if (ll_unclosed_fh_set.count(fh))
14777 ll_unclosed_fh_set.erase(fh);
14778 return _release_fh(fh);
14779}
14780
14781int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
14782{
f67539c2
TL
14783 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14784 if (!mref_reader.is_state_satisfied())
14785 return -CEPHFS_ENOTCONN;
7c673cae
FG
14786
14787 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
f67539c2 14788 tout(cct) << "ll_getk (fh)" << (uintptr_t)fh << std::endl;
181888fb 14789
f67539c2 14790 std::scoped_lock lock(client_lock);
7c673cae
FG
14791 return _getlk(fh, fl, owner);
14792}
14793
14794int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
14795{
f67539c2
TL
14796 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14797 if (!mref_reader.is_state_satisfied())
14798 return -CEPHFS_ENOTCONN;
7c673cae 14799
11fdf7f2 14800 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
f67539c2 14801 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
181888fb 14802
f67539c2 14803 std::scoped_lock lock(client_lock);
7c673cae
FG
14804 return _setlk(fh, fl, owner, sleep);
14805}
14806
14807int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
14808{
f67539c2
TL
14809 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14810 if (!mref_reader.is_state_satisfied())
14811 return -CEPHFS_ENOTCONN;
7c673cae 14812
11fdf7f2 14813 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
f67539c2 14814 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
181888fb 14815
f67539c2 14816 std::scoped_lock lock(client_lock);
7c673cae
FG
14817 return _flock(fh, cmd, owner);
14818}
14819
b32b8144
FG
14820int Client::set_deleg_timeout(uint32_t timeout)
14821{
f67539c2 14822 std::scoped_lock lock(client_lock);
b32b8144
FG
14823
14824 /*
f67539c2 14825 * The whole point is to prevent blocklisting so we must time out the
b32b8144
FG
14826 * delegation before the session autoclose timeout kicks in.
14827 */
14828 if (timeout >= mdsmap->get_session_autoclose())
f67539c2 14829 return -CEPHFS_EINVAL;
b32b8144
FG
14830
14831 deleg_timeout = timeout;
14832 return 0;
14833}
14834
14835int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
14836{
f67539c2 14837 int ret = -CEPHFS_EINVAL;
b32b8144 14838
f67539c2
TL
14839 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14840 if (!mref_reader.is_state_satisfied())
14841 return -CEPHFS_ENOTCONN;
b32b8144 14842
f67539c2 14843 std::scoped_lock lock(client_lock);
b32b8144
FG
14844
14845 Inode *inode = fh->inode.get();
14846
14847 switch(cmd) {
14848 case CEPH_DELEGATION_NONE:
14849 inode->unset_deleg(fh);
14850 ret = 0;
14851 break;
14852 default:
14853 try {
14854 ret = inode->set_deleg(fh, cmd, cb, priv);
11fdf7f2 14855 } catch (std::bad_alloc&) {
f67539c2 14856 ret = -CEPHFS_ENOMEM;
b32b8144
FG
14857 }
14858 break;
14859 }
14860 return ret;
14861}
14862
7c673cae
FG
14863class C_Client_RequestInterrupt : public Context {
14864private:
14865 Client *client;
14866 MetaRequest *req;
14867public:
14868 C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
14869 req->get();
14870 }
14871 void finish(int r) override {
f67539c2 14872 std::scoped_lock l(client->client_lock);
11fdf7f2 14873 ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
7c673cae
FG
14874 client->_interrupt_filelock(req);
14875 client->put_request(req);
14876 }
14877};
14878
14879void Client::ll_interrupt(void *d)
14880{
14881 MetaRequest *req = static_cast<MetaRequest*>(d);
11fdf7f2
TL
14882 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
14883 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
7c673cae
FG
14884 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
14885}
14886
14887// =========================================
14888// layout
14889
14890// expose file layouts
14891
14892int Client::describe_layout(const char *relpath, file_layout_t *lp,
14893 const UserPerm& perms)
14894{
f67539c2
TL
14895 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14896 if (!mref_reader.is_state_satisfied())
14897 return -CEPHFS_ENOTCONN;
7c673cae 14898
f67539c2 14899 std::scoped_lock lock(client_lock);
181888fb 14900
7c673cae
FG
14901 filepath path(relpath);
14902 InodeRef in;
14903 int r = path_walk(path, &in, perms);
14904 if (r < 0)
14905 return r;
14906
14907 *lp = in->layout;
14908
11fdf7f2 14909 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
7c673cae
FG
14910 return 0;
14911}
14912
14913int Client::fdescribe_layout(int fd, file_layout_t *lp)
14914{
f67539c2
TL
14915 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14916 if (!mref_reader.is_state_satisfied())
14917 return -CEPHFS_ENOTCONN;
7c673cae 14918
f67539c2 14919 std::scoped_lock lock(client_lock);
181888fb 14920
7c673cae
FG
14921 Fh *f = get_filehandle(fd);
14922 if (!f)
f67539c2 14923 return -CEPHFS_EBADF;
7c673cae
FG
14924 Inode *in = f->inode.get();
14925
14926 *lp = in->layout;
14927
11fdf7f2 14928 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
7c673cae
FG
14929 return 0;
14930}
14931
d2e6a577
FG
14932int64_t Client::get_default_pool_id()
14933{
f67539c2
TL
14934 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14935 if (!mref_reader.is_state_satisfied())
14936 return -CEPHFS_ENOTCONN;
181888fb 14937
f67539c2 14938 std::scoped_lock lock(client_lock);
181888fb 14939
d2e6a577
FG
14940 /* first data pool is the default */
14941 return mdsmap->get_first_data_pool();
14942}
7c673cae
FG
14943
14944// expose osdmap
14945
14946int64_t Client::get_pool_id(const char *pool_name)
14947{
f67539c2
TL
14948 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14949 if (!mref_reader.is_state_satisfied())
14950 return -CEPHFS_ENOTCONN;
181888fb 14951
f67539c2 14952 std::scoped_lock lock(client_lock);
181888fb 14953
7c673cae
FG
14954 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
14955 pool_name);
14956}
14957
14958string Client::get_pool_name(int64_t pool)
14959{
f67539c2
TL
14960 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14961 if (!mref_reader.is_state_satisfied())
181888fb
FG
14962 return string();
14963
f67539c2
TL
14964 std::scoped_lock lock(client_lock);
14965
7c673cae
FG
14966 return objecter->with_osdmap([pool](const OSDMap& o) {
14967 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
14968 });
14969}
14970
14971int Client::get_pool_replication(int64_t pool)
14972{
f67539c2
TL
14973 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14974 if (!mref_reader.is_state_satisfied())
14975 return -CEPHFS_ENOTCONN;
181888fb 14976
f67539c2 14977 std::scoped_lock lock(client_lock);
181888fb 14978
7c673cae 14979 return objecter->with_osdmap([pool](const OSDMap& o) {
f67539c2 14980 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -CEPHFS_ENOENT;
7c673cae
FG
14981 });
14982}
14983
14984int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
14985{
f67539c2
TL
14986 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14987 if (!mref_reader.is_state_satisfied())
14988 return -CEPHFS_ENOTCONN;
7c673cae 14989
f67539c2 14990 std::scoped_lock lock(client_lock);
181888fb 14991
7c673cae
FG
14992 Fh *f = get_filehandle(fd);
14993 if (!f)
f67539c2 14994 return -CEPHFS_EBADF;
7c673cae
FG
14995 Inode *in = f->inode.get();
14996
14997 vector<ObjectExtent> extents;
14998 Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
11fdf7f2 14999 ceph_assert(extents.size() == 1);
7c673cae
FG
15000
15001 objecter->with_osdmap([&](const OSDMap& o) {
15002 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
15003 o.pg_to_acting_osds(pg, osds);
15004 });
15005
15006 if (osds.empty())
f67539c2 15007 return -CEPHFS_EINVAL;
7c673cae
FG
15008
15009 /*
15010 * Return the remainder of the extent (stripe unit)
15011 *
15012 * If length = 1 is passed to Striper::file_to_extents we get a single
15013 * extent back, but its length is one so we still need to compute the length
15014 * to the end of the stripe unit.
15015 *
15016 * If length = su then we may get 1 or 2 objects back in the extents vector
15017 * which would have to be examined. Even then, the offsets are local to the
15018 * object, so matching up to the file offset is extra work.
15019 *
15020 * It seems simpler to stick with length = 1 and manually compute the
15021 * remainder.
15022 */
15023 if (len) {
15024 uint64_t su = in->layout.stripe_unit;
15025 *len = su - (off % su);
15026 }
15027
15028 return 0;
15029}
15030
15031int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
15032{
f67539c2
TL
15033 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15034 if (!mref_reader.is_state_satisfied())
15035 return -CEPHFS_ENOTCONN;
181888fb 15036
f67539c2 15037 std::scoped_lock lock(client_lock);
181888fb 15038
7c673cae 15039 if (id < 0)
f67539c2 15040 return -CEPHFS_EINVAL;
7c673cae
FG
15041 return objecter->with_osdmap([&](const OSDMap& o) {
15042 return o.crush->get_full_location_ordered(id, path);
15043 });
15044}
15045
15046int Client::get_file_stripe_address(int fd, loff_t offset,
15047 vector<entity_addr_t>& address)
15048{
f67539c2
TL
15049 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15050 if (!mref_reader.is_state_satisfied())
15051 return -CEPHFS_ENOTCONN;
7c673cae 15052
f67539c2 15053 std::scoped_lock lock(client_lock);
181888fb 15054
7c673cae
FG
15055 Fh *f = get_filehandle(fd);
15056 if (!f)
f67539c2 15057 return -CEPHFS_EBADF;
7c673cae
FG
15058 Inode *in = f->inode.get();
15059
15060 // which object?
15061 vector<ObjectExtent> extents;
15062 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
15063 in->truncate_size, extents);
11fdf7f2 15064 ceph_assert(extents.size() == 1);
7c673cae
FG
15065
15066 // now we have the object and its 'layout'
15067 return objecter->with_osdmap([&](const OSDMap& o) {
15068 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
15069 vector<int> osds;
15070 o.pg_to_acting_osds(pg, osds);
15071 if (osds.empty())
f67539c2 15072 return -CEPHFS_EINVAL;
7c673cae 15073 for (unsigned i = 0; i < osds.size(); i++) {
11fdf7f2 15074 entity_addr_t addr = o.get_addrs(osds[i]).front();
7c673cae
FG
15075 address.push_back(addr);
15076 }
15077 return 0;
15078 });
15079}
15080
15081int Client::get_osd_addr(int osd, entity_addr_t& addr)
15082{
f67539c2
TL
15083 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15084 if (!mref_reader.is_state_satisfied())
15085 return -CEPHFS_ENOTCONN;
181888fb 15086
f67539c2 15087 std::scoped_lock lock(client_lock);
181888fb 15088
7c673cae
FG
15089 return objecter->with_osdmap([&](const OSDMap& o) {
15090 if (!o.exists(osd))
f67539c2 15091 return -CEPHFS_ENOENT;
7c673cae 15092
11fdf7f2 15093 addr = o.get_addrs(osd).front();
7c673cae
FG
15094 return 0;
15095 });
15096}
15097
15098int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
15099 loff_t length, loff_t offset)
15100{
f67539c2
TL
15101 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15102 if (!mref_reader.is_state_satisfied())
15103 return -CEPHFS_ENOTCONN;
7c673cae 15104
f67539c2 15105 std::scoped_lock lock(client_lock);
181888fb 15106
7c673cae
FG
15107 Fh *f = get_filehandle(fd);
15108 if (!f)
f67539c2 15109 return -CEPHFS_EBADF;
7c673cae
FG
15110 Inode *in = f->inode.get();
15111
15112 // map to a list of extents
15113 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
15114
11fdf7f2 15115 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
7c673cae
FG
15116 return 0;
15117}
15118
15119
f67539c2 15120/* find an osd with the same ip. -CEPHFS_ENXIO if none. */
7c673cae
FG
15121int Client::get_local_osd()
15122{
f67539c2
TL
15123 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15124 if (!mref_reader.is_state_satisfied())
15125 return -CEPHFS_ENOTCONN;
181888fb 15126
f67539c2 15127 std::scoped_lock lock(client_lock);
181888fb 15128
7c673cae
FG
15129 objecter->with_osdmap([this](const OSDMap& o) {
15130 if (o.get_epoch() != local_osd_epoch) {
11fdf7f2 15131 local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
7c673cae
FG
15132 local_osd_epoch = o.get_epoch();
15133 }
15134 });
15135 return local_osd;
15136}
15137
15138
15139
15140
15141
15142
15143// ===============================
15144
15145void Client::ms_handle_connect(Connection *con)
15146{
11fdf7f2 15147 ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
15148}
15149
15150bool Client::ms_handle_reset(Connection *con)
15151{
11fdf7f2 15152 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
15153 return false;
15154}
15155
15156void Client::ms_handle_remote_reset(Connection *con)
15157{
f67539c2 15158 std::scoped_lock lock(client_lock);
11fdf7f2 15159 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
15160 switch (con->get_peer_type()) {
15161 case CEPH_ENTITY_TYPE_MDS:
15162 {
15163 // kludge to figure out which mds this is; fixme with a Connection* state
15164 mds_rank_t mds = MDS_RANK_NONE;
20effc67 15165 MetaSessionRef s = NULL;
11fdf7f2 15166 for (auto &p : mds_sessions) {
b3b6e05e 15167 if (mdsmap->have_inst(p.first) && mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
11fdf7f2 15168 mds = p.first;
20effc67 15169 s = p.second;
7c673cae
FG
15170 }
15171 }
15172 if (mds >= 0) {
20effc67 15173 ceph_assert(s != NULL);
7c673cae
FG
15174 switch (s->state) {
15175 case MetaSession::STATE_CLOSING:
15176 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
20effc67 15177 _closed_mds_session(s.get());
7c673cae
FG
15178 break;
15179
15180 case MetaSession::STATE_OPENING:
15181 {
15182 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
15183 list<Context*> waiters;
15184 waiters.swap(s->waiting_for_open);
20effc67
TL
15185 _closed_mds_session(s.get());
15186 auto news = _get_or_open_mds_session(mds);
7c673cae
FG
15187 news->waiting_for_open.swap(waiters);
15188 }
15189 break;
15190
15191 case MetaSession::STATE_OPEN:
15192 {
f67539c2 15193 objecter->maybe_request_map(); /* to check if we are blocklisted */
f6b5b4d7 15194 if (cct->_conf.get_val<bool>("client_reconnect_stale")) {
7c673cae 15195 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
20effc67 15196 _closed_mds_session(s.get());
7c673cae
FG
15197 } else {
15198 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
15199 s->state = MetaSession::STATE_STALE;
15200 }
15201 }
15202 break;
15203
15204 case MetaSession::STATE_NEW:
15205 case MetaSession::STATE_CLOSED:
15206 default:
15207 break;
15208 }
15209 }
15210 }
15211 break;
15212 }
15213}
15214
15215bool Client::ms_handle_refused(Connection *con)
15216{
11fdf7f2 15217 ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
15218 return false;
15219}
15220
7c673cae
FG
15221Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
15222{
11fdf7f2
TL
15223 Inode *quota_in = root_ancestor;
15224 SnapRealm *realm = in->snaprealm;
15225 while (realm) {
15226 ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
15227 if (realm->ino != in->ino) {
15228 auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
15229 if (p == inode_map.end())
15230 break;
7c673cae 15231
11fdf7f2
TL
15232 if (p->second->quota.is_enable()) {
15233 quota_in = p->second;
15234 break;
7c673cae 15235 }
7c673cae 15236 }
11fdf7f2 15237 realm = realm->pparent;
7c673cae 15238 }
11fdf7f2
TL
15239 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
15240 return quota_in;
7c673cae
FG
15241}
15242
15243/**
15244 * Traverse quota ancestors of the Inode, return true
15245 * if any of them passes the passed function
15246 */
15247bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
15248 std::function<bool (const Inode &in)> test)
15249{
15250 while (true) {
11fdf7f2 15251 ceph_assert(in != NULL);
7c673cae
FG
15252 if (test(*in)) {
15253 return true;
15254 }
15255
15256 if (in == root_ancestor) {
15257 // We're done traversing, drop out
15258 return false;
15259 } else {
15260 // Continue up the tree
15261 in = get_quota_root(in, perms);
15262 }
15263 }
15264
15265 return false;
15266}
15267
15268bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
15269{
15270 return check_quota_condition(in, perms,
15271 [](const Inode &in) {
15272 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
15273 });
15274}
15275
15276bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
11fdf7f2 15277 const UserPerm& perms)
7c673cae
FG
15278{
15279 return check_quota_condition(in, perms,
11fdf7f2 15280 [&new_bytes](const Inode &in) {
7c673cae
FG
15281 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
15282 > in.quota.max_bytes;
15283 });
15284}
15285
11fdf7f2 15286bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
7c673cae 15287{
9f95a23c
TL
15288 ceph_assert(in->size >= in->reported_size);
15289 const uint64_t size = in->size - in->reported_size;
11fdf7f2 15290 return check_quota_condition(in, perms,
9f95a23c 15291 [&size](const Inode &in) {
11fdf7f2
TL
15292 if (in.quota.max_bytes) {
15293 if (in.rstat.rbytes >= in.quota.max_bytes) {
15294 return true;
15295 }
15296
11fdf7f2 15297 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
11fdf7f2
TL
15298 return (space >> 4) < size;
15299 } else {
15300 return false;
15301 }
15302 });
7c673cae
FG
15303}
15304
// Pool-permission cache states, stored as a bitmask per (pool, namespace)
// key in pool_perms (see Client::check_pool_perm below).
enum {
  POOL_CHECKED = 1,   // a definitive check has completed for this pool
  POOL_CHECKING = 2,  // a check is in flight; other callers must wait
  POOL_READ = 4,      // client may read from the pool
  POOL_WRITE = 8,     // client may write to the pool
};
15311
/**
 * Verify (and cache) whether this client has OSD read/write access to the
 * data pool/namespace an inode's layout points at.
 *
 * The probe issues a stat and a create against the file's first object and
 * interprets the results: EPERM means the corresponding capability is
 * missing, while ENOENT/EEXIST are expected outcomes of the probe ops.
 * Results are cached in pool_perms keyed by (pool id, pool namespace).
 *
 * @param in    inode whose layout selects the pool to check
 * @param need  CEPH_CAP_FILE_RD and/or CEPH_CAP_FILE_WR
 * @return 0 if the needed access is available, -CEPHFS_EPERM if not,
 *         -CEPHFS_EIO if the probe failed indeterminately.
 *
 * Must be called with client_lock held; the lock is dropped while waiting
 * for the probe ops to complete.
 */
int Client::check_pool_perm(Inode *in, int need)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (!cct->_conf->client_check_pool_perm)
    return 0;

  /* Only need to do this for regular files */
  if (!in->is_file())
    return 0;

  int64_t pool_id = in->layout.pool_id;
  std::string pool_ns = in->layout.pool_ns;
  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
  int have = 0;
  // Consult the cache; if another thread is mid-check for the same key,
  // wait until it finishes and re-inspect.
  while (true) {
    auto it = pool_perms.find(perm_key);
    if (it == pool_perms.end())
      break;
    if (it->second == POOL_CHECKING) {
      // avoid concurrent checkings
      wait_on_list(waiting_for_pool_perm);
    } else {
      have = it->second;
      ceph_assert(have & POOL_CHECKED);
      break;
    }
  }

  if (!have) {
    if (in->snapid != CEPH_NOSNAP) {
      // pool permission check needs to write to the first object. But for snapshot,
      // head of the first object may have already been deleted. To avoid creating
      // orphan object, skip the check for now.
      return 0;
    }

    // Mark the key as in-flight so concurrent callers block above.
    pool_perms[perm_key] = POOL_CHECKING;

    char oid_buf[32];
    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
    object_t oid = oid_buf;

    SnapContext nullsnapc;

    // Read probe: a stat on the first object.
    C_SaferCond rd_cond;
    ObjectOperation rd_op;
    rd_op.stat(nullptr, nullptr, nullptr);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
		     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);

    // Write probe: an exclusive create (EEXIST still proves write access).
    C_SaferCond wr_cond;
    ObjectOperation wr_op;
    wr_op.create(true);

    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
		     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);

    // Drop the client lock while both probes complete.
    client_lock.unlock();
    int rd_ret = rd_cond.wait();
    int wr_ret = wr_cond.wait();
    client_lock.lock();

    bool errored = false;

    if (rd_ret == 0 || rd_ret == -CEPHFS_ENOENT)
      have |= POOL_READ;
    else if (rd_ret != -CEPHFS_EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (wr_ret == 0 || wr_ret == -CEPHFS_EEXIST)
      have |= POOL_WRITE;
    else if (wr_ret != -CEPHFS_EPERM) {
      ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
      errored = true;
    }

    if (errored) {
      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
      // Raise EIO because actual error code might be misleading for
      // userspace filesystem user.
      pool_perms.erase(perm_key);
      signal_cond_list(waiting_for_pool_perm);
      return -CEPHFS_EIO;
    }

    // Record the definitive result and wake any waiters.
    pool_perms[perm_key] = have | POOL_CHECKED;
    signal_cond_list(waiting_for_pool_perm);
  }

  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no read perm" << dendl;
    return -CEPHFS_EPERM;
  }
  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
    ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
		   << " need " << ccap_string(need) << ", but no write perm" << dendl;
    return -CEPHFS_EPERM;
  }

  return 0;
}
15420
15421int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
15422{
15423 if (acl_type == POSIX_ACL) {
15424 if (in->xattrs.count(ACL_EA_ACCESS)) {
15425 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
15426
15427 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
15428 }
15429 }
f67539c2 15430 return -CEPHFS_EAGAIN;
7c673cae
FG
15431}
15432
15433int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
15434{
15435 if (acl_type == NO_ACL)
15436 return 0;
15437
15438 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
15439 if (r < 0)
15440 goto out;
15441
15442 if (acl_type == POSIX_ACL) {
15443 if (in->xattrs.count(ACL_EA_ACCESS)) {
15444 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
15445 bufferptr acl(access_acl.c_str(), access_acl.length());
15446 r = posix_acl_access_chmod(acl, mode);
15447 if (r < 0)
15448 goto out;
15449 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
15450 } else {
15451 r = 0;
15452 }
15453 }
15454out:
15455 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
15456 return r;
15457}
15458
/**
 * Compute the ACL xattrs a new child of @p dir should be created with,
 * inheriting from the directory's default ACL per POSIX semantics.
 *
 * @param dir       parent directory whose default ACL is consulted
 * @param mode      in/out: requested mode; may be adjusted by ACL
 *                  inheritance or by the umask callback
 * @param xattrs_bl out: encoded map of xattrs to apply to the new inode
 *                  (only filled when there is at least one ACL to set)
 * @param perms     credentials used to refresh the directory's xattrs
 * @return number of xattrs encoded (>= 0) on success, negative errno on
 *         failure. Returns 0 without touching anything when ACLs are
 *         disabled or the new inode is a symlink.
 */
int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
			      const UserPerm& perms)
{
  if (acl_type == NO_ACL)
    return 0;

  // Symlinks never carry ACLs.
  if (S_ISLNK(*mode))
    return 0;

  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
  if (r < 0)
    goto out;

  if (acl_type == POSIX_ACL) {
    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
      map<string, bufferptr> xattrs;

      // Copy the default ACL so inheritance can modify it in place.
      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
      bufferptr acl(default_acl.c_str(), default_acl.length());
      r = posix_acl_inherit_mode(acl, mode);
      if (r < 0)
	goto out;

      if (r > 0) {
	// The inherited ACL is non-trivial; if it is not fully
	// representable by mode bits, it becomes the access ACL.
	r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
	if (r < 0)
	  goto out;
	if (r > 0)
	  xattrs[ACL_EA_ACCESS] = acl;
      }

      // Directories also inherit the default ACL itself.
      if (S_ISDIR(*mode))
	xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];

      r = xattrs.size();
      if (r > 0)
	encode(xattrs, xattrs_bl);
    } else {
      // No default ACL: apply the process umask via callback, if any.
      if (umask_cb)
	*mode &= ~umask_cb(callback_handle);
      r = 0;
    }
  }
out:
  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
  return r;
}
15506
15507void Client::set_filer_flags(int flags)
15508{
f67539c2 15509 std::scoped_lock l(client_lock);
11fdf7f2 15510 ceph_assert(flags == 0 ||
7c673cae
FG
15511 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
15512 objecter->add_global_op_flags(flags);
15513}
15514
15515void Client::clear_filer_flags(int flags)
15516{
f67539c2 15517 std::scoped_lock l(client_lock);
11fdf7f2 15518 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
7c673cae
FG
15519 objecter->clear_global_op_flag(flags);
15520}
15521
11fdf7f2
TL
// called before mount
// Record a non-empty uuid in the session metadata (used by the reclaim
// protocol to identify this client instance). Note: this also closes any
// existing MDS sessions so the new uuid takes effect on reconnect.
void Client::set_uuid(const std::string& uuid)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  ceph_assert(iref_reader.is_state_satisfied());

  std::scoped_lock l(client_lock);
  ceph_assert(!uuid.empty());

  metadata["uuid"] = uuid;
  _close_sessions();
}
15534
// called before mount. 0 means infinite
// Advertise the desired MDS session timeout (in seconds) through the
// session metadata sent at session open.
void Client::set_session_timeout(unsigned timeout)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  ceph_assert(iref_reader.is_state_satisfied());

  std::scoped_lock l(client_lock);

  metadata["timeout"] = stringify(timeout);
}
15545
// called before mount
/**
 * Reclaim the MDS sessions of a dead client instance identified by @p uuid
 * (e.g. for NFS-Ganesha style takeover).
 *
 * Walks every in-MDS rank, opening a session if needed and driving the
 * reclaim state machine via MClientReclaim messages until each rank has
 * either reclaimed or failed. Afterwards, unless CEPH_RECLAIM_RESET was
 * requested, waits for the reclaim OSD epoch and verifies the target
 * client's addresses are not blocklisted.
 *
 * @param uuid    uuid of the client instance to reclaim (must be non-empty
 *                and different from our own uuid)
 * @param flags   CEPH_RECLAIM_* flags
 * @param fs_name filesystem to subscribe to
 * @return 0 on success; negative CEPHFS error otherwise.
 */
int Client::start_reclaim(const std::string& uuid, unsigned flags,
			  const std::string& fs_name)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  if (uuid.empty())
    return -CEPHFS_EINVAL;

  std::unique_lock l(client_lock);
  {
    // Refuse to reclaim ourselves.
    auto it = metadata.find("uuid");
    if (it != metadata.end() && it->second == uuid)
      return -CEPHFS_EINVAL;
  }

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  if (metadata.empty())
    populate_metadata("");

  // Wait for an initial mdsmap before inspecting ranks.
  while (mdsmap->get_epoch() == 0)
    wait_on_list(waiting_for_mdsmap);

  reclaim_errno = 0;
  // Note: mds is only advanced once that rank's reclaim has completed;
  // the other branches wait and retry the same rank.
  for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
    if (!mdsmap->is_up(mds)) {
      ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
      wait_on_list(waiting_for_mdsmap);
      continue;
    }

    MetaSessionRef session;
    if (!have_open_session(mds)) {
      session = _get_or_open_mds_session(mds);
      if (session->state == MetaSession::STATE_REJECTED)
	return -CEPHFS_EPERM;
      if (session->state != MetaSession::STATE_OPENING) {
	// umounting?
	return -CEPHFS_EINVAL;
      }
      ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
      wait_on_context_list(session->waiting_for_open);
      continue;
    }

    session = mds_sessions.at(mds);
    if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
      return -CEPHFS_EOPNOTSUPP;

    if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
	session->reclaim_state == MetaSession::RECLAIMING) {
      // Kick off (or re-drive) the reclaim against this rank and wait for
      // handle_client_reclaim_reply() to update the state.
      session->reclaim_state = MetaSession::RECLAIMING;
      auto m = make_message<MClientReclaim>(uuid, flags);
      session->con->send_message2(std::move(m));
      wait_on_list(waiting_for_reclaim);
    } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
      return reclaim_errno ? : -CEPHFS_ENOTRECOVERABLE;
    } else {
      // RECLAIM_OK: move on to the next rank.
      mds++;
    }
  }

  // didn't find target session in any mds
  if (reclaim_target_addrs.empty()) {
    if (flags & CEPH_RECLAIM_RESET)
      return -CEPHFS_ENOENT;
    return -CEPHFS_ENOTRECOVERABLE;
  }

  if (flags & CEPH_RECLAIM_RESET)
    return 0;

  // use blocklist to check if target session was killed
  // (config option mds_session_blocklist_on_evict needs to be true)
  ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
  bs::error_code ec;
  l.unlock();
  objecter->wait_for_map(reclaim_osd_epoch, ca::use_blocked[ec]);
  l.lock();

  if (ec)
    return ceph::from_error_code(ec);

  bool blocklisted = objecter->with_osdmap(
      [this](const OSDMap &osd_map) -> bool {
	return osd_map.is_blocklisted(reclaim_target_addrs);
      });
  if (blocklisted)
    return -CEPHFS_ENOTRECOVERABLE;

  // Remember the uuid being reclaimed; finish_reclaim() adopts it.
  metadata["reclaiming_uuid"] = uuid;
  return 0;
}
15646
15647void Client::finish_reclaim()
15648{
15649 auto it = metadata.find("reclaiming_uuid");
15650 if (it == metadata.end()) {
15651 for (auto &p : mds_sessions)
20effc67 15652 p.second->reclaim_state = MetaSession::RECLAIM_NULL;
11fdf7f2
TL
15653 return;
15654 }
15655
15656 for (auto &p : mds_sessions) {
20effc67 15657 p.second->reclaim_state = MetaSession::RECLAIM_NULL;
9f95a23c 15658 auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
20effc67 15659 p.second->con->send_message2(std::move(m));
11fdf7f2
TL
15660 }
15661
15662 metadata["uuid"] = it->second;
15663 metadata.erase(it);
15664}
15665
15666void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
15667{
15668 mds_rank_t from = mds_rank_t(reply->get_source().num());
15669 ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;
15670
f67539c2 15671 std::scoped_lock cl(client_lock);
20effc67 15672 auto session = _get_mds_session(from, reply->get_connection().get());
11fdf7f2
TL
15673 if (!session) {
15674 ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
15675 return;
15676 }
15677
15678 if (reply->get_result() >= 0) {
15679 session->reclaim_state = MetaSession::RECLAIM_OK;
15680 if (reply->get_epoch() > reclaim_osd_epoch)
15681 reclaim_osd_epoch = reply->get_epoch();
15682 if (!reply->get_addrs().empty())
15683 reclaim_target_addrs = reply->get_addrs();
15684 } else {
15685 session->reclaim_state = MetaSession::RECLAIM_FAIL;
15686 reclaim_errno = reply->get_result();
15687 }
15688
15689 signal_cond_list(waiting_for_reclaim);
15690}
15691
7c673cae
FG
/**
 * This is included in cap release messages, to cause
 * the MDS to wait until this OSD map epoch. It is necessary
 * in corner cases where we cancel RADOS ops, so that
 * nobody else tries to do IO to the same objects in
 * the same epoch as the cancelled ops.
 *
 * @param e the OSD map epoch to barrier cap releases on
 */
void Client::set_cap_epoch_barrier(epoch_t e)
{
  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
  cap_epoch_barrier = e;
}
15704
15705const char** Client::get_tracked_conf_keys() const
15706{
15707 static const char* keys[] = {
15708 "client_cache_size",
15709 "client_cache_mid",
15710 "client_acl_type",
b32b8144
FG
15711 "client_deleg_timeout",
15712 "client_deleg_break_on_open",
f67539c2
TL
15713 "client_oc_size",
15714 "client_oc_max_objects",
15715 "client_oc_max_dirty",
15716 "client_oc_target_dirty",
15717 "client_oc_max_dirty_age",
7c673cae
FG
15718 NULL
15719 };
15720 return keys;
15721}
15722
/**
 * Config observer callback: apply runtime changes for the keys we track.
 * Each changed key is pushed to the relevant subsystem (LRU midpoint,
 * ACL mode, object cacher limits, metrics flag).
 */
void Client::handle_conf_change(const ConfigProxy& conf,
				const std::set <std::string> &changed)
{
  std::scoped_lock lock(client_lock);

  if (changed.count("client_cache_mid")) {
    lru.lru_set_midpoint(cct->_conf->client_cache_mid);
  }
  if (changed.count("client_acl_type")) {
    // Anything other than "posix_acl" disables ACL handling.
    acl_type = NO_ACL;
    if (cct->_conf->client_acl_type == "posix_acl")
      acl_type = POSIX_ACL;
  }
  if (changed.count("client_oc_size")) {
    objectcacher->set_max_size(cct->_conf->client_oc_size);
  }
  if (changed.count("client_oc_max_objects")) {
    objectcacher->set_max_objects(cct->_conf->client_oc_max_objects);
  }
  if (changed.count("client_oc_max_dirty")) {
    objectcacher->set_max_dirty(cct->_conf->client_oc_max_dirty);
  }
  if (changed.count("client_oc_target_dirty")) {
    objectcacher->set_target_dirty(cct->_conf->client_oc_target_dirty);
  }
  if (changed.count("client_oc_max_dirty_age")) {
    objectcacher->set_max_dirty_age(cct->_conf->client_oc_max_dirty_age);
  }
  if (changed.count("client_collect_and_send_global_metrics")) {
    _collect_and_send_global_metrics = cct->_conf.get_val<bool>(
      "client_collect_and_send_global_metrics");
  }
}
15756
7c673cae
FG
// boost::intrusive_ptr support: take a reference on the inode.
void intrusive_ptr_add_ref(Inode *in)
{
  in->iget();
}
f67539c2 15761
7c673cae
FG
// boost::intrusive_ptr support: drop a reference via the owning client.
void intrusive_ptr_release(Inode *in)
{
  in->client->put_inode(in);
}
15766
15767mds_rank_t Client::_get_random_up_mds() const
15768{
9f95a23c 15769 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae
FG
15770
15771 std::set<mds_rank_t> up;
15772 mdsmap->get_up_mds_set(up);
15773
15774 if (up.empty())
15775 return MDS_RANK_NONE;
15776 std::set<mds_rank_t>::const_iterator p = up.begin();
15777 for (int n = rand() % up.size(); n; n--)
15778 ++p;
15779 return *p;
15780}
15781
15782
f67539c2
TL
// A Client that owns its own Objecter (as opposed to sharing one), wiring
// it up to the supplied messenger/monclient.
StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc,
				   boost::asio::io_context& ictx)
  : Client(m, mc, new Objecter(m->cct, m, mc, ictx))
{
  monclient->set_messenger(m);
  objecter->set_client_incarnation(0);
}
15790
// Tear down the Objecter this standalone client created in its
// constructor.
StandaloneClient::~StandaloneClient()
{
  delete objecter;
  objecter = nullptr;
}
15796
/**
 * Initialize the standalone client: start the objecter, register
 * dispatchers, and bring up the monitor client. On monclient failure the
 * partially-initialized state (timer, objecter, object cacher, monclient)
 * is torn down before returning the error.
 *
 * @return 0 on success, negative error from MonClient::init() on failure.
 */
int StandaloneClient::init()
{
  RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
  ceph_assert(iref_writer.is_first_writer());

  _pre_init();
  objecter->init();

  client_lock.lock();

  messenger->add_dispatcher_tail(objecter);
  messenger->add_dispatcher_tail(this);

  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
  int r = monclient->init();
  if (r < 0) {
    // need to do cleanup because we're in an intermediate init state
    {
      std::scoped_lock l(timer_lock);
      timer.shutdown();
    }

    // Drop client_lock before the blocking shutdowns below.
    client_lock.unlock();
    objecter->shutdown();
    objectcacher->stop();
    monclient->shutdown();
    return r;
  }
  objecter->start();

  client_lock.unlock();
  _finish_init();
  iref_writer.update_state(CLIENT_INITIALIZED);

  return 0;
}
15833
// Shut down in reverse dependency order: base Client first, then the
// objecter and monclient this standalone variant owns.
void StandaloneClient::shutdown()
{
  Client::shutdown();
  objecter->shutdown();
  monclient->shutdown();
}