]> git.proxmox.com Git - ceph.git/blame - ceph/src/client/Client.cc
update ceph source to reef 18.2.0
[ceph.git] / ceph / src / client / Client.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15
16// unix-ey fs stuff
17#include <unistd.h>
18#include <sys/types.h>
19#include <time.h>
20#include <utime.h>
11fdf7f2 21#include <string.h>
7c673cae
FG
22#include <sys/stat.h>
23#include <sys/param.h>
24#include <fcntl.h>
25#include <sys/file.h>
f67539c2 26#ifndef _WIN32
7c673cae 27#include <sys/utsname.h>
f67539c2 28#endif
7c673cae
FG
29#include <sys/uio.h>
30
31#include <boost/lexical_cast.hpp>
32#include <boost/fusion/include/std_pair.hpp>
33
f67539c2
TL
34#include "common/async/waiter.h"
35
1e59de90 36#if defined(__FreeBSD__)
7c673cae
FG
37#define XATTR_CREATE 0x1
38#define XATTR_REPLACE 0x2
1e59de90 39#elif !defined(_WIN32)
7c673cae
FG
40#include <sys/xattr.h>
41#endif
42
43#if defined(__linux__)
44#include <linux/falloc.h>
45#endif
46
47#include <sys/statvfs.h>
48
49#include "common/config.h"
50#include "common/version.h"
f67539c2 51#include "common/async/blocked_completion.h"
7c673cae 52
11fdf7f2
TL
53#include "mon/MonClient.h"
54
55#include "messages/MClientCaps.h"
56#include "messages/MClientLease.h"
57#include "messages/MClientQuota.h"
58#include "messages/MClientReclaim.h"
59#include "messages/MClientReclaimReply.h"
7c673cae 60#include "messages/MClientReconnect.h"
11fdf7f2 61#include "messages/MClientReply.h"
7c673cae
FG
62#include "messages/MClientRequest.h"
63#include "messages/MClientRequestForward.h"
11fdf7f2 64#include "messages/MClientSession.h"
7c673cae 65#include "messages/MClientSnap.h"
f67539c2 66#include "messages/MClientMetrics.h"
7c673cae 67#include "messages/MCommandReply.h"
7c673cae
FG
68#include "messages/MFSMap.h"
69#include "messages/MFSMapUser.h"
11fdf7f2
TL
70#include "messages/MMDSMap.h"
71#include "messages/MOSDMap.h"
7c673cae
FG
72
73#include "mds/flock.h"
11fdf7f2 74#include "mds/cephfs_features.h"
7c673cae
FG
75#include "osd/OSDMap.h"
76#include "osdc/Filer.h"
77
78#include "common/Cond.h"
7c673cae
FG
79#include "common/perf_counters.h"
80#include "common/admin_socket.h"
81#include "common/errno.h"
82#include "include/str_list.h"
83
84#define dout_subsys ceph_subsys_client
85
86#include "include/lru.h"
87#include "include/compat.h"
88#include "include/stringify.h"
f67539c2 89#include "include/random.h"
7c673cae
FG
90
91#include "Client.h"
92#include "Inode.h"
93#include "Dentry.h"
b32b8144 94#include "Delegation.h"
7c673cae
FG
95#include "Dir.h"
96#include "ClientSnapRealm.h"
97#include "Fh.h"
98#include "MetaSession.h"
99#include "MetaRequest.h"
100#include "ObjecterWriteback.h"
101#include "posix_acl.h"
102
11fdf7f2 103#include "include/ceph_assert.h"
7c673cae
FG
104#include "include/stat.h"
105
e306af50 106#include "include/cephfs/ceph_ll_client.h"
7c673cae
FG
107
108#if HAVE_GETGROUPLIST
109#include <grp.h>
110#include <pwd.h>
111#include <unistd.h>
112#endif
113
114#undef dout_prefix
115#define dout_prefix *_dout << "client." << whoami << " "
116
117#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
118
119// FreeBSD fails to define this
120#ifndef O_DSYNC
121#define O_DSYNC 0x0
122#endif
123// Darwin fails to define this
124#ifndef O_RSYNC
125#define O_RSYNC 0x0
126#endif
127
128#ifndef O_DIRECT
129#define O_DIRECT 0x0
130#endif
131
f67539c2
TL
132// Windows doesn't define those values. While the Posix compatibilty layer
133// doesn't support those values, the Windows native functions do provide
134// similar flags. Special care should be taken if we're going to use those
135// flags in ceph-dokan. The current values are no-ops, while propagating
136// them to the rest of the code might cause the Windows functions to reject
137// them as invalid.
138#ifndef O_NOFOLLOW
139#define O_NOFOLLOW 0x0
140#endif
141
142#ifndef O_SYNC
143#define O_SYNC 0x0
144#endif
145
7c673cae
FG
146#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
147
b3b6e05e
TL
148#ifndef S_IXUGO
149#define S_IXUGO (S_IXUSR|S_IXGRP|S_IXOTH)
150#endif
151
20effc67
TL
152using std::dec;
153using std::hex;
154using std::list;
155using std::oct;
156using std::pair;
157using std::string;
158using std::vector;
159
adb31ebb
TL
160using namespace TOPNSPC::common;
161
f67539c2
TL
162namespace bs = boost::system;
163namespace ca = ceph::async;
164
7c673cae
FG
165void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
166{
167 Client *client = static_cast<Client*>(p);
168 client->flush_set_callback(oset);
169}
170
b3b6e05e
TL
171bool Client::is_reserved_vino(vinodeno_t &vino) {
172 if (MDS_IS_PRIVATE_INO(vino.ino)) {
173 ldout(cct, -1) << __func__ << " attempt to access reserved inode number " << vino << dendl;
174 return true;
175 }
176 return false;
177}
178
2a845540
TL
179// running average and standard deviation -- presented in
180// Donald Knuth's TAoCP, Volume II.
181double calc_average(double old_avg, double value, uint64_t count) {
182 double new_avg;
183 if (count == 1) {
184 new_avg = value;
185 } else {
186 new_avg = old_avg + ((value - old_avg) / count);
187 }
188
189 return new_avg;
190}
191
192double calc_sq_sum(double old_sq_sum, double old_mean, double new_mean,
193 double value, uint64_t count) {
194 double new_sq_sum;
195 if (count == 1) {
196 new_sq_sum = 0.0;
197 } else {
198 new_sq_sum = old_sq_sum + (value - old_mean)*(value - new_mean);
199 }
200
201 return new_sq_sum;
202}
7c673cae
FG
203
204// -------------
205
206Client::CommandHook::CommandHook(Client *client) :
207 m_client(client)
208{
209}
210
9f95a23c
TL
/*
 * Admin-socket command dispatcher for the client.
 *
 * Dispatches the registered commands ("mds_requests", "mds_sessions",
 * "dump_cache", "kick_stale_sessions", "status") to the matching dump/kick
 * helper on the owning Client, holding client_lock for the duration of the
 * command body.  Output is wrapped in a single "result" Formatter section.
 * Always returns 0; an unregistered command name aborts, since commands are
 * registered by this class itself in Client::_finish_init().
 */
int Client::CommandHook::call(
  std::string_view command,
  const cmdmap_t& cmdmap,
  const bufferlist&,
  Formatter *f,
  std::ostream& errss,
  bufferlist& out)
{
  f->open_object_section("result");
  {
    // all dump helpers expect client_lock to be held
    std::scoped_lock l{m_client->client_lock};
    if (command == "mds_requests")
      m_client->dump_mds_requests(f);
    else if (command == "mds_sessions") {
      // optional flag: also dump per-session capability details
      bool cap_dump = false;
      cmd_getval(cmdmap, "cap_dump", cap_dump);
      m_client->dump_mds_sessions(f, cap_dump);
    } else if (command == "dump_cache")
      m_client->dump_cache(f);
    else if (command == "kick_stale_sessions")
      m_client->_kick_stale_sessions();
    else if (command == "status")
      m_client->dump_status(f);
    else
      ceph_abort_msg("bad command registered");
  }
  f->close_section();
  return 0;
}
240
241
242// -------------
243
b3b6e05e
TL
244int Client::get_fd_inode(int fd, InodeRef *in) {
245 int r = 0;
246 if (fd == CEPHFS_AT_FDCWD) {
247 *in = cwd;
248 } else {
249 Fh *f = get_filehandle(fd);
250 if (!f) {
251 r = -CEPHFS_EBADF;
252 } else {
253 *in = f->inode;
254 }
255 }
256 return r;
257}
258
7c673cae
FG
// Fresh readdir state for an open directory: position starts at 0, the
// first real dentry offset is 2 (0 and 1 are "." and ".."), and the
// shared-gen / count fields start cleared until the first MDS readdir.
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
  { }
264
/*
 * (Re)initialize the fake-inode allocator.  The free pool covers
 * [1024, 2^32); inos below 1024 are never handed out, and 1024..2047 are
 * reserved for mount roots (see _assign_faked_root).  Faked inos are used
 * when the platform ino_t is too small to hold a 64-bit ceph ino, or when
 * explicitly enabled by configuration.
 */
void Client::_reset_faked_inos()
{
  ino_t start = 1024;
  free_faked_inos.clear();
  free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
  last_used_faked_ino = 0;
  last_used_faked_root = 0;
  #ifdef _WIN32
  // On Windows, sizeof(ino_t) is just 2. Despite that, most "native"
  // Windows structures, including Dokan ones, are using 64B identifiers.
  _use_faked_inos = false;
  #else
  _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
  #endif
}
280
/*
 * Hand out the next free fake inode number to 'in' and record the
 * fake-ino -> vino mapping.  Allocation scans forward from the last
 * assignment and wraps back to 2048 when the top of the pool is reached
 * (1024..2047 stay reserved for mount roots).
 */
void Client::_assign_faked_ino(Inode *in)
{
  if (0 == last_used_faked_ino)
    last_used_faked_ino = last_used_faked_ino + 2048; // start(1024)~2048 reserved for _assign_faked_root
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    // wrapped around: restart the scan just above the reserved root range
    last_used_faked_ino = 2048;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  // pool exhaustion is not handled gracefully; assert a free extent exists
  ceph_assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    // next free extent starts past our cursor: jump to its first ino
    ceph_assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    // cursor is inside the free extent: take the next ino in sequence
    ++last_used_faked_ino;
    ceph_assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
302
11fdf7f2
TL
/*
 * In the faked mode, if you export multiple subdirectories,
 * you will see that the inode numbers of the exported subdirectories
 * are the same. so we distinguish the mount point by reserving
 * the "fake ids" between "1024~2048" and combining the last
 * 10bits(0x3ff) of the "root inodes".
 */
void Client::_assign_faked_root(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  if (it == free_faked_inos.end() && last_used_faked_root > 0) {
    // wrapped: rescan the reserved range from the beginning
    last_used_faked_root = 0;
    it = free_faked_inos.lower_bound(last_used_faked_root + 1);
  }
  ceph_assert(it != free_faked_inos.end());
  vinodeno_t inode_info = in->vino();
  uint64_t inode_num = (uint64_t)inode_info.ino;
  ldout(cct, 10) << "inode_num " << inode_num << "inode_num & 0x3ff=" << (inode_num & 0x3ff)<< dendl;
  // fold the low 10 bits of the real root ino into the reserved window
  last_used_faked_root = it.get_start() + (inode_num & 0x3ff); // 0x3ff mask and get_start will not exceed 2048
  ceph_assert(it.get_start() + it.get_len() > last_used_faked_root);

  in->faked_ino = last_used_faked_root;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
328
7c673cae
FG
329void Client::_release_faked_ino(Inode *in)
330{
331 free_faked_inos.insert(in->faked_ino);
332 faked_ino_map.erase(in->faked_ino);
333}
334
335vinodeno_t Client::_map_faked_ino(ino_t ino)
336{
337 vinodeno_t vino;
338 if (ino == 1)
339 vino = root->vino();
340 else if (faked_ino_map.count(ino))
341 vino = faked_ino_map[ino];
342 else
343 vino = vinodeno_t(0, CEPH_NOSNAP);
11fdf7f2 344 ldout(cct, 10) << __func__ << " " << ino << " -> " << vino << dendl;
7c673cae
FG
345 return vino;
346}
347
348vinodeno_t Client::map_faked_ino(ino_t ino)
349{
f67539c2 350 std::scoped_lock lock(client_lock);
7c673cae
FG
351 return _map_faked_ino(ino);
352}
353
354// cons/des
355
/*
 * Construct a Client bound to an existing Messenger, MonClient and
 * Objecter.  Only cheap setup happens here (config snapshot, allocator
 * reset, writeback/objectcacher construction); threads, timers and
 * network registration are deferred to init()/_pre_init().
 */
Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
  : Dispatcher(m->cct->get()),
    timer(m->cct, timer_lock, false),
    messenger(m),
    monclient(mc),
    objecter(objecter_),
    whoami(mc->get_global_id()),
    mount_state(CLIENT_UNMOUNTED, "Client::mountstate_lock"),
    initialize_state(CLIENT_NEW, "Client::initstate_lock"),
    cct_deleter{m->cct, [](CephContext *p) {p->put();}},
    async_ino_invalidator(m->cct),
    async_dentry_invalidator(m->cct),
    interrupt_finisher(m->cct),
    remount_finisher(m->cct),
    async_ino_releasor(m->cct),
    objecter_finisher(m->cct),
    m_command_hook(this),
    fscid(0)
{
  _reset_faked_inos();

  // snapshot relevant config values at construction time
  user_id = cct->_conf->client_mount_uid;
  group_id = cct->_conf->client_mount_gid;
  fuse_default_permissions = cct->_conf.get_val<bool>(
    "fuse_default_permissions");

  _collect_and_send_global_metrics = cct->_conf.get_val<bool>(
    "client_collect_and_send_global_metrics");

  mount_timeout = cct->_conf.get_val<std::chrono::seconds>(
    "client_mount_timeout");

  caps_release_delay = cct->_conf.get_val<std::chrono::seconds>(
    "client_caps_release_delay");

  if (cct->_conf->client_acl_type == "posix_acl")
    acl_type = POSIX_ACL;

  lru.lru_set_midpoint(cct->_conf->client_cache_mid);

  // file handles: fds 0-9 are never handed out (mirrors stdio fds)
  free_fd_set.insert(10, 1<<30);

  mdsmap.reset(new MDSMap);

  // osd interfaces: writeback handler feeds the object cacher, which
  // shares client_lock with us
  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
						&client_lock));
  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
				      client_flush_set_callback,    // all commit callback
				      (void*)this,
				      cct->_conf->client_oc_size,
				      cct->_conf->client_oc_max_objects,
				      cct->_conf->client_oc_max_dirty,
				      cct->_conf->client_oc_target_dirty,
				      cct->_conf->client_oc_max_dirty_age,
				      true));
}
414
415
/*
 * Destructor: stop the upkeep (tick) thread, then tear down the cache
 * under client_lock.  Normal teardown happens via unmount()/shutdown();
 * this path also covers crashed/aborted tasks that never got there.
 */
Client::~Client()
{
  ceph_assert(ceph_mutex_is_not_locked(client_lock));

  // If the task crashed or aborted it may never have had a chance to run
  // umount and shutdown; make sure the tick thread is told to stop.
  {
    std::scoped_lock l{client_lock};
    tick_thread_stopped = true;
    upkeep_cond.notify_one();
  }

  if (upkeeper.joinable())
    upkeeper.join();

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  std::scoped_lock l{client_lock};
  tear_down_cache();
}
437
/*
 * Forcibly drop all cached state: open file handles, open directories,
 * the dentry LRU, and finally the root inode.  Caller holds client_lock.
 * After this runs, the inode map must be empty.
 */
void Client::tear_down_cache()
{
  // fd's: force-close every open file handle
  for (auto &[fd, fh] : fd_map) {
    ldout(cct, 1) << __func__ << " forcing close of fh " << fd << " ino " << fh->inode->ino << dendl;
    _release_fh(fh);
  }
  fd_map.clear();

  // force-close every open directory stream
  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 1) << __func__ << " forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
    _closedir(dirp);
  }

  // caps!
  // *** FIXME ***

  // empty lru
  trim_cache();
  ceph_assert(lru.lru_get_size() == 0);

  // close root ino; by now only root (and any root parents from
  // multi-subdir mounts) should remain in the inode map
  ceph_assert(inode_map.size() <= 1 + root_parents.size());
  if (root && inode_map.size() == 1 + root_parents.size()) {
    root.reset();
  }

  ceph_assert(inode_map.empty());
}
468
469inodeno_t Client::get_root_ino()
470{
f67539c2 471 std::scoped_lock l(client_lock);
7c673cae
FG
472 if (use_faked_inos())
473 return root->faked_ino;
474 else
475 return root->ino;
476}
477
478Inode *Client::get_root()
479{
f67539c2 480 std::scoped_lock l(client_lock);
7c673cae 481 root->ll_get();
b3b6e05e 482 return root.get();
7c673cae
FG
483}
484
485
486// debug crapola
487
/*
 * Debug dump of one inode (and, recursively, everything under its dir).
 * Logs a one-line summary and, when a Formatter is supplied, emits a
 * structured "inode" section.  'did' records visited inodes so
 * dump_cache() can later find anything disconnected from the tree.
 */
void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
{
  filepath path;
  in->make_long_path(path);
  ldout(cct, 1) << "dump_inode: "
		<< (disconnected ? "DISCONNECTED ":"")
		<< "inode " << in->ino
		<< " " << path
		<< " ref " << in->get_nref()
		<< " " << *in << dendl;

  if (f) {
    f->open_object_section("inode");
    f->dump_stream("path") << path;
    if (disconnected)
      f->dump_int("disconnected", 1);
    in->dump(f);
    f->close_section();
  }

  did.insert(in);
  if (in->dir) {
    // walk every dentry in this directory and recurse into child inodes
    ldout(cct, 1) << "  dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
    for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
         it != in->dir->dentries.end();
         ++it) {
      ldout(cct, 1) << "   " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
      if (f) {
	f->open_object_section("dentry");
	it->second->dump(f);
	f->close_section();
      }	
      if (it->second->inode)
	dump_inode(f, it->second->inode.get(), did, false);
    }
  }
}
525
526void Client::dump_cache(Formatter *f)
527{
528 set<Inode*> did;
529
11fdf7f2 530 ldout(cct, 1) << __func__ << dendl;
7c673cae
FG
531
532 if (f)
533 f->open_array_section("cache");
534
535 if (root)
b3b6e05e 536 dump_inode(f, root.get(), did, true);
7c673cae
FG
537
538 // make a second pass to catch anything disconnected
539 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
540 it != inode_map.end();
541 ++it) {
542 if (did.count(it->second))
543 continue;
544 dump_inode(f, it->second, did, true);
545 }
546
547 if (f)
548 f->close_section();
549}
550
/*
 * Emit overall client status (identity, cache sizes, map epochs,
 * blocklist state) for the "status" admin-socket command.
 * Caller must hold client_lock.
 */
void Client::dump_status(Formatter *f)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 1) << __func__ << dendl;

  const epoch_t osd_epoch
    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));

  if (f) {
    // client metadata as sent to the MDS at session open
    f->open_object_section("metadata");
    for (const auto& kv : metadata)
      f->dump_string(kv.first.c_str(), kv.second);
    f->close_section();

    f->dump_int("dentry_count", lru.lru_get_size());
    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
    f->dump_int("id", get_nodeid().v);
    entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
    f->dump_object("inst", inst);
    f->dump_object("addr", inst.addr);
    f->dump_stream("inst_str") << inst.name << " " << inst.addr.get_legacy_str();
    f->dump_string("addr_str", inst.addr.get_legacy_str());
    f->dump_int("inode_count", inode_map.size());
    f->dump_int("mds_epoch", mdsmap->get_epoch());
    f->dump_int("osd_epoch", osd_epoch);
    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
    f->dump_bool("blocklisted", blocklisted);
    f->dump_string("fs_name", mdsmap->get_fs_name());
  }
}
582
e306af50 583void Client::_pre_init()
7c673cae
FG
584{
585 timer.init();
e306af50
TL
586
587 objecter_finisher.start();
588 filer.reset(new Filer(objecter, &objecter_finisher));
e306af50 589
7c673cae 590 objectcacher->start();
e306af50
TL
591}
592
/*
 * One-time client initialization: transition the initialize_state to
 * INITIALIZING (asserting we are the only initializer), start timers and
 * helper threads, register with the messenger, finish setup
 * (perf counters, admin-socket commands) and mark INITIALIZED.
 * Always returns 0.
 */
int Client::init()
{
  RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
  ceph_assert(iref_writer.is_first_writer());

  _pre_init();
  {
    std::scoped_lock l{client_lock};
    messenger->add_dispatcher_tail(this);
  }
  _finish_init();
  iref_writer.update_state(CLIENT_INITIALIZED);
  return 0;
}
607
608void Client::_finish_init()
609{
9f95a23c 610 {
f67539c2 611 std::scoped_lock l{client_lock};
9f95a23c
TL
612 // logger
613 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
614 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
615 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
616 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
617 plb.add_time_avg(l_c_read, "rdlat", "Latency of a file data read operation");
618 plb.add_time_avg(l_c_fsync, "fsync", "Latency of a file sync operation");
2a845540
TL
619 // average, standard deviation mds/r/w/ latencies
620 plb.add_time(l_c_md_avg, "mdavg", "Average latency for processing metadata requests");
621 plb.add_u64(l_c_md_sqsum, "mdsqsum", "Sum of squares (to calculate variability/stdev) for metadata requests");
622 plb.add_u64(l_c_md_ops, "mdops", "Total metadata IO operations");
623 plb.add_time(l_c_rd_avg, "readavg", "Average latency for processing read requests");
624 plb.add_u64(l_c_rd_sqsum, "readsqsum", "Sum of squares ((to calculate variability/stdev) for read requests");
625 plb.add_u64(l_c_rd_ops, "rdops", "Total read IO operations");
626 plb.add_time(l_c_wr_avg, "writeavg", "Average latency for processing write requests");
627 plb.add_u64(l_c_wr_sqsum, "writesqsum", "Sum of squares ((to calculate variability/stdev) for write requests");
628 plb.add_u64(l_c_wr_ops, "rdops", "Total write IO operations");
9f95a23c
TL
629 logger.reset(plb.create_perf_counters());
630 cct->get_perfcounters_collection()->add(logger.get());
631 }
7c673cae 632
11fdf7f2 633 cct->_conf.add_observer(this);
7c673cae
FG
634
635 AdminSocket* admin_socket = cct->get_admin_socket();
636 int ret = admin_socket->register_command("mds_requests",
7c673cae
FG
637 &m_command_hook,
638 "show in-progress mds requests");
639 if (ret < 0) {
640 lderr(cct) << "error registering admin socket command: "
641 << cpp_strerror(-ret) << dendl;
642 }
adb31ebb
TL
643 ret = admin_socket->register_command("mds_sessions "
644 "name=cap_dump,type=CephBool,req=false",
7c673cae
FG
645 &m_command_hook,
646 "show mds session state");
647 if (ret < 0) {
648 lderr(cct) << "error registering admin socket command: "
649 << cpp_strerror(-ret) << dendl;
650 }
651 ret = admin_socket->register_command("dump_cache",
7c673cae
FG
652 &m_command_hook,
653 "show in-memory metadata cache contents");
654 if (ret < 0) {
655 lderr(cct) << "error registering admin socket command: "
656 << cpp_strerror(-ret) << dendl;
657 }
658 ret = admin_socket->register_command("kick_stale_sessions",
7c673cae
FG
659 &m_command_hook,
660 "kick sessions that were remote reset");
661 if (ret < 0) {
662 lderr(cct) << "error registering admin socket command: "
663 << cpp_strerror(-ret) << dendl;
664 }
665 ret = admin_socket->register_command("status",
7c673cae
FG
666 &m_command_hook,
667 "show overall client status");
668 if (ret < 0) {
669 lderr(cct) << "error registering admin socket command: "
670 << cpp_strerror(-ret) << dendl;
671 }
7c673cae
FG
672}
673
/*
 * Tear the client down: close sessions, stop every callback finisher,
 * stop the object cacher, block new readers of initialize_state, shut
 * the timer down and unregister perf counters.  Ordering matters:
 * finishers and the object cacher are stopped outside client_lock
 * because stopping them joins threads that may need the lock.
 */
void Client::shutdown()
{
  ldout(cct, 1) << __func__ << dendl;

  // If we were not mounted, but were being used for sending
  // MDS commands, we may have sessions that need closing.
  {
    std::scoped_lock l{client_lock};

    // To make sure the tick thread will be stopped before
    // destructing the Client, just in case like the _mount()
    // failed but didn't get a chance to stop the tick
    // thread
    tick_thread_stopped = true;
    upkeep_cond.notify_one();

    _close_sessions();
  }
  cct->_conf.remove_observer(this);

  cct->get_admin_socket()->unregister_commands(&m_command_hook);

  if (ino_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
    async_ino_invalidator.wait_for_empty();
    async_ino_invalidator.stop();
  }

  if (dentry_invalidate_cb) {
    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
    async_dentry_invalidator.wait_for_empty();
    async_dentry_invalidator.stop();
  }

  if (switch_interrupt_cb) {
    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
    interrupt_finisher.wait_for_empty();
    interrupt_finisher.stop();
  }

  if (remount_cb) {
    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
  }

  if (ino_release_cb) {
    ldout(cct, 10) << "shutdown stopping inode release finisher" << dendl;
    async_ino_releasor.wait_for_empty();
    async_ino_releasor.stop();
  }

  objectcacher->stop();  // outside of client_lock! this does a join.

  /*
   * We are shutting down the client.
   *
   * Just declare the state to CLIENT_NEW to block and fail any
   * new incoming "reader" and then try to wait all the in-flight
   * "readers" to finish.
   */
  RWRef_t iref_writer(initialize_state, CLIENT_NEW, false);
  if (!iref_writer.is_first_writer())
    return;
  iref_writer.wait_readers_done();

  {
    std::scoped_lock l(timer_lock);
    timer.shutdown();
  }

  objecter_finisher.wait_for_empty();
  objecter_finisher.stop();

  if (logger) {
    cct->get_perfcounters_collection()->remove(logger.get());
    logger.reset();
  }
}
753
2a845540
TL
/*
 * Fold one metadata-request latency sample into the running mean /
 * sum-of-squares perf counters.  The old average must be read before
 * the counters are updated, since calc_sq_sum needs both the old and
 * the new mean.  nr_metadata_request is assumed to already count this
 * sample (it is the running sample count).
 */
void Client::update_io_stat_metadata(utime_t latency) {
  auto lat_nsec = latency.to_nsec();
  // old values are used to compute new ones
  auto o_avg = logger->tget(l_c_md_avg).to_nsec();
  auto o_sqsum = logger->get(l_c_md_sqsum);

  auto n_avg = calc_average(o_avg, lat_nsec, nr_metadata_request);
  auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec,
                             nr_metadata_request);

  logger->tinc(l_c_lat, latency);
  logger->tinc(l_c_reply, latency);

  utime_t avg;
  // nanoseconds back to seconds for the time-valued counter
  avg.set_from_double(n_avg / 1000000000);
  logger->tset(l_c_md_avg, avg);
  logger->set(l_c_md_sqsum, n_sqsum);
  logger->set(l_c_md_ops, nr_metadata_request);
}
773
/*
 * Fold one read latency sample into the running read-latency mean /
 * sum-of-squares perf counters (same scheme as
 * update_io_stat_metadata, keyed on nr_read_request).
 */
void Client::update_io_stat_read(utime_t latency) {
  auto lat_nsec = latency.to_nsec();
  // old values are used to compute new ones
  auto o_avg = logger->tget(l_c_rd_avg).to_nsec();
  auto o_sqsum = logger->get(l_c_rd_sqsum);

  auto n_avg = calc_average(o_avg, lat_nsec, nr_read_request);
  auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec,
                             nr_read_request);

  logger->tinc(l_c_read, latency);

  utime_t avg;
  // nanoseconds back to seconds for the time-valued counter
  avg.set_from_double(n_avg / 1000000000);
  logger->tset(l_c_rd_avg, avg);
  logger->set(l_c_rd_sqsum, n_sqsum);
  logger->set(l_c_rd_ops, nr_read_request);
}
792
/*
 * Fold one write latency sample into the running write-latency mean /
 * sum-of-squares perf counters (same scheme as
 * update_io_stat_metadata, keyed on nr_write_request).
 */
void Client::update_io_stat_write(utime_t latency) {
  auto lat_nsec = latency.to_nsec();
  // old values are used to compute new ones
  auto o_avg = logger->tget(l_c_wr_avg).to_nsec();
  auto o_sqsum = logger->get(l_c_wr_sqsum);

  auto n_avg = calc_average(o_avg, lat_nsec, nr_write_request);
  auto n_sqsum = calc_sq_sum(o_sqsum, o_avg, n_avg, lat_nsec,
                             nr_write_request);

  logger->tinc(l_c_wrlat, latency);

  utime_t avg;
  // nanoseconds back to seconds for the time-valued counter
  avg.set_from_double(n_avg / 1000000000);
  logger->tset(l_c_wr_avg, avg);
  logger->set(l_c_wr_sqsum, n_sqsum);
  logger->set(l_c_wr_ops, nr_write_request);
}
7c673cae
FG
811
812// ===================
813// metadata cache stuff
814
/*
 * Trim the dentry LRU down to client_cache_size (or completely while
 * unmounting).  Optionally asks the kernel to drop its dcache too, and
 * releases the root inode once it is the only thing left.
 */
void Client::trim_cache(bool trim_kernel_dcache)
{
  uint64_t max = cct->_conf->client_cache_size;
  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
  unsigned last = 0;
  // loop until a full pass trims nothing (size stops changing)
  while (lru.lru_get_size() != last) {
    last = lru.lru_get_size();

    // while unmounting we trim unconditionally; otherwise stop at max
    if (!is_unmounting() && lru.lru_get_size() <= max) break;

    // trim!
    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
    if (!dn)
      break;  // done

    trim_dentry(dn);
  }

  if (trim_kernel_dcache && lru.lru_get_size() > max)
    _invalidate_kernel_dcache();

  // hose root?
  if (lru.lru_get_size() == 0 && root && root->get_nref() == 1 && inode_map.size() == 1 + root_parents.size()) {
    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
    root.reset();
  }
}
842
843void Client::trim_cache_for_reconnect(MetaSession *s)
844{
845 mds_rank_t mds = s->mds_num;
11fdf7f2 846 ldout(cct, 20) << __func__ << " mds." << mds << dendl;
7c673cae
FG
847
848 int trimmed = 0;
849 list<Dentry*> skipped;
850 while (lru.lru_get_size() > 0) {
851 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
852 if (!dn)
853 break;
854
855 if ((dn->inode && dn->inode->caps.count(mds)) ||
856 dn->dir->parent_inode->caps.count(mds)) {
857 trim_dentry(dn);
858 trimmed++;
859 } else
860 skipped.push_back(dn);
861 }
862
863 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
864 lru.lru_insert_mid(*p);
865
11fdf7f2 866 ldout(cct, 20) << __func__ << " mds." << mds
7c673cae
FG
867 << " trimmed " << trimmed << " dentries" << dendl;
868
869 if (s->caps.size() > 0)
870 _invalidate_kernel_dcache();
871}
872
/*
 * Drop a single dentry from the cache.  If it links an inode, the parent
 * directory loses its "complete & ordered" readdir state first, since
 * its contents are no longer fully cached.
 */
void Client::trim_dentry(Dentry *dn)
{
  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
		 << " in dir "
		 << std::hex << dn->dir->parent_inode->ino << std::dec
		 << dendl;
  if (dn->inode) {
    Inode *diri = dn->dir->parent_inode;
    clear_dir_complete_and_ordered(diri, true);
  }
  unlink(dn, false, false);  // drop dir, drop dentry
}
885
886
1adf2230
AA
/*
 * Apply size/truncate state received from the MDS to a cached inode.
 * Newer truncate_seq (or same seq with a larger size) wins; shrinking
 * invalidates the now-stale tail of the object cache and trims any
 * inline data.  truncate_size is updated independently for files when
 * the sequence is at least as new.
 */
void Client::update_inode_file_size(Inode *in, int issued, uint64_t size,
				    uint64_t truncate_seq, uint64_t truncate_size)
{
  uint64_t prior_size = in->size;

  if (truncate_seq > in->truncate_seq ||
      (truncate_seq == in->truncate_seq && size > in->size)) {
    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
    in->size = size;
    in->reported_size = size;
    if (truncate_seq != in->truncate_seq) {
      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
		     << truncate_seq << dendl;
      in->truncate_seq = truncate_seq;
      in->oset.truncate_seq = truncate_seq;

      // truncate cached file data
      if (prior_size > size) {
	_invalidate_inode_cache(in, size, prior_size - size);
      }
    }

    // truncate inline data
    if (in->inline_version < CEPH_INLINE_NONE) {
      uint32_t len = in->inline_data.length();
      if (size < len)
	in->inline_data.splice(size, len - size);
    }
  }
  if (truncate_seq >= in->truncate_seq &&
      in->truncate_size != truncate_size) {
    if (in->is_file()) {
      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
		     << truncate_size << dendl;
      in->truncate_size = truncate_size;
      in->oset.truncate_size = truncate_size;
    } else {
      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
    }
  }
}
928
/*
 * Apply ctime/mtime/atime from the MDS to a cached inode, arbitrated by
 * time_warp_seq.  If we hold caps that let us dirty times locally
 * (EXCL/WR/BUFFER/...), only strictly-newer MDS values are taken and
 * equal-seq values are merged by max; otherwise the MDS values win
 * whenever its seq is at least ours.  A lower MDS seq is logged as a
 * warning (unless we hold FILE_EXCL, where our newer seq is expected).
 */
void Client::update_inode_file_time(Inode *in, int issued, uint64_t time_warp_seq,
				    utime_t ctime, utime_t mtime, utime_t atime)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << ccap_string(issued)
		 << " ctime " << ctime << " mtime " << mtime << dendl;

  if (time_warp_seq > in->time_warp_seq)
    ldout(cct, 10) << " mds time_warp_seq " << time_warp_seq
		   << " is higher than local time_warp_seq "
		   << in->time_warp_seq << dendl;

  int warn = false;
  // be careful with size, mtime, atime
  if (issued & (CEPH_CAP_FILE_EXCL|
		CEPH_CAP_FILE_WR|
		CEPH_CAP_FILE_BUFFER|
		CEPH_CAP_AUTH_EXCL|
		CEPH_CAP_XATTR_EXCL)) {
    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
    if (ctime > in->ctime)
      in->ctime = ctime;
    if (time_warp_seq > in->time_warp_seq) {
      //the mds updated times, so take those!
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else if (time_warp_seq == in->time_warp_seq) {
      //take max times
      if (mtime > in->mtime)
	in->mtime = mtime;
      if (atime > in->atime)
	in->atime = atime;
    } else if (issued & CEPH_CAP_FILE_EXCL) {
      //ignore mds values as we have a higher seq
    } else warn = true;
  } else {
    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
    if (time_warp_seq >= in->time_warp_seq) {
      in->ctime = ctime;
      in->mtime = mtime;
      in->atime = atime;
      in->time_warp_seq = time_warp_seq;
    } else warn = true;
  }
  if (warn) {
    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
		  << time_warp_seq << " is lower than local time_warp_seq "
		  << in->time_warp_seq
		  << dendl;
  }
}
980
981void Client::_fragmap_remove_non_leaves(Inode *in)
982{
983 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
984 if (!in->dirfragtree.is_leaf(p->first))
985 in->fragmap.erase(p++);
986 else
987 ++p;
988}
989
990void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
991{
992 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
993 if (p->second == mds)
994 in->fragmap.erase(p++);
995 else
996 ++p;
997}
998
999Inode * Client::add_update_inode(InodeStat *st, utime_t from,
1000 MetaSession *session,
1001 const UserPerm& request_perms)
1002{
1003 Inode *in;
1004 bool was_new = false;
1005 if (inode_map.count(st->vino)) {
1006 in = inode_map[st->vino];
11fdf7f2 1007 ldout(cct, 12) << __func__ << " had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
7c673cae
FG
1008 } else {
1009 in = new Inode(this, st->vino, &st->layout);
1010 inode_map[st->vino] = in;
1011
1012 if (use_faked_inos())
1013 _assign_faked_ino(in);
1014
1015 if (!root) {
1016 root = in;
11fdf7f2 1017 if (use_faked_inos())
b3b6e05e 1018 _assign_faked_root(root.get());
7c673cae
FG
1019 root_ancestor = in;
1020 cwd = root;
f67539c2 1021 } else if (is_mounting()) {
7c673cae
FG
1022 root_parents[root_ancestor] = in;
1023 root_ancestor = in;
1024 }
1025
1026 // immutable bits
1027 in->ino = st->vino.ino;
1028 in->snapid = st->vino.snapid;
1029 in->mode = st->mode & S_IFMT;
1030 was_new = true;
1031 }
1032
1033 in->rdev = st->rdev;
1034 if (in->is_symlink())
1035 in->symlink = st->symlink;
1036
7c673cae 1037 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
1adf2230
AA
1038 bool new_version = false;
1039 if (in->version == 0 ||
1040 ((st->cap.flags & CEPH_CAP_FLAG_AUTH) &&
1041 (in->version & ~1) < st->version))
1042 new_version = true;
7c673cae 1043
1adf2230
AA
1044 int issued;
1045 in->caps_issued(&issued);
1046 issued |= in->caps_dirty();
1047 int new_issued = ~issued & (int)st->cap.caps;
7c673cae 1048
39ae355f 1049 bool need_snapdir_attr_refresh = false;
1adf2230
AA
1050 if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) &&
1051 !(issued & CEPH_CAP_AUTH_EXCL)) {
1052 in->mode = st->mode;
1053 in->uid = st->uid;
1054 in->gid = st->gid;
1055 in->btime = st->btime;
81eedcae 1056 in->snap_btime = st->snap_btime;
f67539c2 1057 in->snap_metadata = st->snap_metadata;
1e59de90 1058 in->fscrypt_auth = st->fscrypt_auth;
39ae355f 1059 need_snapdir_attr_refresh = true;
1adf2230 1060 }
7c673cae 1061
1adf2230
AA
1062 if ((new_version || (new_issued & CEPH_CAP_LINK_SHARED)) &&
1063 !(issued & CEPH_CAP_LINK_EXCL)) {
1064 in->nlink = st->nlink;
1065 }
7c673cae 1066
1adf2230 1067 if (new_version || (new_issued & CEPH_CAP_ANY_RD)) {
39ae355f 1068 need_snapdir_attr_refresh = true;
1adf2230
AA
1069 update_inode_file_time(in, issued, st->time_warp_seq,
1070 st->ctime, st->mtime, st->atime);
1071 }
7c673cae 1072
1adf2230
AA
1073 if (new_version ||
1074 (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
7c673cae 1075 in->layout = st->layout;
1e59de90 1076 in->fscrypt_file = st->fscrypt_file;
1adf2230
AA
1077 update_inode_file_size(in, issued, st->size, st->truncate_seq, st->truncate_size);
1078 }
7c673cae 1079
1adf2230
AA
1080 if (in->is_dir()) {
1081 if (new_version || (new_issued & CEPH_CAP_FILE_SHARED)) {
1082 in->dirstat = st->dirstat;
1083 }
1084 // dir_layout/rstat/quota are not tracked by capability, update them only if
1085 // the inode stat is from auth mds
1086 if (new_version || (st->cap.flags & CEPH_CAP_FLAG_AUTH)) {
7c673cae
FG
1087 in->dir_layout = st->dir_layout;
1088 ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
1adf2230
AA
1089 in->rstat = st->rstat;
1090 in->quota = st->quota;
11fdf7f2 1091 in->dir_pin = st->dir_pin;
1adf2230
AA
1092 }
1093 // move me if/when version reflects fragtree changes.
1094 if (in->dirfragtree != st->dirfragtree) {
1095 in->dirfragtree = st->dirfragtree;
1096 _fragmap_remove_non_leaves(in);
7c673cae 1097 }
7c673cae
FG
1098 }
1099
1100 if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
1101 st->xattrbl.length() &&
1102 st->xattr_version > in->xattr_version) {
11fdf7f2
TL
1103 auto p = st->xattrbl.cbegin();
1104 decode(in->xattrs, p);
7c673cae 1105 in->xattr_version = st->xattr_version;
39ae355f 1106 need_snapdir_attr_refresh = true;
7c673cae
FG
1107 }
1108
1adf2230
AA
1109 if (st->inline_version > in->inline_version) {
1110 in->inline_data = st->inline_data;
1111 in->inline_version = st->inline_version;
7c673cae
FG
1112 }
1113
1adf2230 1114 /* always take a newer change attr */
39ae355f 1115 ldout(cct, 12) << __func__ << " client inode change_attr: " << in->change_attr << " , mds inodestat change_attr: " << st->change_attr << dendl;
1adf2230
AA
1116 if (st->change_attr > in->change_attr)
1117 in->change_attr = st->change_attr;
1118
1119 if (st->version > in->version)
1120 in->version = st->version;
1121
1122 if (was_new)
1123 ldout(cct, 12) << __func__ << " adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
1124
1125 if (!st->cap.caps)
1126 return in; // as with readdir returning indoes in different snaprealms (no caps!)
1127
7c673cae 1128 if (in->snapid == CEPH_NOSNAP) {
a8e16298
TL
1129 add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.wanted,
1130 st->cap.seq, st->cap.mseq, inodeno_t(st->cap.realm),
1131 st->cap.flags, request_perms);
28e407b8 1132 if (in->auth_cap && in->auth_cap->session == session) {
7c673cae 1133 in->max_size = st->max_size;
28e407b8
AA
1134 in->rstat = st->rstat;
1135 }
7c673cae 1136
1adf2230
AA
1137 // setting I_COMPLETE needs to happen after adding the cap
1138 if (in->is_dir() &&
1139 (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
1140 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
1141 in->dirstat.nfiles == 0 &&
1142 in->dirstat.nsubdirs == 0) {
1143 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
1144 in->flags |= I_COMPLETE | I_DIR_ORDERED;
1145 if (in->dir) {
1146 ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
1147 << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
1148 in->dir->readdir_cache.clear();
1149 for (const auto& p : in->dir->dentries) {
1150 unlink(p.second, true, true); // keep dir, keep dentry
1151 }
1152 if (in->dir->dentries.empty())
1153 close_dir(in->dir);
7c673cae 1154 }
7c673cae 1155 }
1adf2230
AA
1156 } else {
1157 in->snap_caps |= st->cap.caps;
7c673cae
FG
1158 }
1159
39ae355f
TL
1160 if (need_snapdir_attr_refresh && in->is_dir() && in->snapid == CEPH_NOSNAP) {
1161 vinodeno_t vino(in->ino, CEPH_SNAPDIR);
1162 if (inode_map.count(vino)) {
1163 refresh_snapdir_attrs(inode_map[vino], in);
1164 }
1165 }
1166
7c673cae
FG
1167 return in;
1168}
1169
1170
1171/*
1172 * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
1173 */
1174Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
1175 Inode *in, utime_t from, MetaSession *session,
1176 Dentry *old_dentry)
1177{
1178 Dentry *dn = NULL;
1179 if (dir->dentries.count(dname))
1180 dn = dir->dentries[dname];
1181
11fdf7f2 1182 ldout(cct, 12) << __func__ << " '" << dname << "' vino " << in->vino()
7c673cae
FG
1183 << " in dir " << dir->parent_inode->vino() << " dn " << dn
1184 << dendl;
1185
1186 if (dn && dn->inode) {
1187 if (dn->inode->vino() == in->vino()) {
1188 touch_dn(dn);
1189 ldout(cct, 12) << " had dentry " << dname
1190 << " with correct vino " << dn->inode->vino()
1191 << dendl;
1192 } else {
1193 ldout(cct, 12) << " had dentry " << dname
1194 << " with WRONG vino " << dn->inode->vino()
1195 << dendl;
1196 unlink(dn, true, true); // keep dir, keep dentry
1197 }
1198 }
1199
1200 if (!dn || !dn->inode) {
1201 InodeRef tmp_ref(in);
1202 if (old_dentry) {
1203 if (old_dentry->dir != dir) {
1204 Inode *old_diri = old_dentry->dir->parent_inode;
7c673cae
FG
1205 clear_dir_complete_and_ordered(old_diri, false);
1206 }
1207 unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if its the same dir
1208 }
1209 Inode *diri = dir->parent_inode;
7c673cae
FG
1210 clear_dir_complete_and_ordered(diri, false);
1211 dn = link(dir, dname, in, dn);
1212 }
1213
1214 update_dentry_lease(dn, dlease, from, session);
1215 return dn;
1216}
1217
1218void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
1219{
1220 utime_t dttl = from;
1221 dttl += (float)dlease->duration_ms / 1000.0;
f67539c2
TL
1222
1223 ldout(cct, 15) << __func__ << " " << *dn << " " << *dlease << " from " << from << dendl;
7c673cae 1224
11fdf7f2 1225 ceph_assert(dn);
7c673cae 1226
9f95a23c 1227 if (dlease->mask & CEPH_LEASE_VALID) {
7c673cae
FG
1228 if (dttl > dn->lease_ttl) {
1229 ldout(cct, 10) << "got dentry lease on " << dn->name
1230 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
1231 dn->lease_ttl = dttl;
1232 dn->lease_mds = session->mds_num;
1233 dn->lease_seq = dlease->seq;
1234 dn->lease_gen = session->cap_gen;
1235 }
1236 }
1237 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
f91f0fd5
TL
1238 if (dlease->mask & CEPH_LEASE_PRIMARY_LINK)
1239 dn->mark_primary();
f67539c2 1240 dn->alternate_name = std::move(dlease->alternate_name);
7c673cae
FG
1241}
1242
1243
1244/*
1245 * update MDS location cache for a single inode
1246 */
522d829b 1247void Client::update_dir_dist(Inode *in, DirStat *dst, mds_rank_t from)
7c673cae
FG
1248{
1249 // auth
1250 ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
1251 if (dst->auth >= 0) {
1252 in->fragmap[dst->frag] = dst->auth;
1253 } else {
1254 in->fragmap.erase(dst->frag);
1255 }
1256 if (!in->dirfragtree.is_leaf(dst->frag)) {
1257 in->dirfragtree.force_to_leaf(cct, dst->frag);
1258 _fragmap_remove_non_leaves(in);
1259 }
1260
522d829b
TL
1261 // replicated, only update from auth mds reply
1262 if (from == dst->auth) {
1263 in->dir_replicated = !dst->dist.empty();
1264 if (!dst->dist.empty())
1265 in->frag_repmap[dst->frag].assign(dst->dist.begin(), dst->dist.end()) ;
1266 else
1267 in->frag_repmap.erase(dst->frag);
1268 }
7c673cae
FG
1269}
1270
1271void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1272{
f91f0fd5
TL
1273 if (complete)
1274 diri->dir_release_count++;
1275 else
1276 diri->dir_ordered_count++;
7c673cae
FG
1277 if (diri->flags & I_COMPLETE) {
1278 if (complete) {
1279 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1280 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1281 } else {
1282 if (diri->flags & I_DIR_ORDERED) {
1283 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1284 diri->flags &= ~I_DIR_ORDERED;
1285 }
1286 }
1287 if (diri->dir)
1288 diri->dir->readdir_cache.clear();
1289 }
1290}
1291
1292/*
1293 * insert results from readdir or lssnap into the metadata cache.
1294 */
1295void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {
1296
11fdf7f2 1297 auto& reply = request->reply;
7c673cae 1298 ConnectionRef con = request->reply->get_connection();
11fdf7f2
TL
1299 uint64_t features;
1300 if(session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
1301 features = (uint64_t)-1;
1302 }
1303 else {
1304 features = con->get_features();
1305 }
7c673cae
FG
1306
1307 dir_result_t *dirp = request->dirp;
11fdf7f2 1308 ceph_assert(dirp);
7c673cae
FG
1309
1310 // the extra buffer list is only set for readdir and lssnap replies
11fdf7f2 1311 auto p = reply->get_extra_bl().cbegin();
7c673cae
FG
1312 if (!p.end()) {
1313 // snapdir?
1314 if (request->head.op == CEPH_MDS_OP_LSSNAP) {
11fdf7f2 1315 ceph_assert(diri);
7c673cae
FG
1316 diri = open_snapdir(diri);
1317 }
1318
1319 // only open dir if we're actually adding stuff to it!
1320 Dir *dir = diri->open_dir();
11fdf7f2 1321 ceph_assert(dir);
7c673cae
FG
1322
1323 // dirstat
11fdf7f2 1324 DirStat dst(p, features);
7c673cae
FG
1325 __u32 numdn;
1326 __u16 flags;
11fdf7f2
TL
1327 decode(numdn, p);
1328 decode(flags, p);
7c673cae
FG
1329
1330 bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
1331 bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);
1332
1333 frag_t fg = (unsigned)request->head.args.readdir.frag;
1334 unsigned readdir_offset = dirp->next_offset;
1335 string readdir_start = dirp->last_name;
11fdf7f2 1336 ceph_assert(!readdir_start.empty() || readdir_offset == 2);
7c673cae
FG
1337
1338 unsigned last_hash = 0;
1339 if (hash_order) {
1340 if (!readdir_start.empty()) {
1341 last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
1342 } else if (flags & CEPH_READDIR_OFFSET_HASH) {
1343 /* mds understands offset_hash */
1344 last_hash = (unsigned)request->head.args.readdir.offset_hash;
1345 }
1346 }
1347
1348 if (fg != dst.frag) {
1349 ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
1350 fg = dst.frag;
1351 if (!hash_order) {
1352 readdir_offset = 2;
1353 readdir_start.clear();
1354 dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
1355 }
1356 }
1357
1358 ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
1359 << ", hash_order=" << hash_order
1360 << ", readdir_start " << readdir_start
1361 << ", last_hash " << last_hash
1362 << ", next_offset " << readdir_offset << dendl;
1363
1364 if (diri->snapid != CEPH_SNAPDIR &&
1365 fg.is_leftmost() && readdir_offset == 2 &&
1366 !(hash_order && last_hash)) {
1367 dirp->release_count = diri->dir_release_count;
1368 dirp->ordered_count = diri->dir_ordered_count;
1369 dirp->start_shared_gen = diri->shared_gen;
1370 dirp->cache_index = 0;
1371 }
1372
1373 dirp->buffer_frag = fg;
1374
1375 _readdir_drop_dirp_buffer(dirp);
1376 dirp->buffer.reserve(numdn);
1377
1378 string dname;
1379 LeaseStat dlease;
1380 for (unsigned i=0; i<numdn; i++) {
11fdf7f2
TL
1381 decode(dname, p);
1382 dlease.decode(p, features);
7c673cae
FG
1383 InodeStat ist(p, features);
1384
1385 ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;
1386
1387 Inode *in = add_update_inode(&ist, request->sent_stamp, session,
1388 request->perms);
1389 Dentry *dn;
1390 if (diri->dir->dentries.count(dname)) {
1391 Dentry *olddn = diri->dir->dentries[dname];
1392 if (olddn->inode != in) {
1393 // replace incorrect dentry
1394 unlink(olddn, true, true); // keep dir, dentry
1395 dn = link(dir, dname, in, olddn);
11fdf7f2 1396 ceph_assert(dn == olddn);
7c673cae
FG
1397 } else {
1398 // keep existing dn
1399 dn = olddn;
1400 touch_dn(dn);
1401 }
1402 } else {
1403 // new dn
1404 dn = link(dir, dname, in, NULL);
1405 }
f67539c2 1406 dn->alternate_name = std::move(dlease.alternate_name);
7c673cae
FG
1407
1408 update_dentry_lease(dn, &dlease, request->sent_stamp, session);
1409 if (hash_order) {
1410 unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
1411 if (hash != last_hash)
1412 readdir_offset = 2;
1413 last_hash = hash;
1414 dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
1415 } else {
1416 dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
1417 }
1418 // add to readdir cache
1419 if (dirp->release_count == diri->dir_release_count &&
1420 dirp->ordered_count == diri->dir_ordered_count &&
1421 dirp->start_shared_gen == diri->shared_gen) {
1422 if (dirp->cache_index == dir->readdir_cache.size()) {
1423 if (i == 0) {
11fdf7f2 1424 ceph_assert(!dirp->inode->is_complete_and_ordered());
7c673cae
FG
1425 dir->readdir_cache.reserve(dirp->cache_index + numdn);
1426 }
1427 dir->readdir_cache.push_back(dn);
1428 } else if (dirp->cache_index < dir->readdir_cache.size()) {
1429 if (dirp->inode->is_complete_and_ordered())
11fdf7f2 1430 ceph_assert(dir->readdir_cache[dirp->cache_index] == dn);
7c673cae
FG
1431 else
1432 dir->readdir_cache[dirp->cache_index] = dn;
1433 } else {
11fdf7f2 1434 ceph_abort_msg("unexpected readdir buffer idx");
7c673cae
FG
1435 }
1436 dirp->cache_index++;
1437 }
1438 // add to cached result list
f67539c2 1439 dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, dn->alternate_name, in));
7c673cae
FG
1440 ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
1441 }
1442
1443 if (numdn > 0)
1444 dirp->last_name = dname;
1445 if (end)
1446 dirp->next_offset = 2;
1447 else
1448 dirp->next_offset = readdir_offset;
1449
1450 if (dir->is_empty())
1451 close_dir(dir);
1452 }
1453}
1454
1455/** insert_trace
1456 *
1457 * insert a trace from a MDS reply into the cache.
1458 */
1459Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
1460{
11fdf7f2 1461 auto& reply = request->reply;
7c673cae
FG
1462 int op = request->get_op();
1463
1464 ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
1465 << " is_target=" << (int)reply->head.is_target
1466 << " is_dentry=" << (int)reply->head.is_dentry
1467 << dendl;
1468
11fdf7f2 1469 auto p = reply->get_trace_bl().cbegin();
7c673cae
FG
1470 if (request->got_unsafe) {
1471 ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
11fdf7f2 1472 ceph_assert(p.end());
7c673cae
FG
1473 return NULL;
1474 }
1475
1476 if (p.end()) {
1477 ldout(cct, 10) << "insert_trace -- no trace" << dendl;
1478
1479 Dentry *d = request->dentry();
1480 if (d) {
1481 Inode *diri = d->dir->parent_inode;
7c673cae
FG
1482 clear_dir_complete_and_ordered(diri, true);
1483 }
1484
1485 if (d && reply->get_result() == 0) {
1486 if (op == CEPH_MDS_OP_RENAME) {
1487 // rename
1488 Dentry *od = request->old_dentry();
1489 ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
11fdf7f2 1490 ceph_assert(od);
7c673cae
FG
1491 unlink(od, true, true); // keep dir, dentry
1492 } else if (op == CEPH_MDS_OP_RMDIR ||
1493 op == CEPH_MDS_OP_UNLINK) {
1494 // unlink, rmdir
1495 ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
1496 unlink(d, true, true); // keep dir, dentry
1497 }
1498 }
1499 return NULL;
1500 }
1501
1502 ConnectionRef con = request->reply->get_connection();
11fdf7f2
TL
1503 uint64_t features;
1504 if (session->mds_features.test(CEPHFS_FEATURE_REPLY_ENCODING)) {
1505 features = (uint64_t)-1;
1506 }
1507 else {
1508 features = con->get_features();
1509 }
7c673cae
FG
1510 ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;
1511
1512 // snap trace
1513 SnapRealm *realm = NULL;
1514 if (reply->snapbl.length())
1e59de90 1515 update_snap_trace(session, reply->snapbl, &realm);
7c673cae
FG
1516
1517 ldout(cct, 10) << " hrm "
1518 << " is_target=" << (int)reply->head.is_target
1519 << " is_dentry=" << (int)reply->head.is_dentry
1520 << dendl;
1521
1522 InodeStat dirst;
1523 DirStat dst;
1524 string dname;
1525 LeaseStat dlease;
1526 InodeStat ist;
1527
1528 if (reply->head.is_dentry) {
1529 dirst.decode(p, features);
11fdf7f2
TL
1530 dst.decode(p, features);
1531 decode(dname, p);
1532 dlease.decode(p, features);
7c673cae
FG
1533 }
1534
1535 Inode *in = 0;
1536 if (reply->head.is_target) {
1537 ist.decode(p, features);
1538 if (cct->_conf->client_debug_getattr_caps) {
1539 unsigned wanted = 0;
1540 if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
1541 wanted = request->head.args.getattr.mask;
1542 else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
1543 wanted = request->head.args.open.mask;
1544
1545 if ((wanted & CEPH_CAP_XATTR_SHARED) &&
1546 !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
11fdf7f2 1547 ceph_abort_msg("MDS reply does not contain xattrs");
7c673cae
FG
1548 }
1549
1550 in = add_update_inode(&ist, request->sent_stamp, session,
1551 request->perms);
1552 }
1553
1554 Inode *diri = NULL;
1555 if (reply->head.is_dentry) {
1556 diri = add_update_inode(&dirst, request->sent_stamp, session,
1557 request->perms);
522d829b
TL
1558 mds_rank_t from_mds = mds_rank_t(reply->get_source().num());
1559 update_dir_dist(diri, &dst, from_mds); // dir stat info is attached to ..
7c673cae
FG
1560
1561 if (in) {
1562 Dir *dir = diri->open_dir();
1563 insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
1564 (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
1565 } else {
1566 Dentry *dn = NULL;
1567 if (diri->dir && diri->dir->dentries.count(dname)) {
1568 dn = diri->dir->dentries[dname];
1569 if (dn->inode) {
7c673cae
FG
1570 clear_dir_complete_and_ordered(diri, false);
1571 unlink(dn, true, true); // keep dir, dentry
1572 }
1573 }
1574 if (dlease.duration_ms > 0) {
1575 if (!dn) {
1576 Dir *dir = diri->open_dir();
1577 dn = link(dir, dname, NULL, NULL);
1578 }
1579 update_dentry_lease(dn, &dlease, request->sent_stamp, session);
1580 }
1581 }
1582 } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
1583 op == CEPH_MDS_OP_MKSNAP) {
1584 ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
1585 // fake it for snap lookup
1586 vinodeno_t vino = ist.vino;
1587 vino.snapid = CEPH_SNAPDIR;
11fdf7f2 1588 ceph_assert(inode_map.count(vino));
7c673cae
FG
1589 diri = inode_map[vino];
1590
1591 string dname = request->path.last_dentry();
1592
1593 LeaseStat dlease;
1594 dlease.duration_ms = 0;
1595
1596 if (in) {
1597 Dir *dir = diri->open_dir();
1598 insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
1599 } else {
1600 if (diri->dir && diri->dir->dentries.count(dname)) {
1601 Dentry *dn = diri->dir->dentries[dname];
1602 if (dn->inode)
1603 unlink(dn, true, true); // keep dir, dentry
1604 }
1605 }
1606 }
1607
1608 if (in) {
1609 if (op == CEPH_MDS_OP_READDIR ||
1610 op == CEPH_MDS_OP_LSSNAP) {
1611 insert_readdir_results(request, session, in);
1612 } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
1613 // hack: return parent inode instead
1614 in = diri;
1615 }
1616
1617 if (request->dentry() == NULL && in != request->inode()) {
1618 // pin the target inode if its parent dentry is not pinned
1619 request->set_other_inode(in);
1620 }
1621 }
1622
1623 if (realm)
1624 put_snap_realm(realm);
1625
1626 request->target = in;
1627 return in;
1628}
1629
1630// -------
1631
1632mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
1633{
1634 mds_rank_t mds = MDS_RANK_NONE;
1635 __u32 hash = 0;
1636 bool is_hash = false;
2a845540 1637 int issued = 0;
7c673cae
FG
1638
1639 Inode *in = NULL;
1640 Dentry *de = NULL;
7c673cae
FG
1641
1642 if (req->resend_mds >= 0) {
1643 mds = req->resend_mds;
1644 req->resend_mds = -1;
11fdf7f2 1645 ldout(cct, 10) << __func__ << " resend_mds specified as mds." << mds << dendl;
7c673cae
FG
1646 goto out;
1647 }
1648
1649 if (cct->_conf->client_use_random_mds)
1650 goto random_mds;
1651
1652 in = req->inode();
1653 de = req->dentry();
1654 if (in) {
11fdf7f2 1655 ldout(cct, 20) << __func__ << " starting with req->inode " << *in << dendl;
7c673cae
FG
1656 if (req->path.depth()) {
1657 hash = in->hash_dentry_name(req->path[0]);
11fdf7f2 1658 ldout(cct, 20) << __func__ << " inode dir hash is " << (int)in->dir_layout.dl_dir_hash
7c673cae
FG
1659 << " on " << req->path[0]
1660 << " => " << hash << dendl;
1661 is_hash = true;
1662 }
1663 } else if (de) {
1664 if (de->inode) {
1665 in = de->inode.get();
11fdf7f2 1666 ldout(cct, 20) << __func__ << " starting with req->dentry inode " << *in << dendl;
7c673cae
FG
1667 } else {
1668 in = de->dir->parent_inode;
1669 hash = in->hash_dentry_name(de->name);
11fdf7f2 1670 ldout(cct, 20) << __func__ << " dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
7c673cae
FG
1671 << " on " << de->name
1672 << " => " << hash << dendl;
1673 is_hash = true;
1674 }
1675 }
1676 if (in) {
1677 if (in->snapid != CEPH_NOSNAP) {
11fdf7f2 1678 ldout(cct, 10) << __func__ << " " << *in << " is snapped, using nonsnap parent" << dendl;
7c673cae
FG
1679 while (in->snapid != CEPH_NOSNAP) {
1680 if (in->snapid == CEPH_SNAPDIR)
1681 in = in->snapdir_parent.get();
11fdf7f2 1682 else if (!in->dentries.empty())
7c673cae
FG
1683 /* In most cases there will only be one dentry, so getting it
1684 * will be the correct action. If there are multiple hard links,
1685 * I think the MDS should be able to redirect as needed*/
1686 in = in->get_first_parent()->dir->parent_inode;
1687 else {
1688 ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
1689 break;
1690 }
1691 }
1692 is_hash = false;
1693 }
1694
11fdf7f2 1695 ldout(cct, 20) << __func__ << " " << *in << " is_hash=" << is_hash
7c673cae
FG
1696 << " hash=" << hash << dendl;
1697
2a845540
TL
1698 if (req->get_op() == CEPH_MDS_OP_GETATTR)
1699 issued = req->inode()->caps_issued();
1700
f67539c2 1701 if (is_hash && S_ISDIR(in->mode) && (!in->fragmap.empty() || !in->frag_repmap.empty())) {
7c673cae 1702 frag_t fg = in->dirfragtree[hash];
2a845540 1703 if (!req->auth_is_best(issued)) {
f67539c2
TL
1704 auto repmapit = in->frag_repmap.find(fg);
1705 if (repmapit != in->frag_repmap.end()) {
1706 auto& repmap = repmapit->second;
1707 auto r = ceph::util::generate_random_number<uint64_t>(0, repmap.size()-1);
1708 mds = repmap.at(r);
1709 }
1710 } else if (in->fragmap.count(fg)) {
7c673cae
FG
1711 mds = in->fragmap[fg];
1712 if (phash_diri)
1713 *phash_diri = in;
91327a77 1714 } else if (in->auth_cap) {
f67539c2 1715 req->send_to_auth = true;
91327a77
AA
1716 mds = in->auth_cap->session->mds_num;
1717 }
1718 if (mds >= 0) {
11fdf7f2 1719 ldout(cct, 10) << __func__ << " from dirfragtree hash" << dendl;
7c673cae
FG
1720 goto out;
1721 }
1722 }
1723
2a845540 1724 if (in->auth_cap && req->auth_is_best(issued)) {
11fdf7f2
TL
1725 mds = in->auth_cap->session->mds_num;
1726 } else if (!in->caps.empty()) {
1727 mds = in->caps.begin()->second.session->mds_num;
1728 } else {
7c673cae 1729 goto random_mds;
11fdf7f2
TL
1730 }
1731 ldout(cct, 10) << __func__ << " from caps on inode " << *in << dendl;
7c673cae
FG
1732
1733 goto out;
1734 }
1735
1736random_mds:
1737 if (mds < 0) {
1738 mds = _get_random_up_mds();
1739 ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
1740 }
1741
1742out:
1743 ldout(cct, 20) << "mds is " << mds << dendl;
1744 return mds;
1745}
1746
7c673cae
FG
1747void Client::connect_mds_targets(mds_rank_t mds)
1748{
11fdf7f2
TL
1749 ldout(cct, 10) << __func__ << " for mds." << mds << dendl;
1750 ceph_assert(mds_sessions.count(mds));
7c673cae 1751 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
f67539c2
TL
1752 for (const auto &rank : info.export_targets) {
1753 if (mds_sessions.count(rank) == 0 &&
1754 mdsmap->is_clientreplay_or_active_or_stopping(rank)) {
7c673cae 1755 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
f67539c2 1756 << " export target mds." << rank << dendl;
1e59de90
TL
1757
1758 auto session = _get_or_open_mds_session(rank);
1759 if (session->state == MetaSession::STATE_OPENING ||
1760 session->state == MetaSession::STATE_OPEN)
1761 continue;
1762
f67539c2 1763 _open_mds_session(rank);
7c673cae
FG
1764 }
1765 }
1766}
1767
adb31ebb 1768void Client::dump_mds_sessions(Formatter *f, bool cap_dump)
7c673cae
FG
1769{
1770 f->dump_int("id", get_nodeid().v);
11fdf7f2 1771 entity_inst_t inst(messenger->get_myname(), messenger->get_myaddr_legacy());
1adf2230
AA
1772 f->dump_object("inst", inst);
1773 f->dump_stream("inst_str") << inst;
1774 f->dump_stream("addr_str") << inst.addr;
7c673cae 1775 f->open_array_section("sessions");
11fdf7f2 1776 for (const auto &p : mds_sessions) {
7c673cae 1777 f->open_object_section("session");
20effc67 1778 p.second->dump(f, cap_dump);
7c673cae
FG
1779 f->close_section();
1780 }
1781 f->close_section();
1782 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1783}
f67539c2 1784
7c673cae
FG
1785void Client::dump_mds_requests(Formatter *f)
1786{
1787 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1788 p != mds_requests.end();
1789 ++p) {
1790 f->open_object_section("request");
1791 p->second->dump(f);
1792 f->close_section();
1793 }
1794}
1795
9f95a23c 1796int Client::verify_reply_trace(int r, MetaSession *session,
11fdf7f2 1797 MetaRequest *request, const MConstRef<MClientReply>& reply,
7c673cae
FG
1798 InodeRef *ptarget, bool *pcreated,
1799 const UserPerm& perms)
1800{
1801 // check whether this request actually did the create, and set created flag
1802 bufferlist extra_bl;
1803 inodeno_t created_ino;
1804 bool got_created_ino = false;
1805 ceph::unordered_map<vinodeno_t, Inode*>::iterator p;
1806
11fdf7f2 1807 extra_bl = reply->get_extra_bl();
7c673cae 1808 if (extra_bl.length() >= 8) {
9f95a23c
TL
1809 if (session->mds_features.test(CEPHFS_FEATURE_DELEG_INO)) {
1810 struct openc_response_t ocres;
1811
1812 decode(ocres, extra_bl);
1813 created_ino = ocres.created_ino;
1814 /*
1815 * The userland cephfs client doesn't have a way to do an async create
1816 * (yet), so just discard delegated_inos for now. Eventually we should
1817 * store them and use them in create calls, even if they are synchronous,
1818 * if only for testing purposes.
1819 */
1820 ldout(cct, 10) << "delegated_inos: " << ocres.delegated_inos << dendl;
1821 } else {
1822 // u64 containing number of created ino
1823 decode(created_ino, extra_bl);
1824 }
7c673cae 1825 ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
9f95a23c 1826 got_created_ino = true;
7c673cae
FG
1827 }
1828
1829 if (pcreated)
1830 *pcreated = got_created_ino;
1831
1832 if (request->target) {
1833 *ptarget = request->target;
1834 ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
1835 } else {
1836 if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
1837 (*ptarget) = p->second;
1838 ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
1839 } else {
1840 // we got a traceless reply, and need to look up what we just
1841 // created. for now, do this by name. someday, do this by the
1842 // ino... which we know! FIXME.
1843 InodeRef target;
1844 Dentry *d = request->dentry();
1845 if (d) {
1846 if (d->dir) {
1847 ldout(cct, 10) << "make_request got traceless reply, looking up #"
1848 << d->dir->parent_inode->ino << "/" << d->name
1849 << " got_ino " << got_created_ino
1850 << " ino " << created_ino
1851 << dendl;
1852 r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
1853 &target, perms);
1854 } else {
1855 // if the dentry is not linked, just do our best. see #5021.
11fdf7f2 1856 ceph_abort_msg("how did this happen? i want logs!");
7c673cae
FG
1857 }
1858 } else {
1859 Inode *in = request->inode();
1860 ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
1861 << in->ino << dendl;
1862 r = _getattr(in, request->regetattr_mask, perms, true);
1863 target = in;
1864 }
1865 if (r >= 0) {
1866 // verify ino returned in reply and trace_dist are the same
1867 if (got_created_ino &&
1868 created_ino.val != target->ino.val) {
1869 ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
f67539c2 1870 r = -CEPHFS_EINTR;
7c673cae
FG
1871 }
1872 if (ptarget)
1873 ptarget->swap(target);
1874 }
1875 }
1876 }
1877
1878 return r;
1879}
1880
1881
1882/**
1883 * make a request
1884 *
1885 * Blocking helper to make an MDS request.
1886 *
1887 * If the ptarget flag is set, behavior changes slightly: the caller
1888 * expects to get a pointer to the inode we are creating or operating
1889 * on. As a result, we will follow up any traceless mutation reply
1890 * with a getattr or lookup to transparently handle a traceless reply
1891 * from the MDS (as when the MDS restarts and the client has to replay
1892 * a request).
1893 *
1894 * @param request the MetaRequest to execute
1895 * @param perms The user uid/gid to execute as (eventually, full group lists?)
1896 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
1897 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
1898 * @param use_mds [optional] prefer a specific mds (-1 for default)
1899 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
1900 */
// Blocking helper: register the request, pick an MDS, ensure a session,
// send, and sleep on a per-request condvar until a reply / forward / kick
// arrives.  client_lock is held on entry; the wait temporarily adopts and
// then releases it so the lock is still held when we return.
1901int Client::make_request(MetaRequest *request,
1902 const UserPerm& perms,
1903 InodeRef *ptarget, bool *pcreated,
1904 mds_rank_t use_mds,
39ae355f
TL
1905 bufferlist *pdirbl,
1906 size_t feature_needed)
7c673cae
FG
1907{
1908 int r = 0;
1909
1910 // assign a unique tid
1911 ceph_tid_t tid = ++last_tid;
1912 request->set_tid(tid);
1913
1914 // and timestamp
1915 request->op_stamp = ceph_clock_now();
2a845540 1916 request->created = ceph::coarse_mono_clock::now();
7c673cae
FG
1917
1918 // make note
// SETFILELOCK ops can block indefinitely, so they never hold back
// oldest_tid (which the MDS uses to trim completed-request state).
1919 mds_requests[tid] = request->get();
1920 if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
1921 oldest_tid = tid;
1922
1923 request->set_caller_perms(perms);
1924
1925 if (cct->_conf->client_inject_fixed_oldest_tid) {
1926 ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
1927 request->set_oldest_client_tid(1);
1928 } else {
1929 request->set_oldest_client_tid(oldest_tid);
1930 }
1931
1932 // hack target mds?
1933 if (use_mds >= 0)
1934 request->resend_mds = use_mds;
1935
20effc67 1936 MetaSessionRef session = NULL;
// Retry loop: each iteration (re)chooses a target MDS, waits for an open
// session, sends, and blocks.  Exits on reply or abort.
7c673cae
FG
1937 while (1) {
1938 if (request->aborted())
1939 break;
1940
f67539c2
TL
1941 if (blocklisted) {
1942 request->abort(-CEPHFS_EBLOCKLISTED);
31f18b77
FG
1943 break;
1944 }
1945
7c673cae 1946 // set up wait cond
9f95a23c 1947 ceph::condition_variable caller_cond;
7c673cae
FG
1948 request->caller_cond = &caller_cond;
1949
1950 // choose mds
1951 Inode *hash_diri = NULL;
1952 mds_rank_t mds = choose_target_mds(request, &hash_diri);
1953 int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
1954 if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
// A rank beyond max_mds with no state means the MDS was stopped;
// re-target rather than waiting for a map that will never help.
1955 if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
1956 if (hash_diri) {
1957 ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
1958 _fragmap_remove_stopped_mds(hash_diri, mds);
1959 } else {
1960 ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
1961 request->resend_mds = _get_random_up_mds();
1962 }
1963 } else {
1964 ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
1965 wait_on_list(waiting_for_mdsmap);
1966 }
1967 continue;
1968 }
1969
1970 // open a session?
7c673cae
FG
1971 if (!have_open_session(mds)) {
1972 session = _get_or_open_mds_session(mds);
f6b5b4d7 1973 if (session->state == MetaSession::STATE_REJECTED) {
f67539c2 1974 request->abort(-CEPHFS_EPERM);
f6b5b4d7
TL
1975 break;
1976 }
7c673cae
FG
1977 // wait
1978 if (session->state == MetaSession::STATE_OPENING) {
1979 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
1980 wait_on_context_list(session->waiting_for_open);
7c673cae
FG
1981 continue;
1982 }
1983
1984 if (!have_open_session(mds))
1985 continue;
1986 } else {
20effc67 1987 session = mds_sessions.at(mds);
7c673cae
FG
1988 }
1989
39ae355f
TL
1990 if (feature_needed != ULONG_MAX && !session->mds_features.test(feature_needed)) {
1991 request->abort(-CEPHFS_EOPNOTSUPP);
1992 break;
1993 }
1994
7c673cae 1995 // send request.
20effc67 1996 send_request(request, session.get());
7c673cae
FG
1997
1998 // wait for signal
1999 ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
2000 request->kick = false;
// Adopt the already-held client_lock into a unique_lock for the condvar
// wait, then release() so ownership stays with the caller's scope.
9f95a23c
TL
2001 std::unique_lock l{client_lock, std::adopt_lock};
2002 caller_cond.wait(l, [request] {
2003 return (request->reply || // reply
2004 request->resend_mds >= 0 || // forward
2005 request->kick);
2006 });
2007 l.release();
2008 request->caller_cond = nullptr;
7c673cae
FG
2009
2010 // did we get a reply?
39ae355f 2011 if (request->reply)
7c673cae
FG
2012 break;
2013 }
2014
// No reply can only mean the request was aborted above (and never got an
// unsafe reply); clean up and propagate the abort code.
2015 if (!request->reply) {
11fdf7f2
TL
2016 ceph_assert(request->aborted());
2017 ceph_assert(!request->got_unsafe);
7c673cae
FG
2018 r = request->get_abort_code();
2019 request->item.remove_myself();
2020 unregister_request(request);
11fdf7f2 2021 put_request(request);
7c673cae
FG
2022 return r;
2023 }
2024
2025 // got it!
11fdf7f2 2026 auto reply = std::move(request->reply);
7c673cae
FG
2027 r = reply->get_result();
2028 if (r >= 0)
2029 request->success = true;
2030
// handle_client_reply() is parked on dispatch_cond waiting for us to
// consume the reply; wake it so the dispatcher thread can continue.
2031 // kick dispatcher (we've got it!)
11fdf7f2 2032 ceph_assert(request->dispatch_cond);
9f95a23c 2033 request->dispatch_cond->notify_all();
7c673cae
FG
2034 ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
2035 request->dispatch_cond = 0;
2036
// If the caller wants the target inode, follow up a traceless reply with
// a lookup/getattr (see the function's doc comment above).
2037 if (r >= 0 && ptarget)
20effc67 2038 r = verify_reply_trace(r, session.get(), request, reply, ptarget, pcreated, perms);
7c673cae
FG
2039
2040 if (pdirbl)
11fdf7f2 2041 *pdirbl = reply->get_extra_bl();
7c673cae
FG
2042
2043 // -- log times --
2044 utime_t lat = ceph_clock_now();
2045 lat -= request->sent_stamp;
2046 ldout(cct, 20) << "lat " << lat << dendl;
2a845540
TL
2047
2048 ++nr_metadata_request;
2049 update_io_stat_metadata(lat);
7c673cae
FG
2050
2051 put_request(request);
7c673cae
FG
2052 return r;
2053}
2054
2055void Client::unregister_request(MetaRequest *req)
2056{
2057 mds_requests.erase(req->tid);
2058 if (req->tid == oldest_tid) {
2059 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
2060 while (true) {
2061 if (p == mds_requests.end()) {
2062 oldest_tid = 0;
2063 break;
2064 }
2065 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
2066 oldest_tid = p->first;
2067 break;
2068 }
2069 ++p;
2070 }
2071 }
2072 put_request(req);
2073}
2074
2075void Client::put_request(MetaRequest *request)
2076{
2077 if (request->_put()) {
2078 int op = -1;
2079 if (request->success)
2080 op = request->get_op();
2081 InodeRef other_in;
2082 request->take_other_inode(&other_in);
2083 delete request;
2084
2085 if (other_in &&
2086 (op == CEPH_MDS_OP_RMDIR ||
2087 op == CEPH_MDS_OP_RENAME ||
2088 op == CEPH_MDS_OP_RMSNAP)) {
2089 _try_to_trim_inode(other_in.get(), false);
2090 }
2091 }
2092}
2093
// Encode a capability release for `in` toward `mds` into `req`.
// `drop` = caps we are willing to give up, `unless` = caps whose presence
// vetoes the drop, `force` = emit a (no-op) release record even when
// nothing is actually dropped.  Returns 1 when a release was appended to
// req->cap_releases, else 0.
2094int Client::encode_inode_release(Inode *in, MetaRequest *req,
2095 mds_rank_t mds, int drop,
2096 int unless, int force)
2097{
11fdf7f2 2098 ldout(cct, 20) << __func__ << " enter(in:" << *in << ", req:" << req
f67539c2 2099 << " mds:" << mds << ", drop:" << ccap_string(drop) << ", unless:" << ccap_string(unless)
1911f103 2100 << ", force:" << force << ")" << dendl;
7c673cae 2101 int released = 0;
11fdf7f2
TL
2102 auto it = in->caps.find(mds);
2103 if (it != in->caps.end()) {
2104 Cap &cap = it->second;
// Never drop caps that are dirty or currently in use.
7c673cae 2105 drop &= ~(in->dirty_caps | get_caps_used(in));
11fdf7f2
TL
2106 if ((drop & cap.issued) &&
2107 !(unless & cap.issued)) {
1911f103 2108 ldout(cct, 25) << "dropping caps " << ccap_string(drop) << dendl;
11fdf7f2
TL
2109 cap.issued &= ~drop;
2110 cap.implemented &= ~drop;
7c673cae 2111 released = 1;
7c673cae
FG
2112 } else {
2113 released = force;
2114 }
2115 if (released) {
1911f103
TL
2116 cap.wanted = in->caps_wanted();
// Dropping all file-write caps on the auth cap means any previously
// requested max_size is moot; clear it so it can be re-requested.
2117 if (&cap == in->auth_cap &&
2118 !(cap.wanted & CEPH_CAP_ANY_FILE_WR)) {
2119 in->requested_max_size = 0;
2120 ldout(cct, 25) << "reset requested_max_size due to not wanting any file write cap" << dendl;
2121 }
7c673cae
FG
2122 ceph_mds_request_release rel;
2123 rel.ino = in->ino;
11fdf7f2
TL
2124 rel.cap_id = cap.cap_id;
2125 rel.seq = cap.seq;
2126 rel.issue_seq = cap.issue_seq;
2127 rel.mseq = cap.mseq;
2128 rel.caps = cap.implemented;
2129 rel.wanted = cap.wanted;
// dname fields stay zero here; encode_dentry_release() fills them in
// when a dentry lease is being released alongside this record.
7c673cae
FG
2130 rel.dname_len = 0;
2131 rel.dname_seq = 0;
2132 req->cap_releases.push_back(MClientRequest::Release(rel,""));
2133 }
2134 }
11fdf7f2 2135 ldout(cct, 25) << __func__ << " exit(in:" << *in << ") released:"
7c673cae
FG
2136 << released << dendl;
2137 return released;
2138}
2139
// Encode a dentry-lease release for `dn` toward `mds` into `req`.
// Forces an inode release record for the parent directory (force=1) and,
// when this client holds the dentry's lease from that MDS, piggybacks the
// dentry name/seq on that record and invalidates the local lease.
2140void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
2141 mds_rank_t mds, int drop, int unless)
2142{
11fdf7f2 2143 ldout(cct, 20) << __func__ << " enter(dn:"
7c673cae
FG
2144 << dn << ")" << dendl;
2145 int released = 0;
2146 if (dn->dir)
2147 released = encode_inode_release(dn->dir->parent_inode, req,
2148 mds, drop, unless, 1);
2149 if (released && dn->lease_mds == mds) {
2150 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
// The record just appended by encode_inode_release() is the one we
// annotate with the dentry details.
11fdf7f2 2151 auto& rel = req->cap_releases.back();
7c673cae
FG
2152 rel.item.dname_len = dn->name.length();
2153 rel.item.dname_seq = dn->lease_seq;
2154 rel.dname = dn->name;
adb31ebb 2155 dn->lease_mds = -1;
7c673cae 2156 }
11fdf7f2 2157 ldout(cct, 25) << __func__ << " exit(dn:"
7c673cae
FG
2158 << dn << ")" << dendl;
2159}
2160
2161
2162/*
2163 * This requires the MClientRequest *request member to be set.
2164 * It will error out horribly without one.
2165 * Additionally, if you set any *drop member, you'd better have
2166 * set the corresponding dentry!
2167 */
2168void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
2169{
11fdf7f2 2170 ldout(cct, 20) << __func__ << " enter (req: "
7c673cae
FG
2171 << req << ", mds: " << mds << ")" << dendl;
2172 if (req->inode_drop && req->inode())
2173 encode_inode_release(req->inode(), req,
2174 mds, req->inode_drop,
2175 req->inode_unless);
2176
2177 if (req->old_inode_drop && req->old_inode())
2178 encode_inode_release(req->old_inode(), req,
2179 mds, req->old_inode_drop,
2180 req->old_inode_unless);
2181 if (req->other_inode_drop && req->other_inode())
2182 encode_inode_release(req->other_inode(), req,
2183 mds, req->other_inode_drop,
2184 req->other_inode_unless);
2185
2186 if (req->dentry_drop && req->dentry())
2187 encode_dentry_release(req->dentry(), req,
2188 mds, req->dentry_drop,
2189 req->dentry_unless);
2190
2191 if (req->old_dentry_drop && req->old_dentry())
2192 encode_dentry_release(req->old_dentry(), req,
2193 mds, req->old_dentry_drop,
2194 req->old_dentry_unless);
11fdf7f2 2195 ldout(cct, 25) << __func__ << " exit (req: "
7c673cae
FG
2196 << req << ", mds " << mds <<dendl;
2197}
2198
2199bool Client::have_open_session(mds_rank_t mds)
2200{
11fdf7f2
TL
2201 const auto &it = mds_sessions.find(mds);
2202 return it != mds_sessions.end() &&
20effc67
TL
2203 (it->second->state == MetaSession::STATE_OPEN ||
2204 it->second->state == MetaSession::STATE_STALE);
7c673cae
FG
2205}
2206
20effc67 2207MetaSessionRef Client::_get_mds_session(mds_rank_t mds, Connection *con)
7c673cae 2208{
11fdf7f2 2209 const auto &it = mds_sessions.find(mds);
20effc67 2210 if (it == mds_sessions.end() || it->second->con != con) {
7c673cae 2211 return NULL;
11fdf7f2 2212 } else {
20effc67 2213 return it->second;
11fdf7f2 2214 }
7c673cae
FG
2215}
2216
20effc67 2217MetaSessionRef Client::_get_or_open_mds_session(mds_rank_t mds)
7c673cae 2218{
11fdf7f2 2219 auto it = mds_sessions.find(mds);
20effc67 2220 return it == mds_sessions.end() ? _open_mds_session(mds) : it->second;
7c673cae
FG
2221}
2222
2223/**
2224 * Populate a map of strings with client-identifying metadata,
2225 * such as the hostname. Call this once at initialization.
2226 */
// Fill the client-identifying `metadata` map (hostname, pid, entity id,
// mount root, ceph version) plus any user-configured key=value overrides
// from client_metadata.  Called once at initialization; the map is later
// sent to the MDS in the session-open message.
2227void Client::populate_metadata(const std::string &mount_root)
2228{
2229 // Hostname
f67539c2
TL
2230#ifdef _WIN32
2231 // TODO: move this to compat.h
2232 char hostname[64];
2233 DWORD hostname_sz = 64;
2234 GetComputerNameA(hostname, &hostname_sz);
2235 metadata["hostname"] = hostname;
2236#else
7c673cae
FG
2237 struct utsname u;
2238 int r = uname(&u);
2239 if (r >= 0) {
2240 metadata["hostname"] = u.nodename;
2241 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
2242 } else {
// Failure is non-fatal: the "hostname" key is simply omitted.
2243 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
2244 }
f67539c2 2245#endif
7c673cae
FG
2246
2247 metadata["pid"] = stringify(getpid());
2248
2249 // Ceph entity id (the '0' in "client.0")
2250 metadata["entity_id"] = cct->_conf->name.get_id();
2251
2252 // Our mount position
2253 if (!mount_root.empty()) {
2254 metadata["root"] = mount_root;
2255 }
2256
2257 // Ceph version
2258 metadata["ceph_version"] = pretty_version_to_str();
2259 metadata["ceph_sha1"] = git_version_to_str();
2260
2261 // Apply any metadata from the user's configured overrides
2262 std::vector<std::string> tokens;
2263 get_str_vec(cct->_conf->client_metadata, ",", tokens);
2264 for (const auto &i : tokens) {
2265 auto eqpos = i.find("=");
2266 // Throw out anything that isn't of the form "<str>=<str>"
// NOTE(review): `eqpos == i.size()` can never be true (find() returns
// either < size() or npos), so a trailing-'=' pair like "key=" is
// accepted with an empty value — confirm whether that is intended.
2267 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
2268 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
2269 continue;
2270 }
2271 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
2272 }
2273}
2274
2275/**
2276 * Optionally add or override client metadata fields.
2277 */
2278void Client::update_metadata(std::string const &k, std::string const &v)
2279{
f67539c2
TL
2280 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2281 ceph_assert(iref_reader.is_state_satisfied());
2282
2283 std::scoped_lock l(client_lock);
7c673cae 2284
11fdf7f2
TL
2285 auto it = metadata.find(k);
2286 if (it != metadata.end()) {
7c673cae 2287 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
11fdf7f2 2288 << "' from '" << it->second << "' to '" << v << "'" << dendl;
7c673cae
FG
2289 }
2290
2291 metadata[k] = v;
2292}
2293
20effc67 2294MetaSessionRef Client::_open_mds_session(mds_rank_t mds)
7c673cae 2295{
11fdf7f2
TL
2296 ldout(cct, 10) << __func__ << " mds." << mds << dendl;
2297 auto addrs = mdsmap->get_addrs(mds);
2298 auto em = mds_sessions.emplace(std::piecewise_construct,
2299 std::forward_as_tuple(mds),
20effc67 2300 std::forward_as_tuple(new MetaSession(mds, messenger->connect_to_mds(addrs), addrs)));
11fdf7f2 2301 ceph_assert(em.second); /* not already present */
20effc67 2302 auto session = em.first->second;
7c673cae 2303
9f95a23c 2304 auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_OPEN);
11fdf7f2
TL
2305 m->metadata = metadata;
2306 m->supported_features = feature_bitset_t(CEPHFS_FEATURES_CLIENT_SUPPORTED);
f67539c2 2307 m->metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
11fdf7f2 2308 session->con->send_message2(std::move(m));
7c673cae
FG
2309 return session;
2310}
2311
2312void Client::_close_mds_session(MetaSession *s)
2313{
11fdf7f2 2314 ldout(cct, 2) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
7c673cae 2315 s->state = MetaSession::STATE_CLOSING;
9f95a23c 2316 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
7c673cae
FG
2317}
2318
f6b5b4d7 2319void Client::_closed_mds_session(MetaSession *s, int err, bool rejected)
7c673cae 2320{
11fdf7f2 2321 ldout(cct, 5) << __func__ << " mds." << s->mds_num << " seq " << s->seq << dendl;
f6b5b4d7
TL
2322 if (rejected && s->state != MetaSession::STATE_CLOSING)
2323 s->state = MetaSession::STATE_REJECTED;
2324 else
2325 s->state = MetaSession::STATE_CLOSED;
7c673cae
FG
2326 s->con->mark_down();
2327 signal_context_list(s->waiting_for_open);
9f95a23c 2328 mount_cond.notify_all();
f6b5b4d7 2329 remove_session_caps(s, err);
7c673cae 2330 kick_requests_closed(s);
f6b5b4d7
TL
2331 mds_ranks_closing.erase(s->mds_num);
2332 if (s->state == MetaSession::STATE_CLOSED)
2333 mds_sessions.erase(s->mds_num);
7c673cae
FG
2334}
2335
11fdf7f2 2336void Client::handle_client_session(const MConstRef<MClientSession>& m)
7c673cae
FG
2337{
2338 mds_rank_t from = mds_rank_t(m->get_source().num());
11fdf7f2 2339 ldout(cct, 10) << __func__ << " " << *m << " from mds." << from << dendl;
7c673cae 2340
f67539c2 2341 std::scoped_lock cl(client_lock);
20effc67 2342 auto session = _get_mds_session(from, m->get_connection().get());
7c673cae
FG
2343 if (!session) {
2344 ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
7c673cae
FG
2345 return;
2346 }
2347
2348 switch (m->get_op()) {
2349 case CEPH_SESSION_OPEN:
11fdf7f2 2350 {
39ae355f
TL
2351 if (session->state == MetaSession::STATE_OPEN) {
2352 ldout(cct, 10) << "mds." << from << " already opened, ignore it"
2353 << dendl;
2354 return;
2355 }
2356 /*
2357 * The connection maybe broken and the session in client side
2358 * has been reinitialized, need to update the seq anyway.
2359 */
2360 if (!session->seq && m->get_seq())
2361 session->seq = m->get_seq();
2362
11fdf7f2 2363 session->mds_features = std::move(m->supported_features);
33c7a0ef 2364 session->mds_metric_flags = std::move(m->metric_spec.metric_flags);
11fdf7f2 2365
20effc67 2366 renew_caps(session.get());
11fdf7f2 2367 session->state = MetaSession::STATE_OPEN;
f67539c2 2368 if (is_unmounting())
9f95a23c 2369 mount_cond.notify_all();
11fdf7f2
TL
2370 else
2371 connect_mds_targets(from);
2372 signal_context_list(session->waiting_for_open);
2373 break;
2374 }
7c673cae
FG
2375
2376 case CEPH_SESSION_CLOSE:
20effc67 2377 _closed_mds_session(session.get());
7c673cae
FG
2378 break;
2379
2380 case CEPH_SESSION_RENEWCAPS:
2381 if (session->cap_renew_seq == m->get_seq()) {
a8e16298 2382 bool was_stale = ceph_clock_now() >= session->cap_ttl;
7c673cae
FG
2383 session->cap_ttl =
2384 session->last_cap_renew_request + mdsmap->get_session_timeout();
a8e16298 2385 if (was_stale)
20effc67 2386 wake_up_session_caps(session.get(), false);
7c673cae
FG
2387 }
2388 break;
2389
2390 case CEPH_SESSION_STALE:
28e407b8
AA
2391 // invalidate session caps/leases
2392 session->cap_gen++;
2393 session->cap_ttl = ceph_clock_now();
2394 session->cap_ttl -= 1;
20effc67 2395 renew_caps(session.get());
7c673cae
FG
2396 break;
2397
2398 case CEPH_SESSION_RECALL_STATE:
f67539c2
TL
2399 /*
2400 * Call the renew caps and flush cap releases just before
2401 * triming the caps in case the tick() won't get a chance
2402 * to run them, which could cause the client to be blocklisted
2403 * and MDS daemons trying to recall the caps again and
2404 * again.
2405 *
2406 * In most cases it will do nothing, and the new cap releases
2407 * added by trim_caps() followed will be deferred flushing
2408 * by tick().
2409 */
2410 renew_and_flush_cap_releases();
20effc67 2411 trim_caps(session.get(), m->get_max_caps());
7c673cae
FG
2412 break;
2413
2414 case CEPH_SESSION_FLUSHMSG:
a8e16298 2415 /* flush cap release */
11fdf7f2
TL
2416 if (auto& m = session->release; m) {
2417 session->con->send_message2(std::move(m));
a8e16298 2418 }
9f95a23c 2419 session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
7c673cae
FG
2420 break;
2421
2422 case CEPH_SESSION_FORCE_RO:
20effc67 2423 force_session_readonly(session.get());
7c673cae
FG
2424 break;
2425
2426 case CEPH_SESSION_REJECT:
11fdf7f2
TL
2427 {
2428 std::string_view error_str;
2429 auto it = m->metadata.find("error_string");
2430 if (it != m->metadata.end())
2431 error_str = it->second;
2432 else
2433 error_str = "unknown error";
2434 lderr(cct) << "mds." << from << " rejected us (" << error_str << ")" << dendl;
7c673cae 2435
20effc67 2436 _closed_mds_session(session.get(), -CEPHFS_EPERM, true);
11fdf7f2 2437 }
7c673cae
FG
2438 break;
2439
2440 default:
2441 ceph_abort();
2442 }
7c673cae
FG
2443}
2444
2445bool Client::_any_stale_sessions() const
2446{
9f95a23c 2447 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae 2448
11fdf7f2 2449 for (const auto &p : mds_sessions) {
20effc67 2450 if (p.second->state == MetaSession::STATE_STALE) {
7c673cae
FG
2451 return true;
2452 }
2453 }
2454
2455 return false;
2456}
2457
2458void Client::_kick_stale_sessions()
2459{
11fdf7f2 2460 ldout(cct, 1) << __func__ << dendl;
7c673cae 2461
11fdf7f2 2462 for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
20effc67
TL
2463 auto s = it->second;
2464 if (s->state == MetaSession::STATE_REJECTED) {
2465 mds_sessions.erase(it->first);
f6b5b4d7
TL
2466 continue;
2467 }
20effc67
TL
2468 if (s->state == MetaSession::STATE_STALE)
2469 _closed_mds_session(s.get());
7c673cae
FG
2470 }
2471}
2472
// Build the wire MClientRequest for `request` and send it over the given
// session.  `drop_cap_releases` suppresses cap releases while a cap
// reconnect is still pending.  Replayed ops (got_unsafe) carry the target
// ino instead of fresh cap releases.
2473void Client::send_request(MetaRequest *request, MetaSession *session,
2474 bool drop_cap_releases)
2475{
2476 // make the request
2477 mds_rank_t mds = session->mds_num;
11fdf7f2 2478 ldout(cct, 10) << __func__ << " rebuilding request " << request->get_tid()
7c673cae 2479 << " for mds." << mds << dendl;
// build_client_request() returns null when the request was aborted
// (retry-count overflow); nothing to send in that case.
1e59de90
TL
2480 auto r = build_client_request(request, mds);
2481 if (!r)
2482 return;
2483
7c673cae
FG
2484 if (request->dentry()) {
2485 r->set_dentry_wanted();
2486 }
2487 if (request->got_unsafe) {
2488 r->set_replayed_op();
2489 if (request->target)
2490 r->head.ino = request->target->ino;
2491 } else {
2492 encode_cap_releases(request, mds);
2493 if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases
2494 request->cap_releases.clear();
2495 else
2496 r->releases.swap(request->cap_releases);
2497 }
2498 r->set_mdsmap_epoch(mdsmap->get_epoch());
// setxattr may change the file layout, which must be validated against
// a current osdmap on the MDS side.
2499 if (r->head.op == CEPH_MDS_OP_SETXATTR) {
2500 objecter->with_osdmap([r](const OSDMap& o) {
2501 r->set_osdmap_epoch(o.get_epoch());
2502 });
2503 }
2504
// Record sent_stamp only on the first transmission so latency covers
// the full request lifetime across resends.
2505 if (request->mds == -1) {
2506 request->sent_stamp = ceph_clock_now();
11fdf7f2 2507 ldout(cct, 20) << __func__ << " set sent_stamp to " << request->sent_stamp << dendl;
7c673cae
FG
2508 }
2509 request->mds = mds;
2510
2511 Inode *in = request->inode();
11fdf7f2
TL
2512 if (in) {
2513 auto it = in->caps.find(mds);
2514 if (it != in->caps.end()) {
2515 request->sent_on_mseq = it->second.mseq;
2516 }
2517 }
7c673cae
FG
2518
2519 session->requests.push_back(&request->item);
2520
11fdf7f2
TL
2521 ldout(cct, 10) << __func__ << " " << *r << " to mds." << mds << dendl;
2522 session->con->send_message2(std::move(r));
7c673cae
FG
2523}
2524
1e59de90 2525ref_t<MClientRequest> Client::build_client_request(MetaRequest *request, mds_rank_t mds)
7c673cae 2526{
1e59de90
TL
2527 auto session = mds_sessions.at(mds);
2528 bool old_version = !session->mds_features.test(CEPHFS_FEATURE_32BITS_RETRY_FWD);
2529
2530 /*
2531 * Avoid inifinite retrying after overflow.
2532 *
2533 * The client will increase the retry count and if the MDS is
2534 * old version, so we limit to retry at most 256 times.
2535 */
2536 if (request->retry_attempt) {
2537 int old_max_retry = sizeof(((struct ceph_mds_request_head*)0)->num_retry);
2538 old_max_retry = 1 << (old_max_retry * CHAR_BIT);
2539 if ((old_version && request->retry_attempt >= old_max_retry) ||
2540 (uint32_t)request->retry_attempt >= UINT32_MAX) {
2541 request->abort(-CEPHFS_EMULTIHOP);
2542 request->caller_cond->notify_all();
2543 ldout(cct, 1) << __func__ << " request tid " << request->tid
2544 << " retry seq overflow" << ", abort it" << dendl;
2545 return nullptr;
2546 }
2547 }
2548
2549 auto req = make_message<MClientRequest>(request->get_op(), old_version);
7c673cae
FG
2550 req->set_tid(request->tid);
2551 req->set_stamp(request->op_stamp);
2552 memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
2553
2554 // if the filepath's haven't been set, set them!
2555 if (request->path.empty()) {
2556 Inode *in = request->inode();
2557 Dentry *de = request->dentry();
2558 if (in)
2559 in->make_nosnap_relative_path(request->path);
2560 else if (de) {
2561 if (de->inode)
2562 de->inode->make_nosnap_relative_path(request->path);
2563 else if (de->dir) {
2564 de->dir->parent_inode->make_nosnap_relative_path(request->path);
2565 request->path.push_dentry(de->name);
2566 }
2567 else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2568 << " No path, inode, or appropriately-endowed dentry given!"
2569 << dendl;
2570 } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2571 << " No path, inode, or dentry given!"
2572 << dendl;
2573 }
2574 req->set_filepath(request->get_filepath());
2575 req->set_filepath2(request->get_filepath2());
f67539c2 2576 req->set_alternate_name(request->alternate_name);
7c673cae 2577 req->set_data(request->data);
1e59de90
TL
2578 req->fscrypt_auth = request->fscrypt_auth;
2579 req->fscrypt_file = request->fscrypt_file;
7c673cae 2580 req->set_retry_attempt(request->retry_attempt++);
1e59de90 2581 req->head.ext_num_fwd = request->num_fwd;
7c673cae
FG
2582 const gid_t *_gids;
2583 int gid_count = request->perms.get_gids(&_gids);
2584 req->set_gid_list(gid_count, _gids);
2585 return req;
2586}
2587
2588
2589
11fdf7f2 2590void Client::handle_client_request_forward(const MConstRef<MClientRequestForward>& fwd)
7c673cae
FG
2591{
2592 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
f67539c2
TL
2593
2594 std::scoped_lock cl(client_lock);
20effc67 2595 auto session = _get_mds_session(mds, fwd->get_connection().get());
7c673cae 2596 if (!session) {
7c673cae
FG
2597 return;
2598 }
2599 ceph_tid_t tid = fwd->get_tid();
2600
2601 if (mds_requests.count(tid) == 0) {
11fdf7f2 2602 ldout(cct, 10) << __func__ << " no pending request on tid " << tid << dendl;
7c673cae
FG
2603 return;
2604 }
2605
2606 MetaRequest *request = mds_requests[tid];
11fdf7f2 2607 ceph_assert(request);
7c673cae 2608
33c7a0ef 2609 /*
1e59de90 2610 * Avoid inifinite retrying after overflow.
33c7a0ef 2611 *
1e59de90
TL
2612 * The MDS will increase the fwd count and in client side
2613 * if the num_fwd is less than the one saved in request
2614 * that means the MDS is an old version and overflowed of
2615 * 8 bits.
33c7a0ef 2616 */
33c7a0ef 2617 auto num_fwd = fwd->get_num_fwd();
1e59de90
TL
2618 if (num_fwd <= request->num_fwd || (uint32_t)num_fwd >= UINT32_MAX) {
2619 request->abort(-CEPHFS_EMULTIHOP);
2620 request->caller_cond->notify_all();
2621 ldout(cct, 0) << __func__ << " request tid " << tid << " new num_fwd "
2622 << num_fwd << " old num_fwd " << request->num_fwd << ", fwd seq overflow"
2623 << ", abort it" << dendl;
33c7a0ef
TL
2624 return;
2625 }
2626
7c673cae
FG
2627 // reset retry counter
2628 request->retry_attempt = 0;
2629
2630 // request not forwarded, or dest mds has no session.
2631 // resend.
11fdf7f2 2632 ldout(cct, 10) << __func__ << " tid " << tid
7c673cae
FG
2633 << " fwd " << fwd->get_num_fwd()
2634 << " to mds." << fwd->get_dest_mds()
2635 << ", resending to " << fwd->get_dest_mds()
2636 << dendl;
2637
2638 request->mds = -1;
2639 request->item.remove_myself();
33c7a0ef 2640 request->num_fwd = num_fwd;
7c673cae 2641 request->resend_mds = fwd->get_dest_mds();
9f95a23c 2642 request->caller_cond->notify_all();
7c673cae
FG
2643}
2644
2645bool Client::is_dir_operation(MetaRequest *req)
2646{
2647 int op = req->get_op();
2648 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2649 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2650 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2651 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2652 return true;
2653 return false;
2654}
2655
11fdf7f2 2656void Client::handle_client_reply(const MConstRef<MClientReply>& reply)
7c673cae
FG
2657{
2658 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
f67539c2
TL
2659
2660 std::scoped_lock cl(client_lock);
20effc67 2661 auto session = _get_mds_session(mds_num, reply->get_connection().get());
7c673cae 2662 if (!session) {
7c673cae
FG
2663 return;
2664 }
2665
2666 ceph_tid_t tid = reply->get_tid();
2667 bool is_safe = reply->is_safe();
2668
2669 if (mds_requests.count(tid) == 0) {
11fdf7f2 2670 lderr(cct) << __func__ << " no pending request on tid " << tid
7c673cae 2671 << " safe is:" << is_safe << dendl;
7c673cae
FG
2672 return;
2673 }
2674 MetaRequest *request = mds_requests.at(tid);
2675
11fdf7f2 2676 ldout(cct, 20) << __func__ << " got a reply. Safe:" << is_safe
7c673cae
FG
2677 << " tid " << tid << dendl;
2678
1e59de90
TL
2679 // correct sessions ?
2680 if (request->mds != mds_num) {
2681 ldout(cct, 0) << "got a stale reply from mds." << mds_num
2682 << " instead of mds." << request->mds << dendl;
2683 return;
2684 }
2685
7c673cae
FG
2686 if (request->got_unsafe && !is_safe) {
2687 //duplicate response
2688 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2689 << mds_num << " safe:" << is_safe << dendl;
7c673cae
FG
2690 return;
2691 }
2692
11fdf7f2 2693 ceph_assert(!request->reply);
7c673cae 2694 request->reply = reply;
20effc67 2695 insert_trace(request, session.get());
7c673cae
FG
2696
2697 // Handle unsafe reply
2698 if (!is_safe) {
2699 request->got_unsafe = true;
2700 session->unsafe_requests.push_back(&request->unsafe_item);
2701 if (is_dir_operation(request)) {
2702 Inode *dir = request->inode();
11fdf7f2 2703 ceph_assert(dir);
7c673cae
FG
2704 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2705 }
2706 if (request->target) {
2707 InodeRef &in = request->target;
2708 in->unsafe_ops.push_back(&request->unsafe_target_item);
2709 }
2710 }
2711
2712 // Only signal the caller once (on the first reply):
2713 // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent.
2714 if (!is_safe || !request->got_unsafe) {
9f95a23c 2715 ceph::condition_variable cond;
7c673cae
FG
2716 request->dispatch_cond = &cond;
2717
2718 // wake up waiter
11fdf7f2 2719 ldout(cct, 20) << __func__ << " signalling caller " << (void*)request->caller_cond << dendl;
9f95a23c 2720 request->caller_cond->notify_all();
7c673cae
FG
2721
2722 // wake for kick back
9f95a23c
TL
2723 std::unique_lock l{client_lock, std::adopt_lock};
2724 cond.wait(l, [tid, request, &cond, this] {
2725 if (request->dispatch_cond) {
2726 ldout(cct, 20) << "handle_client_reply awaiting kickback on tid "
2727 << tid << " " << &cond << dendl;
2728 }
2729 return !request->dispatch_cond;
2730 });
2731 l.release();
7c673cae
FG
2732 }
2733
2734 if (is_safe) {
2735 // the filesystem change is committed to disk
2736 // we're done, clean up
2737 if (request->got_unsafe) {
2738 request->unsafe_item.remove_myself();
2739 request->unsafe_dir_item.remove_myself();
2740 request->unsafe_target_item.remove_myself();
2741 signal_cond_list(request->waitfor_safe);
2742 }
2743 request->item.remove_myself();
2744 unregister_request(request);
2745 }
f67539c2 2746 if (is_unmounting())
9f95a23c 2747 mount_cond.notify_all();
7c673cae
FG
2748}
2749
// React to an OSD pool (or, with pool == -1, the whole cluster) going
// FULL: cancel outstanding writes with -CEPHFS_ENOSPC, purge the affected
// dirty page-cache data, and raise the cap epoch barrier so caps are not
// released to the MDS until the cancellation epoch is seen.
2750void Client::_handle_full_flag(int64_t pool)
2751{
2752 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2753 << "on " << pool << dendl;
f67539c2 2754 // Cancel all outstanding ops in this pool with -CEPHFS_ENOSPC: it is necessary
7c673cae
FG
2755 // to do this rather than blocking, because otherwise when we fill up we
2756 // potentially lock caps forever on files with dirty pages, and we need
2757 // to be able to release those caps to the MDS so that it can delete files
2758 // and free up space.
f67539c2 2759 epoch_t cancelled_epoch = objecter->op_cancel_writes(-CEPHFS_ENOSPC, pool);
7c673cae
FG
2760
2761 // For all inodes with layouts in this pool and a pending flush write op
2762 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2763 // from ObjectCacher so that it doesn't re-issue the write in response to
2764 // the ENOSPC error.
2765 // Fortunately since we're cancelling everything in a given pool, we don't
2766 // need to know which ops belong to which ObjectSet, we can just blow all
2767 // the un-flushed cached data away and mark any dirty inodes' async_err
f67539c2 2768 // field with -CEPHFS_ENOSPC as long as we're sure all the ops we cancelled were
7c673cae
FG
2769 // affecting this pool, and all the objectsets we're purging were also
2770 // in this pool.
2771 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2772 i != inode_map.end(); ++i)
2773 {
2774 Inode *inode = i->second;
2775 if (inode->oset.dirty_or_tx
2776 && (pool == -1 || inode->layout.pool_id == pool)) {
2777 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2778 << " has dirty objects, purging and setting ENOSPC" << dendl;
2779 objectcacher->purge_set(&inode->oset);
f67539c2 2780 inode->set_async_err(-CEPHFS_ENOSPC);
7c673cae
FG
2781 }
2782 }
2783
// op_cancel_writes() returns (epoch_t)-1 when nothing was cancelled; only
// raise the barrier when real cancellations happened.
2784 if (cancelled_epoch != (epoch_t)-1) {
2785 set_cap_epoch_barrier(cancelled_epoch);
2786 }
2787}
2788
11fdf7f2 2789void Client::handle_osd_map(const MConstRef<MOSDMap>& m)
7c673cae 2790{
f67539c2 2791 std::scoped_lock cl(client_lock);
31f18b77 2792
11fdf7f2 2793 const auto myaddrs = messenger->get_myaddrs();
33c7a0ef 2794 bool new_blocklist = objecter->with_osdmap(
11fdf7f2 2795 [&](const OSDMap& o) {
33c7a0ef 2796 return o.is_blocklisted(myaddrs);
11fdf7f2 2797 });
33c7a0ef
TL
2798
2799 if (new_blocklist && !blocklisted) {
31f18b77
FG
2800 auto epoch = objecter->with_osdmap([](const OSDMap &o){
2801 return o.get_epoch();
2802 });
f67539c2
TL
2803 lderr(cct) << "I was blocklisted at osd epoch " << epoch << dendl;
2804 blocklisted = true;
31f18b77 2805
f67539c2 2806 _abort_mds_sessions(-CEPHFS_EBLOCKLISTED);
31f18b77
FG
2807
2808 // Since we know all our OSD ops will fail, cancel them all preemtively,
2809 // so that on an unhealthy cluster we can umount promptly even if e.g.
2810 // some PGs were inaccessible.
f67539c2
TL
2811 objecter->op_cancel_writes(-CEPHFS_EBLOCKLISTED);
2812
2813 }
31f18b77 2814
f67539c2
TL
2815 if (blocklisted) {
2816 // Handle case where we were blocklisted but no longer are
2817 blocklisted = objecter->with_osdmap([myaddrs](const OSDMap &o){
2818 return o.is_blocklisted(myaddrs);});
31f18b77
FG
2819 }
2820
f67539c2
TL
2821 // Always subscribe to next osdmap for blocklisted client
2822 // until this client is not blocklisted.
2823 if (blocklisted) {
f64942e4
AA
2824 objecter->maybe_request_map();
2825 }
2826
7c673cae
FG
2827 if (objecter->osdmap_full_flag()) {
2828 _handle_full_flag(-1);
2829 } else {
2830 // Accumulate local list of full pools so that I can drop
2831 // the objecter lock before re-entering objecter in
2832 // cancel_writes
2833 std::vector<int64_t> full_pools;
2834
2835 objecter->with_osdmap([&full_pools](const OSDMap &o) {
2836 for (const auto& kv : o.get_pools()) {
2837 if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2838 full_pools.push_back(kv.first);
2839 }
2840 }
2841 });
2842
2843 for (auto p : full_pools)
2844 _handle_full_flag(p);
2845
2846 // Subscribe to subsequent maps to watch for the full flag going
2847 // away. For the global full flag objecter does this for us, but
2848 // it pays no attention to the per-pool full flag so in this branch
2849 // we do it ourselves.
2850 if (!full_pools.empty()) {
2851 objecter->maybe_request_map();
2852 }
2853 }
7c673cae
FG
2854}
2855
2856
2857// ------------------------
2858// incoming messages
2859
2860
11fdf7f2 2861bool Client::ms_dispatch2(const MessageRef &m)
7c673cae 2862{
f67539c2
TL
2863 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
2864 if (!iref_reader.is_state_satisfied()) {
7c673cae 2865 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
7c673cae
FG
2866 return true;
2867 }
2868
2869 switch (m->get_type()) {
2870 // mounting and mds sessions
2871 case CEPH_MSG_MDS_MAP:
9f95a23c 2872 handle_mds_map(ref_cast<MMDSMap>(m));
7c673cae
FG
2873 break;
2874 case CEPH_MSG_FS_MAP:
9f95a23c 2875 handle_fs_map(ref_cast<MFSMap>(m));
7c673cae
FG
2876 break;
2877 case CEPH_MSG_FS_MAP_USER:
9f95a23c 2878 handle_fs_map_user(ref_cast<MFSMapUser>(m));
7c673cae
FG
2879 break;
2880 case CEPH_MSG_CLIENT_SESSION:
9f95a23c 2881 handle_client_session(ref_cast<MClientSession>(m));
7c673cae
FG
2882 break;
2883
2884 case CEPH_MSG_OSD_MAP:
9f95a23c 2885 handle_osd_map(ref_cast<MOSDMap>(m));
7c673cae
FG
2886 break;
2887
2888 // requests
2889 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
9f95a23c 2890 handle_client_request_forward(ref_cast<MClientRequestForward>(m));
7c673cae
FG
2891 break;
2892 case CEPH_MSG_CLIENT_REPLY:
9f95a23c 2893 handle_client_reply(ref_cast<MClientReply>(m));
11fdf7f2
TL
2894 break;
2895
2896 // reclaim reply
2897 case CEPH_MSG_CLIENT_RECLAIM_REPLY:
9f95a23c 2898 handle_client_reclaim_reply(ref_cast<MClientReclaimReply>(m));
7c673cae
FG
2899 break;
2900
2901 case CEPH_MSG_CLIENT_SNAP:
9f95a23c 2902 handle_snap(ref_cast<MClientSnap>(m));
7c673cae
FG
2903 break;
2904 case CEPH_MSG_CLIENT_CAPS:
9f95a23c 2905 handle_caps(ref_cast<MClientCaps>(m));
7c673cae
FG
2906 break;
2907 case CEPH_MSG_CLIENT_LEASE:
9f95a23c 2908 handle_lease(ref_cast<MClientLease>(m));
7c673cae
FG
2909 break;
2910 case MSG_COMMAND_REPLY:
2911 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
9f95a23c 2912 handle_command_reply(ref_cast<MCommandReply>(m));
7c673cae
FG
2913 } else {
2914 return false;
2915 }
2916 break;
2917 case CEPH_MSG_CLIENT_QUOTA:
9f95a23c 2918 handle_quota(ref_cast<MClientQuota>(m));
7c673cae
FG
2919 break;
2920
2921 default:
2922 return false;
2923 }
2924
2925 // unmounting?
f67539c2
TL
2926 std::scoped_lock cl(client_lock);
2927 if (is_unmounting()) {
7c673cae
FG
2928 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2929 << "+" << inode_map.size() << dendl;
f67539c2 2930 uint64_t size = lru.lru_get_size() + inode_map.size();
7c673cae 2931 trim_cache();
f67539c2 2932 if (size > lru.lru_get_size() + inode_map.size()) {
7c673cae 2933 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
9f95a23c 2934 mount_cond.notify_all();
7c673cae
FG
2935 } else {
2936 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2937 << "+" << inode_map.size() << dendl;
2938 }
2939 }
2940
2941 return true;
2942}
2943
11fdf7f2 2944void Client::handle_fs_map(const MConstRef<MFSMap>& m)
7c673cae 2945{
f67539c2 2946 std::scoped_lock cl(client_lock);
7c673cae 2947 fsmap.reset(new FSMap(m->get_fsmap()));
7c673cae
FG
2948
2949 signal_cond_list(waiting_for_fsmap);
2950
2951 monclient->sub_got("fsmap", fsmap->get_epoch());
2952}
2953
11fdf7f2 2954void Client::handle_fs_map_user(const MConstRef<MFSMapUser>& m)
7c673cae 2955{
f67539c2 2956 std::scoped_lock cl(client_lock);
7c673cae
FG
2957 fsmap_user.reset(new FSMapUser);
2958 *fsmap_user = m->get_fsmap();
7c673cae
FG
2959
2960 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2961 signal_cond_list(waiting_for_fsmap);
2962}
2963
f67539c2
TL
2964// Cancel all the commands for missing or laggy GIDs
2965void Client::cancel_commands(const MDSMap& newmap)
7c673cae 2966{
f67539c2 2967 std::vector<ceph_tid_t> cancel_ops;
7c673cae 2968
f67539c2 2969 std::scoped_lock cmd_lock(command_lock);
7c673cae 2970 auto &commands = command_table.get_commands();
f67539c2 2971 for (const auto &[tid, op] : commands) {
7c673cae 2972 const mds_gid_t op_mds_gid = op.mds_gid;
f67539c2
TL
2973 if (newmap.is_dne_gid(op_mds_gid) || newmap.is_laggy_gid(op_mds_gid)) {
2974 ldout(cct, 1) << __func__ << ": cancelling command op " << tid << dendl;
2975 cancel_ops.push_back(tid);
7c673cae
FG
2976 if (op.outs) {
2977 std::ostringstream ss;
2978 ss << "MDS " << op_mds_gid << " went away";
2979 *(op.outs) = ss.str();
2980 }
f67539c2
TL
2981 /*
2982 * No need to make the con->mark_down under
2983 * client_lock here, because the con will
2984 * has its own lock.
2985 */
7c673cae 2986 op.con->mark_down();
f67539c2
TL
2987 if (op.on_finish)
2988 op.on_finish->complete(-CEPHFS_ETIMEDOUT);
7c673cae
FG
2989 }
2990 }
2991
f67539c2
TL
2992 for (const auto &tid : cancel_ops)
2993 command_table.erase(tid);
2994}
2995
2996void Client::handle_mds_map(const MConstRef<MMDSMap>& m)
2997{
2998 std::unique_lock cl(client_lock);
2999 if (m->get_epoch() <= mdsmap->get_epoch()) {
3000 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch()
3001 << " is identical to or older than our "
3002 << mdsmap->get_epoch() << dendl;
3003 return;
7c673cae
FG
3004 }
3005
f67539c2
TL
3006 cl.unlock();
3007 ldout(cct, 1) << __func__ << " epoch " << m->get_epoch() << dendl;
3008 std::unique_ptr<MDSMap> _mdsmap(new MDSMap);
3009 _mdsmap->decode(m->get_encoded());
3010 cancel_commands(*_mdsmap.get());
3011 cl.lock();
3012
3013 _mdsmap.swap(mdsmap);
3014
7c673cae 3015 // reset session
11fdf7f2 3016 for (auto p = mds_sessions.begin(); p != mds_sessions.end(); ) {
7c673cae 3017 mds_rank_t mds = p->first;
20effc67 3018 MetaSessionRef session = p->second;
7c673cae
FG
3019 ++p;
3020
f67539c2 3021 int oldstate = _mdsmap->get_state(mds);
7c673cae
FG
3022 int newstate = mdsmap->get_state(mds);
3023 if (!mdsmap->is_up(mds)) {
3024 session->con->mark_down();
11fdf7f2 3025 } else if (mdsmap->get_addrs(mds) != session->addrs) {
f67539c2
TL
3026 auto old_inc = _mdsmap->get_incarnation(mds);
3027 auto new_inc = mdsmap->get_incarnation(mds);
f64942e4
AA
3028 if (old_inc != new_inc) {
3029 ldout(cct, 1) << "mds incarnation changed from "
3030 << old_inc << " to " << new_inc << dendl;
3031 oldstate = MDSMap::STATE_NULL;
3032 }
7c673cae 3033 session->con->mark_down();
11fdf7f2 3034 session->addrs = mdsmap->get_addrs(mds);
7c673cae
FG
3035 // When new MDS starts to take over, notify kernel to trim unused entries
3036 // in its dcache/icache. Hopefully, the kernel will release some unused
3037 // inodes before the new MDS enters reconnect state.
20effc67 3038 trim_cache_for_reconnect(session.get());
7c673cae
FG
3039 } else if (oldstate == newstate)
3040 continue; // no change
f67539c2 3041
7c673cae
FG
3042 session->mds_state = newstate;
3043 if (newstate == MDSMap::STATE_RECONNECT) {
11fdf7f2 3044 session->con = messenger->connect_to_mds(session->addrs);
20effc67 3045 send_reconnect(session.get());
81eedcae
TL
3046 } else if (newstate > MDSMap::STATE_RECONNECT) {
3047 if (oldstate < MDSMap::STATE_RECONNECT) {
3048 ldout(cct, 1) << "we may miss the MDSMap::RECONNECT, close mds session ... " << dendl;
20effc67 3049 _closed_mds_session(session.get());
81eedcae
TL
3050 continue;
3051 }
3052 if (newstate >= MDSMap::STATE_ACTIVE) {
3053 if (oldstate < MDSMap::STATE_ACTIVE) {
3054 // kick new requests
20effc67
TL
3055 kick_requests(session.get());
3056 kick_flushing_caps(session.get());
81eedcae 3057 signal_context_list(session->waiting_for_open);
20effc67 3058 wake_up_session_caps(session.get(), true);
81eedcae
TL
3059 }
3060 connect_mds_targets(mds);
7c673cae 3061 }
7c673cae
FG
3062 } else if (newstate == MDSMap::STATE_NULL &&
3063 mds >= mdsmap->get_max_mds()) {
20effc67 3064 _closed_mds_session(session.get());
7c673cae
FG
3065 }
3066 }
3067
3068 // kick any waiting threads
3069 signal_cond_list(waiting_for_mdsmap);
3070
7c673cae
FG
3071 monclient->sub_got("mdsmap", mdsmap->get_epoch());
3072}
3073
3074void Client::send_reconnect(MetaSession *session)
3075{
3076 mds_rank_t mds = session->mds_num;
11fdf7f2 3077 ldout(cct, 10) << __func__ << " to mds." << mds << dendl;
7c673cae
FG
3078
3079 // trim unused caps to reduce MDS's cache rejoin time
3080 trim_cache_for_reconnect(session);
3081
3082 session->readonly = false;
3083
11fdf7f2 3084 session->release.reset();
7c673cae
FG
3085
3086 // reset my cap seq number
3087 session->seq = 0;
3088 //connect to the mds' offload targets
3089 connect_mds_targets(mds);
3090 //make sure unsafe requests get saved
3091 resend_unsafe_requests(session);
3092
11fdf7f2
TL
3093 early_kick_flushing_caps(session);
3094
9f95a23c 3095 auto m = make_message<MClientReconnect>();
11fdf7f2 3096 bool allow_multi = session->mds_features.test(CEPHFS_FEATURE_MULTI_RECONNECT);
7c673cae
FG
3097
3098 // i have an open session.
3099 ceph::unordered_set<inodeno_t> did_snaprealm;
3100 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
3101 p != inode_map.end();
3102 ++p) {
3103 Inode *in = p->second;
11fdf7f2
TL
3104 auto it = in->caps.find(mds);
3105 if (it != in->caps.end()) {
3106 if (allow_multi &&
9f95a23c
TL
3107 m->get_approx_size() >=
3108 static_cast<size_t>((std::numeric_limits<int>::max() >> 1))) {
11fdf7f2
TL
3109 m->mark_more();
3110 session->con->send_message2(std::move(m));
3111
9f95a23c 3112 m = make_message<MClientReconnect>();
11fdf7f2
TL
3113 }
3114
3115 Cap &cap = it->second;
7c673cae 3116 ldout(cct, 10) << " caps on " << p->first
11fdf7f2 3117 << " " << ccap_string(cap.issued)
7c673cae
FG
3118 << " wants " << ccap_string(in->caps_wanted())
3119 << dendl;
3120 filepath path;
f91f0fd5 3121 in->make_short_path(path);
7c673cae
FG
3122 ldout(cct, 10) << " path " << path << dendl;
3123
3124 bufferlist flockbl;
3125 _encode_filelocks(in, flockbl);
3126
11fdf7f2
TL
3127 cap.seq = 0; // reset seq.
3128 cap.issue_seq = 0; // reset seq.
3129 cap.mseq = 0; // reset seq.
3130 // cap gen should catch up with session cap_gen
3131 if (cap.gen < session->cap_gen) {
3132 cap.gen = session->cap_gen;
3133 cap.issued = cap.implemented = CEPH_CAP_PIN;
3134 } else {
3135 cap.issued = cap.implemented;
3136 }
7c673cae
FG
3137 snapid_t snap_follows = 0;
3138 if (!in->cap_snaps.empty())
3139 snap_follows = in->cap_snaps.begin()->first;
3140
3141 m->add_cap(p->first.ino,
11fdf7f2 3142 cap.cap_id,
7c673cae
FG
3143 path.get_ino(), path.get_path(), // ino
3144 in->caps_wanted(), // wanted
11fdf7f2 3145 cap.issued, // issued
7c673cae
FG
3146 in->snaprealm->ino,
3147 snap_follows,
3148 flockbl);
3149
3150 if (did_snaprealm.count(in->snaprealm->ino) == 0) {
3151 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
3152 m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
3153 did_snaprealm.insert(in->snaprealm->ino);
3154 }
3155 }
3156 }
3157
11fdf7f2
TL
3158 if (!allow_multi)
3159 m->set_encoding_version(0); // use connection features to choose encoding
3160 session->con->send_message2(std::move(m));
7c673cae 3161
9f95a23c 3162 mount_cond.notify_all();
11fdf7f2
TL
3163
3164 if (session->reclaim_state == MetaSession::RECLAIMING)
3165 signal_cond_list(waiting_for_reclaim);
7c673cae
FG
3166}
3167
3168
3169void Client::kick_requests(MetaSession *session)
3170{
11fdf7f2 3171 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
7c673cae
FG
3172 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3173 p != mds_requests.end();
3174 ++p) {
31f18b77
FG
3175 MetaRequest *req = p->second;
3176 if (req->got_unsafe)
3177 continue;
3178 if (req->aborted()) {
3179 if (req->caller_cond) {
3180 req->kick = true;
9f95a23c 3181 req->caller_cond->notify_all();
31f18b77 3182 }
7c673cae 3183 continue;
31f18b77
FG
3184 }
3185 if (req->retry_attempt > 0)
7c673cae 3186 continue; // new requests only
31f18b77 3187 if (req->mds == session->mds_num) {
7c673cae
FG
3188 send_request(p->second, session);
3189 }
3190 }
3191}
3192
3193void Client::resend_unsafe_requests(MetaSession *session)
3194{
3195 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
3196 !iter.end();
3197 ++iter)
3198 send_request(*iter, session);
3199
3200 // also re-send old requests when MDS enters reconnect stage. So that MDS can
3201 // process completed requests in clientreplay stage.
3202 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3203 p != mds_requests.end();
3204 ++p) {
3205 MetaRequest *req = p->second;
3206 if (req->got_unsafe)
3207 continue;
31f18b77
FG
3208 if (req->aborted())
3209 continue;
7c673cae
FG
3210 if (req->retry_attempt == 0)
3211 continue; // old requests only
3212 if (req->mds == session->mds_num)
3213 send_request(req, session, true);
3214 }
3215}
3216
3217void Client::wait_unsafe_requests()
3218{
3219 list<MetaRequest*> last_unsafe_reqs;
11fdf7f2 3220 for (const auto &p : mds_sessions) {
20effc67
TL
3221 const auto s = p.second;
3222 if (!s->unsafe_requests.empty()) {
3223 MetaRequest *req = s->unsafe_requests.back();
7c673cae
FG
3224 req->get();
3225 last_unsafe_reqs.push_back(req);
3226 }
3227 }
3228
3229 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
3230 p != last_unsafe_reqs.end();
3231 ++p) {
3232 MetaRequest *req = *p;
3233 if (req->unsafe_item.is_on_list())
3234 wait_on_list(req->waitfor_safe);
3235 put_request(req);
3236 }
3237}
3238
3239void Client::kick_requests_closed(MetaSession *session)
3240{
11fdf7f2 3241 ldout(cct, 10) << __func__ << " for mds." << session->mds_num << dendl;
7c673cae
FG
3242 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
3243 p != mds_requests.end(); ) {
3244 MetaRequest *req = p->second;
3245 ++p;
3246 if (req->mds == session->mds_num) {
3247 if (req->caller_cond) {
3248 req->kick = true;
9f95a23c 3249 req->caller_cond->notify_all();
7c673cae
FG
3250 }
3251 req->item.remove_myself();
3252 if (req->got_unsafe) {
11fdf7f2 3253 lderr(cct) << __func__ << " removing unsafe request " << req->get_tid() << dendl;
7c673cae 3254 req->unsafe_item.remove_myself();
eafe8130
TL
3255 if (is_dir_operation(req)) {
3256 Inode *dir = req->inode();
20effc67 3257 ceph_assert(dir);
f67539c2 3258 dir->set_async_err(-CEPHFS_EIO);
eafe8130
TL
3259 lderr(cct) << "kick_requests_closed drop req of inode(dir) : "
3260 << dir->ino << " " << req->get_tid() << dendl;
3261 req->unsafe_dir_item.remove_myself();
3262 }
3263 if (req->target) {
3264 InodeRef &in = req->target;
f67539c2 3265 in->set_async_err(-CEPHFS_EIO);
eafe8130
TL
3266 lderr(cct) << "kick_requests_closed drop req of inode : "
3267 << in->ino << " " << req->get_tid() << dendl;
3268 req->unsafe_target_item.remove_myself();
3269 }
7c673cae
FG
3270 signal_cond_list(req->waitfor_safe);
3271 unregister_request(req);
3272 }
3273 }
3274 }
11fdf7f2
TL
3275 ceph_assert(session->requests.empty());
3276 ceph_assert(session->unsafe_requests.empty());
7c673cae
FG
3277}
3278
3279
3280
3281
3282/************
3283 * leases
3284 */
3285
3286void Client::got_mds_push(MetaSession *s)
3287{
3288 s->seq++;
3289 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
3290 if (s->state == MetaSession::STATE_CLOSING) {
9f95a23c 3291 s->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_CLOSE, s->seq));
7c673cae
FG
3292 }
3293}
3294
11fdf7f2 3295void Client::handle_lease(const MConstRef<MClientLease>& m)
7c673cae 3296{
11fdf7f2 3297 ldout(cct, 10) << __func__ << " " << *m << dendl;
7c673cae 3298
11fdf7f2 3299 ceph_assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
7c673cae 3300 mds_rank_t mds = mds_rank_t(m->get_source().num());
f67539c2
TL
3301
3302 std::scoped_lock cl(client_lock);
20effc67 3303 auto session = _get_mds_session(mds, m->get_connection().get());
7c673cae 3304 if (!session) {
7c673cae
FG
3305 return;
3306 }
3307
20effc67 3308 got_mds_push(session.get());
7c673cae
FG
3309
3310 ceph_seq_t seq = m->get_seq();
3311
3312 Inode *in;
3313 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
3314 if (inode_map.count(vino) == 0) {
3315 ldout(cct, 10) << " don't have vino " << vino << dendl;
3316 goto revoke;
3317 }
3318 in = inode_map[vino];
3319
9f95a23c 3320 if (m->get_mask() & CEPH_LEASE_VALID) {
7c673cae
FG
3321 if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
3322 ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
3323 goto revoke;
3324 }
3325 Dentry *dn = in->dir->dentries[m->dname];
3326 ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
3327 dn->lease_mds = -1;
3328 }
3329
3330 revoke:
11fdf7f2 3331 {
9f95a23c
TL
3332 auto reply = make_message<MClientLease>(CEPH_MDS_LEASE_RELEASE, seq,
3333 m->get_mask(), m->get_ino(),
3334 m->get_first(), m->get_last(), m->dname);
11fdf7f2
TL
3335 m->get_connection()->send_message2(std::move(reply));
3336 }
7c673cae
FG
3337}
3338
f67539c2 3339void Client::_put_inode(Inode *in, int n)
7c673cae 3340{
f67539c2
TL
3341 ldout(cct, 10) << __func__ << " on " << *in << " n = " << n << dendl;
3342
b3b6e05e
TL
3343 int left = in->get_nref();
3344 ceph_assert(left >= n + 1);
3345 in->iput(n);
3346 left -= n;
3347 if (left == 1) { // the last one will be held by the inode_map
7c673cae
FG
3348 // release any caps
3349 remove_all_caps(in);
3350
11fdf7f2 3351 ldout(cct, 10) << __func__ << " deleting " << *in << dendl;
7c673cae 3352 bool unclean = objectcacher->release_set(&in->oset);
11fdf7f2 3353 ceph_assert(!unclean);
7c673cae
FG
3354 inode_map.erase(in->vino());
3355 if (use_faked_inos())
3356 _release_faked_ino(in);
3357
b3b6e05e 3358 if (root == nullptr) {
7c673cae
FG
3359 root_ancestor = 0;
3360 while (!root_parents.empty())
3361 root_parents.erase(root_parents.begin());
3362 }
3363
b3b6e05e 3364 in->iput();
7c673cae
FG
3365 }
3366}
3367
f67539c2
TL
3368void Client::delay_put_inodes(bool wakeup)
3369{
3370 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
3371
3372 std::map<Inode*,int> release;
3373 {
3374 std::scoped_lock dl(delay_i_lock);
3375 release.swap(delay_i_release);
3376 }
3377
3378 if (release.empty())
3379 return;
3380
3381 for (auto &[in, cnt] : release)
3382 _put_inode(in, cnt);
3383
3384 if (wakeup)
3385 mount_cond.notify_all();
3386}
3387
3388void Client::put_inode(Inode *in, int n)
3389{
3390 ldout(cct, 20) << __func__ << " on " << *in << " n = " << n << dendl;
3391
3392 std::scoped_lock dl(delay_i_lock);
3393 delay_i_release[in] += n;
3394}
3395
7c673cae
FG
3396void Client::close_dir(Dir *dir)
3397{
3398 Inode *in = dir->parent_inode;
11fdf7f2
TL
3399 ldout(cct, 15) << __func__ << " dir " << dir << " on " << in << dendl;
3400 ceph_assert(dir->is_empty());
3401 ceph_assert(in->dir == dir);
3402 ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked
3403 if (!in->dentries.empty())
7c673cae
FG
3404 in->get_first_parent()->put(); // unpin dentry
3405
3406 delete in->dir;
3407 in->dir = 0;
3408 put_inode(in); // unpin inode
3409}
3410
3411 /**
3412 * Don't call this with in==NULL, use get_or_create for that
3413 * leave dn set to default NULL unless you're trying to add
3414 * a new inode to a pre-created Dentry
3415 */
3416Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
3417{
3418 if (!dn) {
3419 // create a new Dentry
11fdf7f2
TL
3420 dn = new Dentry(dir, name);
3421
7c673cae
FG
3422 lru.lru_insert_mid(dn); // mid or top?
3423
3424 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3425 << " dn " << dn << " (new dn)" << dendl;
3426 } else {
11fdf7f2 3427 ceph_assert(!dn->inode);
7c673cae
FG
3428 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
3429 << " dn " << dn << " (old dn)" << dendl;
3430 }
3431
3432 if (in) { // link to inode
11fdf7f2 3433 InodeRef tmp_ref;
7c673cae 3434 // only one parent for directories!
11fdf7f2
TL
3435 if (in->is_dir() && !in->dentries.empty()) {
3436 tmp_ref = in; // prevent unlink below from freeing the inode.
7c673cae 3437 Dentry *olddn = in->get_first_parent();
11fdf7f2 3438 ceph_assert(olddn->dir != dir || olddn->name != name);
7c673cae 3439 Inode *old_diri = olddn->dir->parent_inode;
7c673cae
FG
3440 clear_dir_complete_and_ordered(old_diri, true);
3441 unlink(olddn, true, true); // keep dir, dentry
3442 }
3443
11fdf7f2 3444 dn->link(in);
f67539c2 3445 inc_dentry_nr();
11fdf7f2 3446 ldout(cct, 20) << "link inode " << in << " parents now " << in->dentries << dendl;
7c673cae
FG
3447 }
3448
3449 return dn;
3450}
3451
3452void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3453{
11fdf7f2 3454 InodeRef in(dn->inode);
7c673cae
FG
3455 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3456 << " inode " << dn->inode << dendl;
3457
3458 // unlink from inode
11fdf7f2
TL
3459 if (dn->inode) {
3460 dn->unlink();
f67539c2 3461 dec_dentry_nr();
11fdf7f2 3462 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dentries << dendl;
7c673cae
FG
3463 }
3464
3465 if (keepdentry) {
3466 dn->lease_mds = -1;
3467 } else {
3468 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3469
3470 // unlink from dir
11fdf7f2
TL
3471 Dir *dir = dn->dir;
3472 dn->detach();
7c673cae
FG
3473
3474 // delete den
3475 lru.lru_remove(dn);
3476 dn->put();
11fdf7f2
TL
3477
3478 if (dir->is_empty() && !keepdir)
3479 close_dir(dir);
7c673cae
FG
3480 }
3481}
3482
3483/**
3484 * For asynchronous flushes, check for errors from the IO and
3485 * update the inode if necessary
3486 */
3487class C_Client_FlushComplete : public Context {
3488private:
3489 Client *client;
3490 InodeRef inode;
3491public:
3492 C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
3493 void finish(int r) override {
9f95a23c 3494 ceph_assert(ceph_mutex_is_locked_by_me(client->client_lock));
7c673cae
FG
3495 if (r != 0) {
3496 client_t const whoami = client->whoami; // For the benefit of ldout prefix
3497 ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3498 << " 0x" << std::hex << inode->ino << std::dec
3499 << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3500 inode->set_async_err(r);
3501 }
3502 }
3503};
3504
3505
3506/****
3507 * caps
3508 */
3509
3510void Client::get_cap_ref(Inode *in, int cap)
3511{
3512 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3513 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
11fdf7f2 3514 ldout(cct, 5) << __func__ << " got first FILE_BUFFER ref on " << *in << dendl;
b3b6e05e 3515 in->iget();
7c673cae
FG
3516 }
3517 if ((cap & CEPH_CAP_FILE_CACHE) &&
3518 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
11fdf7f2 3519 ldout(cct, 5) << __func__ << " got first FILE_CACHE ref on " << *in << dendl;
b3b6e05e 3520 in->iget();
7c673cae
FG
3521 }
3522 in->get_cap_ref(cap);
3523}
3524
3525void Client::put_cap_ref(Inode *in, int cap)
3526{
3527 int last = in->put_cap_ref(cap);
3528 if (last) {
3529 int put_nref = 0;
3530 int drop = last & ~in->caps_issued();
3531 if (in->snapid == CEPH_NOSNAP) {
f67539c2 3532 if ((last & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER)) &&
7c673cae
FG
3533 !in->cap_snaps.empty() &&
3534 in->cap_snaps.rbegin()->second.writing) {
11fdf7f2 3535 ldout(cct, 10) << __func__ << " finishing pending cap_snap on " << *in << dendl;
7c673cae
FG
3536 in->cap_snaps.rbegin()->second.writing = 0;
3537 finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
3538 signal_cond_list(in->waitfor_caps); // wake up blocked sync writers
3539 }
3540 if (last & CEPH_CAP_FILE_BUFFER) {
3541 for (auto &p : in->cap_snaps)
3542 p.second.dirty_data = 0;
3543 signal_cond_list(in->waitfor_commit);
11fdf7f2 3544 ldout(cct, 5) << __func__ << " dropped last FILE_BUFFER ref on " << *in << dendl;
7c673cae
FG
3545 ++put_nref;
3546 }
3547 }
3548 if (last & CEPH_CAP_FILE_CACHE) {
11fdf7f2 3549 ldout(cct, 5) << __func__ << " dropped last FILE_CACHE ref on " << *in << dendl;
7c673cae
FG
3550 ++put_nref;
3551 }
3552 if (drop)
3553 check_caps(in, 0);
3554 if (put_nref)
3555 put_inode(in, put_nref);
3556 }
3557}
3558
f67539c2
TL
3559// get caps for a given file handle -- the inode should have @need caps
3560// issued by the mds and @want caps not revoked (or not under revocation).
3561// this routine blocks till the cap requirement is satisfied. also account
3562// (track) for capability hit when required (when cap requirement succeedes).
f6b5b4d7 3563int Client::get_caps(Fh *fh, int need, int want, int *phave, loff_t endoff)
7c673cae 3564{
f6b5b4d7
TL
3565 Inode *in = fh->inode.get();
3566
7c673cae
FG
3567 int r = check_pool_perm(in, need);
3568 if (r < 0)
3569 return r;
3570
3571 while (1) {
3572 int file_wanted = in->caps_file_wanted();
3573 if ((file_wanted & need) != need) {
3574 ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
3575 << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
3576 << dendl;
f67539c2 3577 return -CEPHFS_EBADF;
7c673cae
FG
3578 }
3579
f6b5b4d7 3580 if ((fh->mode & CEPH_FILE_MODE_WR) && fh->gen != fd_gen)
f67539c2 3581 return -CEPHFS_EBADF;
f6b5b4d7
TL
3582
3583 if ((in->flags & I_ERROR_FILELOCK) && fh->has_any_filelocks())
f67539c2 3584 return -CEPHFS_EIO;
f6b5b4d7 3585
7c673cae
FG
3586 int implemented;
3587 int have = in->caps_issued(&implemented);
3588
3589 bool waitfor_caps = false;
3590 bool waitfor_commit = false;
3591
3592 if (have & need & CEPH_CAP_FILE_WR) {
1911f103
TL
3593 if (endoff > 0) {
3594 if ((endoff >= (loff_t)in->max_size ||
3595 endoff > (loff_t)(in->size << 1)) &&
3596 endoff > (loff_t)in->wanted_max_size) {
3597 ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
3598 in->wanted_max_size = endoff;
3599 }
3600 if (in->wanted_max_size > in->max_size &&
3601 in->wanted_max_size > in->requested_max_size)
3602 check_caps(in, 0);
7c673cae
FG
3603 }
3604
3605 if (endoff >= 0 && endoff > (loff_t)in->max_size) {
3606 ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
3607 waitfor_caps = true;
3608 }
3609 if (!in->cap_snaps.empty()) {
3610 if (in->cap_snaps.rbegin()->second.writing) {
3611 ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
3612 waitfor_caps = true;
3613 }
3614 for (auto &p : in->cap_snaps) {
3615 if (p.second.dirty_data) {
3616 waitfor_commit = true;
3617 break;
3618 }
3619 }
3620 if (waitfor_commit) {
3621 _flush(in, new C_Client_FlushComplete(this, in));
3622 ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
3623 }
3624 }
3625 }
3626
3627 if (!waitfor_caps && !waitfor_commit) {
3628 if ((have & need) == need) {
7c673cae
FG
3629 int revoking = implemented & ~have;
3630 ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
3631 << " need " << ccap_string(need) << " want " << ccap_string(want)
c07f9fc5 3632 << " revoking " << ccap_string(revoking)
7c673cae 3633 << dendl;
c07f9fc5 3634 if ((revoking & want) == 0) {
7c673cae
FG
3635 *phave = need | (have & want);
3636 in->get_cap_ref(need);
f67539c2 3637 cap_hit();
7c673cae
FG
3638 return 0;
3639 }
3640 }
3641 ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
3642 waitfor_caps = true;
3643 }
3644
1e59de90
TL
3645 if ((need & CEPH_CAP_FILE_WR) &&
3646 ((in->auth_cap && in->auth_cap->session->readonly) ||
3647 // userland clients are only allowed to read if fscrypt enabled
3648 in->is_fscrypt_enabled()))
f67539c2 3649 return -CEPHFS_EROFS;
7c673cae
FG
3650
3651 if (in->flags & I_CAP_DROPPED) {
3652 int mds_wanted = in->caps_mds_wanted();
3653 if ((mds_wanted & need) != need) {
3654 int ret = _renew_caps(in);
3655 if (ret < 0)
3656 return ret;
3657 continue;
3658 }
a8e16298 3659 if (!(file_wanted & ~mds_wanted))
7c673cae 3660 in->flags &= ~I_CAP_DROPPED;
7c673cae
FG
3661 }
3662
3663 if (waitfor_caps)
3664 wait_on_list(in->waitfor_caps);
3665 else if (waitfor_commit)
3666 wait_on_list(in->waitfor_commit);
3667 }
3668}
3669
3670int Client::get_caps_used(Inode *in)
3671{
3672 unsigned used = in->caps_used();
3673 if (!(used & CEPH_CAP_FILE_CACHE) &&
3674 !objectcacher->set_is_empty(&in->oset))
3675 used |= CEPH_CAP_FILE_CACHE;
3676 return used;
3677}
3678
3679void Client::cap_delay_requeue(Inode *in)
3680{
11fdf7f2 3681 ldout(cct, 10) << __func__ << " on " << *in << dendl;
2a845540
TL
3682
3683 in->hold_caps_until = ceph::coarse_mono_clock::now() + caps_release_delay;
28e407b8 3684 delayed_list.push_back(&in->delay_cap_item);
7c673cae
FG
3685}
3686
/*
 * Build and send a single CEPH_CAP_OP_UPDATE message for one cap to its MDS
 * session, releasing caps not in 'retain' and optionally flushing dirty
 * metadata (identified by flush/flush_tid).
 *
 * @param in        inode whose cap state is reported
 * @param session   MDS session owning 'cap'
 * @param cap       the cap being updated/released
 * @param flags     MClientCaps::FLAG_* bits to set on the message
 * @param used      caps currently in active use by the client
 * @param want      caps the client wants to keep
 * @param retain    caps the client would like to retain (revoked bits removed)
 * @param flush     dirty cap bits being flushed with this message
 * @param flush_tid transaction id identifying this flush
 */
void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
                      int flags, int used, int want, int retain,
                      int flush, ceph_tid_t flush_tid)
{
  int held = cap->issued | cap->implemented;
  // bits the MDS is in the middle of revoking: never keep those
  int revoking = cap->implemented & ~cap->issued;
  retain &= ~revoking;
  int dropping = cap->issued & ~retain;  // logged only
  int op = CEPH_CAP_OP_UPDATE;

  ldout(cct, 10) << __func__ << " " << *in
                 << " mds." << session->mds_num << " seq " << cap->seq
                 << " used " << ccap_string(used)
                 << " want " << ccap_string(want)
                 << " flush " << ccap_string(flush)
                 << " retain " << ccap_string(retain)
                 << " held "<< ccap_string(held)
                 << " revoking " << ccap_string(revoking)
                 << " dropping " << ccap_string(dropping)
                 << dendl;

  if (cct->_conf->client_inject_release_failure && revoking) {
    const int would_have_issued = cap->issued & retain;
    const int would_have_implemented = cap->implemented & (cap->issued | used);
    // Simulated bug:
    //  - tell the server we think issued is whatever they issued plus whatever we implemented
    //  - leave what we have implemented in place
    ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
    cap->issued = cap->issued | cap->implemented;

    // Make an exception for revoking xattr caps: we are injecting
    // failure to release other caps, but allow xattr because client
    // will block on xattr ops if it can't release these to MDS (#9800)
    const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
    cap->issued ^= xattr_mask & revoking;
    cap->implemented ^= xattr_mask & revoking;

    ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
    ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
  } else {
    // Normal behaviour: locally drop everything not retained or in use.
    cap->issued &= retain;
    cap->implemented &= cap->issued | used;
  }

  snapid_t follows = 0;

  // a flush must tell the MDS which snap context the dirty data follows
  if (flush)
    follows = in->snaprealm->get_snap_context().seq;

  auto m = make_message<MClientCaps>(op,
                                     in->ino,
                                     0,
                                     cap->cap_id, cap->seq,
                                     cap->implemented,
                                     want,
                                     flush,
                                     cap->mseq,
                                     cap_epoch_barrier);
  m->caller_uid = in->cap_dirtier_uid;
  m->caller_gid = in->cap_dirtier_gid;

  m->head.issue_seq = cap->issue_seq;
  m->set_tid(flush_tid);

  m->head.uid = in->uid;
  m->head.gid = in->gid;
  m->head.mode = in->mode;

  m->head.nlink = in->nlink;

  // only ship xattrs when flushing the xattr-exclusive cap
  if (flush & CEPH_CAP_XATTR_EXCL) {
    encode(in->xattrs, m->xattrbl);
    m->head.xattr_version = in->xattr_version;
  }

  m->size = in->size;
  m->max_size = in->max_size;
  m->truncate_seq = in->truncate_seq;
  m->truncate_size = in->truncate_size;
  m->mtime = in->mtime;
  m->atime = in->atime;
  m->ctime = in->ctime;
  m->btime = in->btime;
  m->time_warp_seq = in->time_warp_seq;
  m->change_attr = in->change_attr;
  m->fscrypt_auth = in->fscrypt_auth;
  m->fscrypt_file = in->fscrypt_file;

  // let the MDS know a capsnap flush is still pending for this inode
  if (!(flags & MClientCaps::FLAG_PENDING_CAPSNAP) &&
      !in->cap_snaps.empty() &&
      in->cap_snaps.rbegin()->second.flush_tid == 0)
    flags |= MClientCaps::FLAG_PENDING_CAPSNAP;
  m->flags = flags;

  if (flush & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  in->reported_size = in->size;
  m->set_snap_follows(follows);
  cap->wanted = want;
  if (cap == in->auth_cap) {
    // only the auth cap negotiates max_size with the MDS
    if (want & CEPH_CAP_ANY_FILE_WR) {
      m->set_max_size(in->wanted_max_size);
      in->requested_max_size = in->wanted_max_size;
      ldout(cct, 15) << "auth cap, requesting max_size " << in->requested_max_size << dendl;
    } else {
      in->requested_max_size = 0;
      ldout(cct, 15) << "auth cap, reset requested_max_size due to not wanting any file write cap" << dendl;
    }
  }

  if (!session->flushing_caps_tids.empty())
    m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message2(std::move(m));
}
3806
31f18b77
FG
3807static bool is_max_size_approaching(Inode *in)
3808{
3809 /* mds will adjust max size according to the reported size */
3810 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3811 return false;
3812 if (in->size >= in->max_size)
3813 return true;
3814 /* half of previous max_size increment has been used */
3815 if (in->max_size > in->reported_size &&
3816 (in->size << 1) >= in->max_size + in->reported_size)
3817 return true;
3818 return false;
3819}
7c673cae 3820
11fdf7f2
TL
3821static int adjust_caps_used_for_lazyio(int used, int issued, int implemented)
3822{
3823 if (!(used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER)))
3824 return used;
3825 if (!(implemented & CEPH_CAP_FILE_LAZYIO))
3826 return used;
3827
3828 if (issued & CEPH_CAP_FILE_LAZYIO) {
3829 if (!(issued & CEPH_CAP_FILE_CACHE)) {
3830 used &= ~CEPH_CAP_FILE_CACHE;
3831 used |= CEPH_CAP_FILE_LAZYIO;
3832 }
3833 if (!(issued & CEPH_CAP_FILE_BUFFER)) {
3834 used &= ~CEPH_CAP_FILE_BUFFER;
3835 used |= CEPH_CAP_FILE_LAZYIO;
3836 }
3837 } else {
3838 if (!(implemented & CEPH_CAP_FILE_CACHE)) {
3839 used &= ~CEPH_CAP_FILE_CACHE;
3840 used |= CEPH_CAP_FILE_LAZYIO;
3841 }
3842 if (!(implemented & CEPH_CAP_FILE_BUFFER)) {
3843 used &= ~CEPH_CAP_FILE_BUFFER;
3844 used |= CEPH_CAP_FILE_LAZYIO;
3845 }
3846 }
3847 return used;
3848}
3849
7c673cae
FG
/**
 * check_caps
 *
 * Examine currently used and wanted versus held caps.  Release, flush or ack
 * revoked caps to the MDS as appropriate.
 *
 * @param in the inode to check
 * @param flags flags to apply to cap check (CHECK_CAPS_NODELAY forces an
 *              immediate message instead of requeueing on the delay list;
 *              CHECK_CAPS_SYNCHRONOUS marks a flush message FLAG_SYNC)
 */
void Client::check_caps(Inode *in, unsigned flags)
{
  unsigned wanted = in->caps_wanted();
  unsigned used = get_caps_used(in);
  unsigned cap_used;

  int implemented;
  int issued = in->caps_issued(&implemented);
  int revoking = implemented & ~issued;

  int orig_used = used;
  used = adjust_caps_used_for_lazyio(used, issued, implemented);

  // Compute what we would like to retain beyond what is strictly needed.
  int retain = wanted | used | CEPH_CAP_PIN;
  if (!is_unmounting() && in->nlink > 0) {
    if (wanted) {
      retain |= CEPH_CAP_ANY;
    } else if (in->is_dir() &&
               (issued & CEPH_CAP_FILE_SHARED) &&
               (in->flags & I_COMPLETE)) {
      // we do this here because we don't want to drop to Fs (and then
      // drop the Fs if we do a create!) if that alone makes us send lookups
      // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
      wanted = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
      retain |= wanted;
    } else {
      retain |= CEPH_CAP_ANY_SHARED;
      // keep RD only if we didn't have the file open RW,
      // because then the mds would revoke it anyway to
      // journal max_size=0.
      if (in->max_size == 0)
        retain |= CEPH_CAP_ANY_RD;
    }
  }

  ldout(cct, 10) << __func__ << " on " << *in
                 << " wanted " << ccap_string(wanted)
                 << " used " << ccap_string(used)
                 << " issued " << ccap_string(issued)
                 << " revoking " << ccap_string(revoking)
                 << " flags=" << flags
                 << dendl;

  if (in->snapid != CEPH_NOSNAP)
    return; //snap caps last forever, can't write

  if (in->caps.empty())
    return; // guard if at end of func

  // Try to drop the page cache so Fc/Fl revocation can complete, unless
  // buffered writes are still in flight.
  if (!(orig_used & CEPH_CAP_FILE_BUFFER) &&
      (revoking & used & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
    if (_release(in))
      used &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
  }

  // Walk every cap (one per MDS) and decide per cap whether to send an update.
  for (auto &[mds, cap] : in->caps) {
    auto session = mds_sessions.at(mds);

    cap_used = used;
    // usage covered by the auth cap does not pin a non-auth cap
    if (in->auth_cap && &cap != in->auth_cap)
      cap_used &= ~in->auth_cap->issued;

    revoking = cap.implemented & ~cap.issued;

    ldout(cct, 10) << " cap mds." << mds
                   << " issued " << ccap_string(cap.issued)
                   << " implemented " << ccap_string(cap.implemented)
                   << " revoking " << ccap_string(revoking) << dendl;

    // need a larger max_size from the auth MDS?
    if (in->wanted_max_size > in->max_size &&
        in->wanted_max_size > in->requested_max_size &&
        &cap == in->auth_cap)
      goto ack;

    /* approaching file_max? */
    if ((cap.issued & CEPH_CAP_FILE_WR) &&
        &cap == in->auth_cap &&
        is_max_size_approaching(in)) {
      ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
                     << ", reported " << in->reported_size << dendl;
      goto ack;
    }

    /* completed revocation? */
    if (revoking && (revoking & cap_used) == 0) {
      ldout(cct, 10) << "completed revocation of " << ccap_string(cap.implemented & ~cap.issued) << dendl;
      goto ack;
    }

    /* want more caps from mds? */
    if (wanted & ~(cap.wanted | cap.issued))
      goto ack;

    // on unmount, drop caps that are completely idle
    if (!revoking && is_unmounting() && (cap_used == 0))
      goto ack;

    if ((cap.issued & ~retain) == 0 && // and we don't have anything we wouldn't like
        !in->dirty_caps)               // and we have no dirty caps
      continue;

    if (!(flags & CHECK_CAPS_NODELAY)) {
      ldout(cct, 10) << "delaying cap release" << dendl;
      cap_delay_requeue(in);
      continue;
    }

  ack:
    if (&cap == in->auth_cap) {
      // re-send any interrupted cap flushes before the new update
      if (in->flags & I_KICK_FLUSH) {
        ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
                       << " to mds." << mds << dendl;
        kick_flushing_caps(in, session.get());
      }
      if (!in->cap_snaps.empty() &&
          in->cap_snaps.rbegin()->second.flush_tid == 0)
        flush_snaps(in);
    }

    int flushing;
    int msg_flags = 0;
    ceph_tid_t flush_tid;
    if (in->auth_cap == &cap && in->dirty_caps) {
      flushing = mark_caps_flushing(in, &flush_tid);
      if (flags & CHECK_CAPS_SYNCHRONOUS)
        msg_flags |= MClientCaps::FLAG_SYNC;
    } else {
      flushing = 0;
      flush_tid = 0;
    }

    in->delay_cap_item.remove_myself();
    send_cap(in, session.get(), &cap, msg_flags, cap_used, wanted, retain,
             flushing, flush_tid);
  }
}
3994
3995
/*
 * Capture the inode's dirty metadata/data state into a CapSnap for the
 * snapshot context 'old_snapc', so it can later be flushed to the MDS as a
 * FLUSHSNAP.  If nothing is dirty or being written, no capsnap is queued.
 */
void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
{
  int used = get_caps_used(in);
  int dirty = in->caps_dirty();
  ldout(cct, 10) << __func__ << " " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;

  if (in->cap_snaps.size() &&
      in->cap_snaps.rbegin()->second.writing) {
    // a capsnap is already waiting for in-flight writes to drain
    ldout(cct, 10) << __func__ << " already have pending cap_snap on " << *in << dendl;
    return;
  } else if (in->caps_dirty() ||
             (used & CEPH_CAP_FILE_WR) ||
             (dirty & CEPH_CAP_ANY_WR)) {
    // keyed by the old snap context's seq; construction must not copy (CapSnap
    // holds an InodeRef), hence piecewise construct
    const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
    ceph_assert(capsnapem.second); /* element inserted */
    CapSnap &capsnap = capsnapem.first->second;
    capsnap.context = old_snapc;
    capsnap.issued = in->caps_issued();
    capsnap.dirty = in->caps_dirty();

    capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);

    // snapshot the metadata values as of the snap boundary
    capsnap.uid = in->uid;
    capsnap.gid = in->gid;
    capsnap.mode = in->mode;
    capsnap.btime = in->btime;
    capsnap.xattrs = in->xattrs;
    capsnap.xattr_version = in->xattr_version;
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;

    if (used & CEPH_CAP_FILE_WR) {
      // writes in progress: finish_cap_snap() runs once they complete
      ldout(cct, 10) << __func__ << " WR used on " << *in << dendl;
      capsnap.writing = 1;
    } else {
      finish_cap_snap(in, capsnap, used);
    }
  } else {
    ldout(cct, 10) << __func__ << " not dirty|writing on " << *in << dendl;
  }
}
4037
/*
 * Finalize a queued CapSnap once writes have stopped: record the final size
 * and timestamps, then either wait for buffered data to flush (Fb still used)
 * or immediately hand the capsnap to flush_snaps().
 */
void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
{
  ldout(cct, 10) << __func__ << " " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
  capsnap.size = in->size;
  capsnap.mtime = in->mtime;
  capsnap.atime = in->atime;
  capsnap.ctime = in->ctime;
  capsnap.time_warp_seq = in->time_warp_seq;
  capsnap.change_attr = in->change_attr;
  capsnap.dirty |= in->caps_dirty();

  /* Only reset it if it wasn't set before */
  if (capsnap.cap_dirtier_uid == -1) {
    capsnap.cap_dirtier_uid = in->cap_dirtier_uid;
    capsnap.cap_dirtier_gid = in->cap_dirtier_gid;
  }

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    capsnap.inline_data = in->inline_data;
    capsnap.inline_version = in->inline_version;
  }

  if (used & CEPH_CAP_FILE_BUFFER) {
    // dirty buffers still pending; the flush completion will re-drive this
    capsnap.writing = 1;
    ldout(cct, 10) << __func__ << " " << *in << " cap_snap " << &capsnap << " used " << used
                   << " WRBUFFER, delaying" << dendl;
  } else {
    capsnap.dirty_data = 0;
    flush_snaps(in);
  }
}
4069
eafe8130
TL
/*
 * Send a CEPH_CAP_OP_FLUSHSNAP message carrying the frozen metadata of one
 * CapSnap to the given MDS session.
 *
 * @param in       inode owning the capsnap
 * @param session  session of the inode's auth MDS
 * @param follows  snap seq this flush follows
 * @param capsnap  the capsnap to flush (flush_tid must already be assigned)
 */
void Client::send_flush_snap(Inode *in, MetaSession *session,
                             snapid_t follows, CapSnap& capsnap)
{
  auto m = make_message<MClientCaps>(CEPH_CAP_OP_FLUSHSNAP,
                                     in->ino, in->snaprealm->ino, 0,
                                     in->auth_cap->mseq, cap_epoch_barrier);
  m->caller_uid = capsnap.cap_dirtier_uid;
  m->caller_gid = capsnap.cap_dirtier_gid;

  m->set_client_tid(capsnap.flush_tid);
  m->head.snap_follows = follows;

  m->head.caps = capsnap.issued;
  m->head.dirty = capsnap.dirty;

  // all attribute values come from the capsnap, not the live inode
  m->head.uid = capsnap.uid;
  m->head.gid = capsnap.gid;
  m->head.mode = capsnap.mode;
  m->btime = capsnap.btime;

  m->size = capsnap.size;

  m->head.xattr_version = capsnap.xattr_version;
  encode(capsnap.xattrs, m->xattrbl);

  m->ctime = capsnap.ctime;
  m->btime = capsnap.btime;
  m->mtime = capsnap.mtime;
  m->atime = capsnap.atime;
  m->time_warp_seq = capsnap.time_warp_seq;
  m->change_attr = capsnap.change_attr;

  if (capsnap.dirty & CEPH_CAP_FILE_WR) {
    m->inline_version = in->inline_version;
    m->inline_data = in->inline_data;
  }

  // capsnap.flush_tid was registered in session->flushing_caps_tids by caller
  ceph_assert(!session->flushing_caps_tids.empty());
  m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());

  session->con->send_message2(std::move(m));
}
4112
/*
 * Flush all not-yet-flushed capsnaps of 'in' to the auth MDS, in snap order.
 * Stops at the first capsnap that still has dirty data or active writers,
 * since later snaps cannot be flushed before it.
 */
void Client::flush_snaps(Inode *in)
{
  ldout(cct, 10) << "flush_snaps on " << *in << dendl;
  ceph_assert(in->cap_snaps.size());

  // pick auth mds
  ceph_assert(in->auth_cap);
  MetaSession *session = in->auth_cap->session;

  for (auto &p : in->cap_snaps) {
    CapSnap &capsnap = p.second;
    // only do new flush
    if (capsnap.flush_tid > 0)
      continue;

    ldout(cct, 10) << "flush_snaps mds." << session->mds_num
                   << " follows " << p.first
                   << " size " << capsnap.size
                   << " mtime " << capsnap.mtime
                   << " dirty_data=" << capsnap.dirty_data
                   << " writing=" << capsnap.writing
                   << " on " << *in << dendl;
    // capsnaps must flush in order; stop at one that is not ready yet
    if (capsnap.dirty_data || capsnap.writing)
      break;

    // register the flush tid before sending so acks can be matched
    capsnap.flush_tid = ++last_flush_tid;
    session->flushing_caps_tids.insert(capsnap.flush_tid);
    in->flushing_cap_tids[capsnap.flush_tid] = 0;
    if (!in->flushing_cap_item.is_on_list())
      session->flushing_caps.push_back(&in->flushing_cap_item);

    send_flush_snap(in, session, p.first, capsnap);
  }
}
4147
9f95a23c 4148void Client::wait_on_list(list<ceph::condition_variable*>& ls)
7c673cae 4149{
9f95a23c 4150 ceph::condition_variable cond;
7c673cae 4151 ls.push_back(&cond);
9f95a23c
TL
4152 std::unique_lock l{client_lock, std::adopt_lock};
4153 cond.wait(l);
4154 l.release();
7c673cae
FG
4155 ls.remove(&cond);
4156}
4157
9f95a23c 4158void Client::signal_cond_list(list<ceph::condition_variable*>& ls)
7c673cae 4159{
9f95a23c
TL
4160 for (auto cond : ls) {
4161 cond->notify_all();
4162 }
7c673cae
FG
4163}
4164
/*
 * Block until a Context appended to 'ls' is completed (via
 * signal_context_list).  Uses the same adopt/release trick as wait_on_list
 * so client_lock remains held by the caller on return.
 */
void Client::wait_on_context_list(list<Context*>& ls)
{
  ceph::condition_variable cond;
  bool done = false;
  int r;  // completion result; written by C_Cond but not inspected here
  ls.push_back(new C_Cond(cond, &done, &r));
  std::unique_lock l{client_lock, std::adopt_lock};
  cond.wait(l, [&done] { return done;});
  l.release();  // keep client_lock held on return
}
4175
4176void Client::signal_context_list(list<Context*>& ls)
4177{
4178 while (!ls.empty()) {
4179 ls.front()->complete(0);
4180 ls.pop_front();
4181 }
4182}
4183
a8e16298 4184void Client::wake_up_session_caps(MetaSession *s, bool reconnect)
7c673cae 4185{
11fdf7f2
TL
4186 for (const auto &cap : s->caps) {
4187 auto &in = cap->inode;
a8e16298 4188 if (reconnect) {
11fdf7f2
TL
4189 in.requested_max_size = 0;
4190 in.wanted_max_size = 0;
a8e16298
TL
4191 } else {
4192 if (cap->gen < s->cap_gen) {
4193 // mds did not re-issue stale cap.
4194 cap->issued = cap->implemented = CEPH_CAP_PIN;
4195 // make sure mds knows what we want.
11fdf7f2
TL
4196 if (in.caps_file_wanted() & ~cap->wanted)
4197 in.flags |= I_CAP_DROPPED;
a8e16298
TL
4198 }
4199 }
11fdf7f2 4200 signal_cond_list(in.waitfor_caps);
7c673cae
FG
4201 }
4202}
4203
4204
4205// flush dirty data (from objectcache)
4206
/*
 * Deferred cache-invalidation callback: captures the inode's (possibly
 * faked) vino plus the byte range at construction time, and later invokes
 * Client::_async_invalidate from the async invalidator thread, outside
 * client_lock.
 */
class C_Client_CacheInvalidate : public Context {
private:
  Client *client;
  vinodeno_t ino;          // snapshot of the inode identity (inode may be gone by finish())
  int64_t offset, length;  // byte range to invalidate
public:
  C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
    client(c), offset(off), length(len) {
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    // _async_invalidate takes the lock when it needs to, call this back from outside of lock.
    ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
    client->_async_invalidate(ino, offset, length);
  }
};
4226
/*
 * Invoke the registered kernel-cache invalidation callback for the given
 * vino and byte range.  Runs on the async invalidator thread; bails out
 * quietly if the client is no longer in a mounted/mounting state.
 */
void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  ldout(cct, 10) << __func__ << " " << ino << " " << off << "~" << len << dendl;
  ino_invalidate_cb(callback_handle, ino, off, len);
}
4236
4237void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
4238
4239 if (ino_invalidate_cb)
4240 // we queue the invalidate, which calls the callback and decrements the ref
4241 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
4242}
4243
/*
 * Drop the entire cached object set of 'in' from the userspace object
 * cacher (when enabled) and schedule a full kernel-cache invalidation.
 */
void Client::_invalidate_inode_cache(Inode *in)
{
  ldout(cct, 10) << __func__ << " " << *in << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    objectcacher->release_set(&in->oset);
    // release_set only drops clean buffers; anything left is dirty/in-flight
    if (!objectcacher->set_is_empty(&in->oset))
      lderr(cct) << "failed to invalidate cache for " << *in << dendl;
  }

  _schedule_invalidate_callback(in, 0, 0);
}
4257
/*
 * Range variant: discard cached buffers covering [off, off+len) of 'in'
 * from the object cacher and schedule the matching kernel-cache
 * invalidation for that range.
 */
void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
{
  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // invalidate our userspace inode cache
  if (cct->_conf->client_oc) {
    vector<ObjectExtent> ls;
    // map the file range onto object extents according to the file layout
    Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
    objectcacher->discard_writeback(&in->oset, ls, nullptr);
  }

  _schedule_invalidate_callback(in, off, len);
}
4271
4272bool Client::_release(Inode *in)
4273{
4274 ldout(cct, 20) << "_release " << *in << dendl;
4275 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
4276 _invalidate_inode_cache(in);
4277 return true;
4278 }
4279 return false;
4280}
4281
/*
 * Flush any dirty buffered data of 'in' through the object cacher.
 *
 * @param onfinish completion fired when the flush is done (or immediately
 *                 if there is nothing to do / the pool is full)
 * @return true if the flush completed synchronously (onfinish already run),
 *         false if it is in progress and onfinish fires later
 */
bool Client::_flush(Inode *in, Context *onfinish)
{
  ldout(cct, 10) << "_flush " << *in << dendl;

  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    onfinish->complete(0);
    return true;
  }

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    // pool is full: dirty data cannot be written out; drop it and report ENOSPC
    ldout(cct, 8) << __func__ << ": FULL, purging for ENOSPC" << dendl;
    objectcacher->purge_set(&in->oset);
    if (onfinish) {
      onfinish->complete(-CEPHFS_ENOSPC);
    }
    return true;
  }

  return objectcacher->flush_set(&in->oset, onfinish);
}
4303
/*
 * Synchronously flush the byte range [offset, offset+size) of 'in'.
 * client_lock must be held; it is dropped while waiting for the flush to
 * complete and re-taken before returning.
 */
void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
  if (!in->oset.dirty_or_tx) {
    ldout(cct, 10) << " nothing to flush" << dendl;
    return;
  }

  C_SaferCond onflush("Client::_flush_range flock");
  bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
                                      offset, size, &onflush);
  if (!ret) {
    // wait for flush; drop client_lock so completions can make progress
    client_lock.unlock();
    onflush.wait();
    client_lock.lock();
  }
}
4322
/*
 * Object cacher notification that an object set finished flushing.
 * Invoked from the dispatch path with client_lock already held; forwards
 * to _flushed() on the owning inode.
 */
void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
{
  // std::scoped_lock l(client_lock);
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock)); // will be called via dispatch() -> objecter -> ...
  Inode *in = static_cast<Inode *>(oset->parent);
  ceph_assert(in);
  _flushed(in);
}
4331
/*
 * Dirty data of 'in' has been written back: release the Fc|Fb cap
 * references taken while the data was buffered.
 */
void Client::_flushed(Inode *in)
{
  ldout(cct, 10) << "_flushed " << *in << dendl;

  put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
}
4338
4339
4340
// checks common to add_update_cap, handle_cap_grant
/*
 * React to a change in the issued cap set: bump the cache generation when
 * Fc is newly granted, and bump the shared generation / drop directory
 * completeness when Fs toggles.
 */
void Client::check_cap_issue(Inode *in, unsigned issued)
{
  unsigned had = in->caps_issued();

  // Fc newly granted: previously cached data may be stale
  if ((issued & CEPH_CAP_FILE_CACHE) &&
      !(had & CEPH_CAP_FILE_CACHE))
    in->cache_gen++;

  if ((issued & CEPH_CAP_FILE_SHARED) !=
      (had & CEPH_CAP_FILE_SHARED)) {
    if (issued & CEPH_CAP_FILE_SHARED)
      in->shared_gen++;
    // any Fs transition invalidates cached dentry completeness/order
    if (in->is_dir())
      clear_dir_complete_and_ordered(in, true);
  }
}
4358
/*
 * Add a new cap for 'in' from the given MDS session, or merge an update
 * into the existing one.  Also maintains snaprealm membership, auth-cap
 * selection, and wakes waiters when new cap bits arrive.
 *
 * @param in          the inode
 * @param mds_session session granting the cap
 * @param cap_id      MDS-assigned cap id
 * @param issued      cap bits being granted
 * @param wanted      cap bits the MDS believes we want
 * @param seq/mseq    cap sequence / migration sequence numbers
 * @param realm       snaprealm ino the inode belongs to (or -1 if unknown)
 * @param flags       CEPH_CAP_FLAG_* (AUTH marks this the auth cap)
 * @param cap_perms   credentials associated with the latest cap request
 */
void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
                            unsigned issued, unsigned wanted, unsigned seq, unsigned mseq,
                            inodeno_t realm, int flags, const UserPerm& cap_perms)
{
  if (!in->is_any_caps()) {
    // first cap: join the snaprealm
    ceph_assert(in->snaprealm == 0);
    in->snaprealm = get_snap_realm(realm);
    in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
    ldout(cct, 15) << __func__ << " first one, opened snaprealm " << in->snaprealm << dendl;
  } else {
    ceph_assert(in->snaprealm);
    // auth MDS may move the inode to a different snaprealm
    if ((flags & CEPH_CAP_FLAG_AUTH) &&
        realm != inodeno_t(-1) && in->snaprealm->ino != realm) {
      in->snaprealm_item.remove_myself();
      auto oldrealm = in->snaprealm;
      in->snaprealm = get_snap_realm(realm);
      in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
      put_snap_realm(oldrealm);
    }
  }

  mds_rank_t mds = mds_session->mds_num;
  // emplace: constructs a fresh Cap only if this MDS had none yet
  const auto &capem = in->caps.emplace(std::piecewise_construct, std::forward_as_tuple(mds), std::forward_as_tuple(*in, mds_session));
  Cap &cap = capem.first->second;
  if (!capem.second) {
    // existing cap; a stale generation means it degraded to a bare PIN
    if (cap.gen < mds_session->cap_gen)
      cap.issued = cap.implemented = CEPH_CAP_PIN;

    /*
     * auth mds of the inode changed. we received the cap export
     * message, but still haven't received the cap import message.
     * handle_cap_export() updated the new auth MDS' cap.
     *
     * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
     * a message that was send before the cap import message. So
     * don't remove caps.
     */
    if (ceph_seq_cmp(seq, cap.seq) <= 0) {
      if (&cap != in->auth_cap)
        ldout(cct, 0) << "WARNING: " << "inode " << *in << " caps on mds." << mds << " != auth_cap." << dendl;

      ceph_assert(cap.cap_id == cap_id);
      seq = cap.seq;
      mseq = cap.mseq;
      issued |= cap.issued;
      flags |= CEPH_CAP_FLAG_AUTH;
    }
  } else {
    // brand-new cap pins the inode
    inc_pinned_icaps();
  }

  check_cap_issue(in, issued);

  if (flags & CEPH_CAP_FLAG_AUTH) {
    // adopt as auth cap if newer (by migration seq) than the current one
    if (in->auth_cap != &cap &&
        (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
      if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
        ldout(cct, 10) << __func__ << " changing auth cap: "
                       << "add myself to new auth MDS' flushing caps list" << dendl;
        adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
      }
      in->auth_cap = &cap;
    }
  }

  unsigned old_caps = cap.issued;
  cap.cap_id = cap_id;
  cap.issued = issued;
  cap.implemented |= issued;
  // newer mseq replaces 'wanted' outright; otherwise accumulate
  if (ceph_seq_cmp(mseq, cap.mseq) > 0)
    cap.wanted = wanted;
  else
    cap.wanted |= wanted;
  cap.seq = seq;
  cap.issue_seq = seq;
  cap.mseq = mseq;
  cap.gen = mds_session->cap_gen;
  cap.latest_perms = cap_perms;
  ldout(cct, 10) << __func__ << " issued " << ccap_string(old_caps) << " -> " << ccap_string(cap.issued)
                 << " from mds." << mds
                 << " on " << *in
                 << dendl;

  if ((issued & ~old_caps) && in->auth_cap == &cap) {
    // non-auth MDS is revoking the newly grant caps ?
    for (auto &p : in->caps) {
      if (&p.second == &cap)
        continue;
      if (p.second.implemented & ~p.second.issued & issued) {
        check_caps(in, CHECK_CAPS_NODELAY);
        break;
      }
    }
  }

  // new bits may unblock waiters
  if (issued & ~old_caps)
    signal_cond_list(in->waitfor_caps);
}
4457
/*
 * Remove one cap from its inode and session.
 *
 * @param cap           the cap to drop (invalid after this call)
 * @param queue_release true to queue a cap-release message to the MDS,
 *                      false to drop silently (e.g. session teardown)
 */
void Client::remove_cap(Cap *cap, bool queue_release)
{
  auto &in = cap->inode;
  MetaSession *session = cap->session;
  mds_rank_t mds = cap->session->mds_num;

  ldout(cct, 10) << __func__ << " mds." << mds << " on " << in << dendl;

  if (queue_release) {
    session->enqueue_cap_release(
      in.ino,
      cap->cap_id,
      cap->issue_seq,
      cap->mseq,
      cap_epoch_barrier);
  } else {
    // no release message: account the pinned-cap drop here
    dec_pinned_icaps();
  }


  if (in.auth_cap == cap) {
    if (in.flushing_cap_item.is_on_list()) {
      ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
      in.flushing_cap_item.remove_myself();
    }
    in.auth_cap = NULL;
  }
  // erasing from in.caps destroys the Cap object itself
  size_t n = in.caps.erase(mds);
  ceph_assert(n == 1);
  cap = nullptr;

  if (!in.is_any_caps()) {
    // last cap gone: leave the snaprealm
    ldout(cct, 15) << __func__ << " last one, closing snaprealm " << in.snaprealm << dendl;
    in.snaprealm_item.remove_myself();
    put_snap_realm(in.snaprealm);
    in.snaprealm = 0;
  }
}
4496
4497void Client::remove_all_caps(Inode *in)
4498{
4499 while (!in->caps.empty())
11fdf7f2 4500 remove_cap(&in->caps.begin()->second, true);
7c673cae
FG
4501}
4502
f6b5b4d7 4503void Client::remove_session_caps(MetaSession *s, int err)
7c673cae 4504{
11fdf7f2 4505 ldout(cct, 10) << __func__ << " mds." << s->mds_num << dendl;
7c673cae
FG
4506
4507 while (s->caps.size()) {
4508 Cap *cap = *s->caps.begin();
11fdf7f2 4509 InodeRef in(&cap->inode);
eafe8130 4510 bool dirty_caps = false;
7c673cae 4511 if (in->auth_cap == cap) {
7c673cae
FG
4512 dirty_caps = in->dirty_caps | in->flushing_caps;
4513 in->wanted_max_size = 0;
4514 in->requested_max_size = 0;
f6b5b4d7
TL
4515 if (in->has_any_filelocks())
4516 in->flags |= I_ERROR_FILELOCK;
7c673cae 4517 }
f6b5b4d7 4518 auto caps = cap->implemented;
a8e16298
TL
4519 if (cap->wanted | cap->issued)
4520 in->flags |= I_CAP_DROPPED;
7c673cae 4521 remove_cap(cap, false);
eafe8130 4522 in->cap_snaps.clear();
7c673cae 4523 if (dirty_caps) {
11fdf7f2 4524 lderr(cct) << __func__ << " still has dirty|flushing caps on " << *in << dendl;
7c673cae
FG
4525 if (in->flushing_caps) {
4526 num_flushing_caps--;
4527 in->flushing_cap_tids.clear();
4528 }
4529 in->flushing_caps = 0;
28e407b8 4530 in->mark_caps_clean();
11fdf7f2 4531 put_inode(in.get());
7c673cae 4532 }
f6b5b4d7
TL
4533 caps &= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER;
4534 if (caps && !in->caps_issued_mask(caps, true)) {
f67539c2 4535 if (err == -CEPHFS_EBLOCKLISTED) {
f6b5b4d7
TL
4536 if (in->oset.dirty_or_tx) {
4537 lderr(cct) << __func__ << " still has dirty data on " << *in << dendl;
4538 in->set_async_err(err);
4539 }
4540 objectcacher->purge_set(&in->oset);
4541 } else {
4542 objectcacher->release_set(&in->oset);
4543 }
4544 _schedule_invalidate_callback(in.get(), 0, 0);
4545 }
4546
a8e16298 4547 signal_cond_list(in->waitfor_caps);
7c673cae
FG
4548 }
4549 s->flushing_caps_tids.clear();
9f95a23c 4550 sync_cond.notify_all();
7c673cae
FG
4551}
4552
1d09f67e 4553std::pair<int, bool> Client::_do_remount(bool retry_on_error)
b32b8144 4554{
39ae355f 4555 uint64_t max_retries = cct->_conf.get_val<uint64_t>("client_max_retries_on_remount_failure");
1d09f67e 4556 bool abort_on_failure = false;
91327a77 4557
b32b8144
FG
4558 errno = 0;
4559 int r = remount_cb(callback_handle);
91327a77
AA
4560 if (r == 0) {
4561 retries_on_invalidate = 0;
4562 } else {
b32b8144
FG
4563 int e = errno;
4564 client_t whoami = get_nodeid();
4565 if (r == -1) {
4566 lderr(cct) <<
4567 "failed to remount (to trim kernel dentries): "
4568 "errno = " << e << " (" << strerror(e) << ")" << dendl;
4569 } else {
4570 lderr(cct) <<
4571 "failed to remount (to trim kernel dentries): "
4572 "return code = " << r << dendl;
4573 }
91327a77 4574 bool should_abort =
11fdf7f2
TL
4575 (cct->_conf.get_val<bool>("client_die_on_failed_remount") ||
4576 cct->_conf.get_val<bool>("client_die_on_failed_dentry_invalidate")) &&
91327a77 4577 !(retry_on_error && (++retries_on_invalidate < max_retries));
f67539c2 4578 if (should_abort && !is_unmounting()) {
b32b8144 4579 lderr(cct) << "failed to remount for kernel dentry trimming; quitting!" << dendl;
1d09f67e 4580 abort_on_failure = true;
b32b8144
FG
4581 }
4582 }
1d09f67e 4583 return std::make_pair(r, abort_on_failure);
b32b8144
FG
4584}
4585
7c673cae
FG
/*
 * Finisher context that performs the deferred remount (kernel dentry trim)
 * on the remount_finisher thread, aborting the process if _do_remount
 * reports an unrecoverable failure.
 */
class C_Client_Remount : public Context  {
private:
  Client *client;
public:
  explicit C_Client_Remount(Client *c) : client(c) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    auto result = client->_do_remount(true);
    if (result.second) {
      // configured to die on persistent remount failure
      ceph_abort();
    }
  }
};
4599
/*
 * Ask the kernel to drop cached dentries: either by invalidating each root
 * dentry through the dentry-invalidate callback, or — when that is not
 * possible — by queueing a remount, which makes the kernel trim all unused
 * dentries as a side effect.
 */
void Client::_invalidate_kernel_dcache()
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  if (can_invalidate_dentries) {
    if (dentry_invalidate_cb && root->dir) {
      for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
           p != root->dir->dentries.end();
           ++p) {
        if (p->second->inode)
          _schedule_invalidate_dentry_callback(p->second, false);
      }
    }
  } else if (remount_cb) {
    // Hacky:
    // when remounting a file system, linux kernel trims all unused dentries in the fs
    remount_finisher.queue(new C_Client_Remount(this));
  }
}
4621
91327a77
AA
/*
 * If every dentry under directory 'in' is negative (no inode), unlink the
 * expireable ones and close the Dir when it empties.  Recurses into an open
 * snapdir so its negative children are trimmed too.
 */
void Client::_trim_negative_child_dentries(InodeRef& in)
{
  if (!in->is_dir())
    return;

  Dir* dir = in->dir;
  // only act when the directory consists solely of null dentries
  if (dir && dir->dentries.size() == dir->num_null_dentries) {
    for (auto p = dir->dentries.begin(); p != dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink invalidates the iterator's entry
      ceph_assert(!dn->inode);
      if (dn->lru_is_expireable())
        unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (dir->dentries.empty()) {
      close_dir(dir);
    }
  }

  if (in->flags & I_SNAPDIR_OPEN) {
    InodeRef snapdir = open_snapdir(in.get());
    _trim_negative_child_dentries(snapdir);
  }
}
4646
e306af50
TL
/*
 * Deferred inode-release callback: captures the inode's (possibly faked)
 * vino at construction and later invokes Client::_async_inode_release from
 * the async releasor thread, outside client_lock.
 */
class C_Client_CacheRelease : public Context  {
private:
  Client *client;
  vinodeno_t ino;  // snapshot of the inode identity (inode may be gone by finish())
public:
  C_Client_CacheRelease(Client *c, Inode *in) :
    client(c) {
    if (client->use_faked_inos())
      ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
    else
      ino = in->vino();
  }
  void finish(int r) override {
    ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
    client->_async_inode_release(ino);
  }
};
4664
// Invoke the registered inode-release callback for @ino.
// Called from the finisher thread (not under client_lock); bails out
// quietly if the client is no longer at least in the MOUNTING state.
void Client::_async_inode_release(vinodeno_t ino)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  ldout(cct, 10) << __func__ << " " << ino << dendl;
  ino_release_cb(callback_handle, ino);
}
4674
// Queue an asynchronous inode-release notification for @in, if a release
// callback has been registered.
void Client::_schedule_ino_release_callback(Inode *in) {

  if (ino_release_cb)
    // we queue the invalidate, which calls the callback and decrements the ref
    async_ino_releasor.queue(new C_Client_CacheRelease(this, in));
}
4681
28e407b8 4682void Client::trim_caps(MetaSession *s, uint64_t max)
7c673cae
FG
4683{
4684 mds_rank_t mds = s->mds_num;
28e407b8 4685 size_t caps_size = s->caps.size();
11fdf7f2 4686 ldout(cct, 10) << __func__ << " mds." << mds << " max " << max
7c673cae
FG
4687 << " caps " << caps_size << dendl;
4688
28e407b8
AA
4689 uint64_t trimmed = 0;
4690 auto p = s->caps.begin();
4691 std::set<Dentry *> to_trim; /* this avoids caps other than the one we're
4692 * looking at from getting deleted during traversal. */
7c673cae
FG
4693 while ((caps_size - trimmed) > max && !p.end()) {
4694 Cap *cap = *p;
11fdf7f2 4695 InodeRef in(&cap->inode);
7c673cae
FG
4696
4697 // Increment p early because it will be invalidated if cap
4698 // is deleted inside remove_cap
4699 ++p;
4700
4701 if (in->caps.size() > 1 && cap != in->auth_cap) {
4702 int mine = cap->issued | cap->implemented;
4703 int oissued = in->auth_cap ? in->auth_cap->issued : 0;
4704 // disposable non-auth cap
b32b8144 4705 if (!(get_caps_used(in.get()) & ~oissued & mine)) {
7c673cae 4706 ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
28e407b8 4707 cap = (remove_cap(cap, true), nullptr);
7c673cae
FG
4708 trimmed++;
4709 }
4710 } else {
4711 ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
91327a77 4712 _trim_negative_child_dentries(in);
7c673cae 4713 bool all = true;
11fdf7f2
TL
4714 auto q = in->dentries.begin();
4715 while (q != in->dentries.end()) {
4716 Dentry *dn = *q;
4717 ++q;
7c673cae
FG
4718 if (dn->lru_is_expireable()) {
4719 if (can_invalidate_dentries &&
b3b6e05e 4720 dn->dir->parent_inode->ino == CEPH_INO_ROOT) {
7c673cae
FG
4721 // Only issue one of these per DN for inodes in root: handle
4722 // others more efficiently by calling for root-child DNs at
4723 // the end of this function.
4724 _schedule_invalidate_dentry_callback(dn, true);
4725 }
28e407b8
AA
4726 ldout(cct, 20) << " queueing dentry for trimming: " << dn->name << dendl;
4727 to_trim.insert(dn);
7c673cae
FG
4728 } else {
4729 ldout(cct, 20) << " not expirable: " << dn->name << dendl;
4730 all = false;
4731 }
4732 }
b3b6e05e 4733 if (in->ll_ref == 1 && in->ino != CEPH_INO_ROOT) {
f91f0fd5
TL
4734 _schedule_ino_release_callback(in.get());
4735 }
b3b6e05e 4736 if (all && in->ino != CEPH_INO_ROOT) {
7c673cae
FG
4737 ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
4738 trimmed++;
4739 }
4740 }
4741 }
28e407b8
AA
4742 ldout(cct, 20) << " trimming queued dentries: " << dendl;
4743 for (const auto &dn : to_trim) {
4744 trim_dentry(dn);
4745 }
4746 to_trim.clear();
7c673cae 4747
b32b8144 4748 caps_size = s->caps.size();
11fdf7f2 4749 if (caps_size > (size_t)max)
7c673cae
FG
4750 _invalidate_kernel_dcache();
4751}
4752
// Mark session @s read-only and wake any waiters that wanted write caps on
// its inodes, so they can observe the read-only state and fail/retry.
void Client::force_session_readonly(MetaSession *s)
{
  s->readonly = true;
  for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
    auto &in = (*p)->inode;
    if (in.caps_wanted() & CEPH_CAP_FILE_WR)
      signal_cond_list(in.waitfor_caps);
  }
}
4762
7c673cae
FG
// Move @in's dirty caps into the "flushing" state and allocate a flush tid.
// Records the tid in both the inode's flushing_cap_tids and the auth
// session's flushing_caps_tids so acks can be matched later.
// @param ptid out: the newly allocated flush tid.
// @return the cap bits now being flushed (must be non-zero on entry).
int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
{
  MetaSession *session = in->auth_cap->session;

  int flushing = in->dirty_caps;
  ceph_assert(flushing);

  ceph_tid_t flush_tid = ++last_flush_tid;
  in->flushing_cap_tids[flush_tid] = flushing;

  if (!in->flushing_caps) {
    ldout(cct, 10) << __func__ << " " << ccap_string(flushing) << " " << *in << dendl;
    num_flushing_caps++;
  } else {
    ldout(cct, 10) << __func__ << " (more) " << ccap_string(flushing) << " " << *in << dendl;
  }

  // dirty -> flushing transition
  in->flushing_caps |= flushing;
  in->mark_caps_clean();

  if (!in->flushing_cap_item.is_on_list())
    session->flushing_caps.push_back(&in->flushing_cap_item);
  session->flushing_caps_tids.insert(flush_tid);

  *ptid = flush_tid;
  return flushing;
}
4790
4791void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4792{
4793 for (auto &p : in->cap_snaps) {
4794 CapSnap &capsnap = p.second;
4795 if (capsnap.flush_tid > 0) {
4796 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4797 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4798 }
4799 }
4800 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4801 it != in->flushing_cap_tids.end();
4802 ++it) {
4803 old_s->flushing_caps_tids.erase(it->first);
4804 new_s->flushing_caps_tids.insert(it->first);
4805 }
4806 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4807}
4808
/*
 * Flush all the dirty caps back to the MDS. Because the callers
 * generally wait on the result of this function (syncfs and umount
 * cases), we set CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
 */
void Client::flush_caps_sync()
{
  ldout(cct, 10) << __func__ << dendl;
  for (auto &q : mds_sessions) {
    auto s = q.second;
    xlist<Inode*>::iterator p = s->dirty_list.begin();
    while (!p.end()) {
      unsigned flags = CHECK_CAPS_NODELAY;
      Inode *in = *p;

      // advance first: check_caps may unlink in from the dirty list
      ++p;
      // only the final inode per session is flushed synchronously
      if (p.end())
	flags |= CHECK_CAPS_SYNCHRONOUS;
      check_caps(in, flags);
    }
  }
}
4831
7c673cae
FG
// Block until all of @in's cap flushes up to and including tid @want have
// been acked by the MDS.  flushing_cap_tids is ordered, so we only need to
// watch its oldest entry; waiters are woken via in->waitfor_caps.
void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
{
  while (in->flushing_caps) {
    map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
    ceph_assert(it != in->flushing_cap_tids.end());
    if (it->first > want)
      break;  // everything <= want already acked
    ldout(cct, 10) << __func__ << " on " << *in << " flushing "
                   << ccap_string(it->second) << " want " << want
                   << " last " << it->first << dendl;
    wait_on_list(in->waitfor_caps);
  }
}
4845
// Block until every session's outstanding cap flushes up to tid @want have
// been acked.  Re-scans all sessions from the top after each wakeup (goto
// retry) because the session map may have changed while we slept.
void Client::wait_sync_caps(ceph_tid_t want)
{
 retry:
  ldout(cct, 10) << __func__ << " want " << want << " (last is " << last_flush_tid << ", "
	   << num_flushing_caps << " total flushing)" << dendl;
  for (auto &p : mds_sessions) {
    auto s = p.second;
    if (s->flushing_caps_tids.empty())
	continue;
    ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
    if (oldest_tid <= want) {
      ldout(cct, 10) << " waiting on mds." << p.first << " tid " << oldest_tid
		     << " (want " << want << ")" << dendl;
      // adopt the already-held client_lock for the condvar wait, then
      // release() so the unique_lock doesn't unlock it on destruction
      std::unique_lock l{client_lock, std::adopt_lock};
      sync_cond.wait(l);
      l.release();
      goto retry;
    }
  }
}
4866
eafe8130
TL
// Re-send all pending cap flushes (and capsnap flushes) for @in on @session,
// e.g. after reconnect.  Entries in flushing_cap_tids with a zero value are
// capsnap flushes; the newest of those (last_snap_flush) marks the point
// before which regular flushes must carry FLAG_PENDING_CAPSNAP so the MDS
// processes them in the right order relative to snap flushes.
void Client::kick_flushing_caps(Inode *in, MetaSession *session)
{
  in->flags &= ~I_KICK_FLUSH;

  Cap *cap = in->auth_cap;
  ceph_assert(cap->session == session);

  // find the most recent capsnap flush tid (value == 0 entries)
  ceph_tid_t last_snap_flush = 0;
  for (auto p = in->flushing_cap_tids.rbegin();
       p != in->flushing_cap_tids.rend();
       ++p) {
    if (!p->second) {
      last_snap_flush = p->first;
      break;
    }
  }

  int wanted = in->caps_wanted();
  int used = get_caps_used(in) | in->caps_dirty();
  auto it = in->cap_snaps.begin();
  for (auto& p : in->flushing_cap_tids) {
    if (p.second) {
      // regular cap flush
      int msg_flags = p.first < last_snap_flush ? MClientCaps::FLAG_PENDING_CAPSNAP : 0;
      send_cap(in, session, cap, msg_flags, used, wanted, (cap->issued | cap->implemented),
	       p.second, p.first);
    } else {
      // capsnap flush; cap_snaps and the zero-valued tids are kept in step
      ceph_assert(it != in->cap_snaps.end());
      ceph_assert(it->second.flush_tid == p.first);
      send_flush_snap(in, session, it->first, it->second);
      ++it;
    }
  }
}
4900
7c673cae
FG
// Re-send pending cap flushes for every inode on @session that was flagged
// with I_KICK_FLUSH (set by early_kick_flushing_caps or cap import logic).
void Client::kick_flushing_caps(MetaSession *session)
{
  mds_rank_t mds = session->mds_num;
  ldout(cct, 10) << __func__ << " mds." << mds << dendl;

  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    if (in->flags & I_KICK_FLUSH) {
      ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
      kick_flushing_caps(in, session);
    }
  }
}
4914
// During MDS reconnect, decide per inode whether pending cap flushes must be
// re-sent *before* the reconnect message (when flushing caps were revoked)
// or can be deferred until after (I_KICK_FLUSH, handled by
// kick_flushing_caps(MetaSession*)).
void Client::early_kick_flushing_caps(MetaSession *session)
{
  for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
    Inode *in = *p;
    Cap *cap = in->auth_cap;
    ceph_assert(cap);

    // if flushing caps were revoked, we re-send the cap flush in client reconnect
    // stage. This guarantees that MDS processes the cap flush message before issuing
    // the flushing caps to other client.
    if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) {
      // nothing revoked: defer the reflush to the post-reconnect kick
      in->flags |= I_KICK_FLUSH;
      continue;
    }

    ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
		   << " to mds." << session->mds_num << dendl;
    // send_reconnect() also will reset these sequence numbers. make sure
    // sequence numbers in cap flush message match later reconnect message.
    cap->seq = 0;
    cap->issue_seq = 0;
    cap->mseq = 0;
    cap->issued = cap->implemented;

    kick_flushing_caps(in, session);
  }
}
4942
7c673cae
FG
4943void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4944{
4945 list<SnapRealm*> q;
4946 q.push_back(realm);
4947
4948 while (!q.empty()) {
4949 realm = q.front();
4950 q.pop_front();
4951
11fdf7f2 4952 ldout(cct, 10) << __func__ << " " << *realm << dendl;
7c673cae
FG
4953 realm->invalidate_cache();
4954
4955 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4956 p != realm->pchildren.end();
4957 ++p)
4958 q.push_back(*p);
4959 }
4960}
4961
// Look up (creating on demand) the SnapRealm for ino @r and take a ref.
// Note: operator[] default-inserts a NULL pointer on miss, which the !realm
// branch then replaces with a fresh SnapRealm.
SnapRealm *Client::get_snap_realm(inodeno_t r)
{
  SnapRealm *realm = snap_realms[r];

  ldout(cct, 20) << __func__ << " " << r << " " << realm << ", nref was "
                 << (realm ? realm->nref : 0) << dendl;
  if (!realm) {
    snap_realms[r] = realm = new SnapRealm(r);

    // Do not release the global snaprealm until unmounting.
    if (r == CEPH_INO_GLOBAL_SNAPREALM)
      realm->nref++;
  }

  // caller's reference
  realm->nref++;
  ldout(cct, 20) << __func__ << " " << r << " " << realm << ", nref now is "
                 << realm->nref << dendl;
  return realm;
}
4981
4982SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4983{
4984 if (snap_realms.count(r) == 0) {
11fdf7f2 4985 ldout(cct, 20) << __func__ << " " << r << " fail" << dendl;
7c673cae
FG
4986 return NULL;
4987 }
4988 SnapRealm *realm = snap_realms[r];
11fdf7f2 4989 ldout(cct, 20) << __func__ << " " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
7c673cae
FG
4990 realm->nref++;
4991 return realm;
4992}
4993
// Drop one reference on @realm.  When the count hits zero the realm is
// removed from the map, detached from its parent (recursively dropping the
// parent ref, which may cascade), and deleted.
void Client::put_snap_realm(SnapRealm *realm)
{
  ldout(cct, 20) << __func__ << " " << realm->ino << " " << realm
		 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
  if (--realm->nref == 0) {
    snap_realms.erase(realm->ino);
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);  // may recurse up the hierarchy
    }
    delete realm;
  }
}
5007
// Reparent @realm under the realm of ino @parent if it changed.
// Swaps the pparent reference (releasing the old, acquiring the new) and
// updates the parent's child set.
// @return true if the parent changed (caller should invalidate snap caches).
bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
{
  if (realm->parent != parent) {
    ldout(cct, 10) << __func__ << " " << *realm
	     << " " << realm->parent << " -> " << parent << dendl;
    realm->parent = parent;
    if (realm->pparent) {
      realm->pparent->pchildren.erase(realm);
      put_snap_realm(realm->pparent);
    }
    realm->pparent = get_snap_realm(parent);
    realm->pparent->pchildren.insert(realm);
    return true;
  }
  return false;
}
5024
// Return true if @new_snapc contains a snapshot newer than @old_snapc's seq.
// snaps[] is ordered newest-first, so checking the first element suffices.
static bool has_new_snaps(const SnapContext& old_snapc,
			  const SnapContext& new_snapc)
{
  return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
}
5030
1e59de90
TL
// Extra realm metadata carried only by the newer SnapRealmInfoNew encoding
// (last-modified time and change attribute); absent on legacy MDSes.
struct SnapRealmInfoMeta {
  SnapRealmInfoMeta(utime_t last_modified, uint64_t change_attr)
    : last_modified(last_modified),
      change_attr(change_attr) {
  }

  utime_t last_modified;
  uint64_t change_attr;
};
5040
// Decode one realm record from @p, choosing the encoding by the session's
// feature bits: new-format records additionally carry last_modified and
// change_attr (returned as the optional second element).
static std::pair<SnapRealmInfo, std::optional<SnapRealmInfoMeta>> get_snap_realm_info(
    MetaSession *session, bufferlist::const_iterator &p) {
  if (session->mds_features.test(CEPHFS_FEATURE_NEW_SNAPREALM_INFO)) {
    SnapRealmInfoNew ninfo;
    decode(ninfo, p);
    return std::make_pair(ninfo.info, SnapRealmInfoMeta(ninfo.last_modified, ninfo.change_attr));
  } else {
    SnapRealmInfo info;
    decode(info, p);
    return std::make_pair(info, std::nullopt);
  }
}
5053
5054
5055void Client::update_snap_trace(MetaSession *session, const bufferlist& bl, SnapRealm **realm_ret, bool flush)
7c673cae
FG
5056{
5057 SnapRealm *first_realm = NULL;
11fdf7f2 5058 ldout(cct, 10) << __func__ << " len " << bl.length() << dendl;
7c673cae
FG
5059
5060 map<SnapRealm*, SnapContext> dirty_realms;
5061
11fdf7f2 5062 auto p = bl.cbegin();
7c673cae 5063 while (!p.end()) {
1e59de90 5064 auto [info, realm_info_meta] = get_snap_realm_info(session, p);
7c673cae
FG
5065 SnapRealm *realm = get_snap_realm(info.ino());
5066
5067 bool invalidate = false;
5068
1e59de90
TL
5069 if (info.seq() > realm->seq ||
5070 (realm_info_meta && (*realm_info_meta).change_attr > realm->change_attr)) {
11fdf7f2 5071 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq() << " > " << realm->seq
1e59de90 5072 << dendl;
7c673cae
FG
5073
5074 if (flush) {
5075 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
5076 // flush me + children
5077 list<SnapRealm*> q;
5078 q.push_back(realm);
5079 while (!q.empty()) {
5080 SnapRealm *realm = q.front();
5081 q.pop_front();
5082
5083 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
5084 p != realm->pchildren.end();
5085 ++p)
5086 q.push_back(*p);
5087
5088 if (dirty_realms.count(realm) == 0) {
5089 realm->nref++;
5090 dirty_realms[realm] = realm->get_snap_context();
5091 }
5092 }
5093 }
5094
5095 // update
5096 realm->seq = info.seq();
5097 realm->created = info.created();
5098 realm->parent_since = info.parent_since();
5099 realm->prior_parent_snaps = info.prior_parent_snaps;
1e59de90
TL
5100 if (realm_info_meta) {
5101 realm->last_modified = (*realm_info_meta).last_modified;
5102 realm->change_attr = (*realm_info_meta).change_attr;
5103 }
7c673cae
FG
5104 realm->my_snaps = info.my_snaps;
5105 invalidate = true;
5106 }
5107
5108 // _always_ verify parent
5109 if (adjust_realm_parent(realm, info.parent()))
5110 invalidate = true;
5111
5112 if (invalidate) {
5113 invalidate_snaprealm_and_children(realm);
11fdf7f2 5114 ldout(cct, 15) << __func__ << " " << *realm << " self|parent updated" << dendl;
7c673cae
FG
5115 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
5116 } else {
11fdf7f2 5117 ldout(cct, 10) << __func__ << " " << *realm << " seq " << info.seq()
7c673cae
FG
5118 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
5119 }
f67539c2 5120
7c673cae
FG
5121 if (!first_realm)
5122 first_realm = realm;
5123 else
5124 put_snap_realm(realm);
5125 }
5126
f67539c2 5127 for (auto &[realm, snapc] : dirty_realms) {
7c673cae 5128 // if there are new snaps ?
f67539c2 5129 if (has_new_snaps(snapc, realm->get_snap_context())) {
7c673cae 5130 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
f67539c2
TL
5131 for (auto&& in : realm->inodes_with_caps) {
5132 queue_cap_snap(in, snapc);
7c673cae
FG
5133 }
5134 } else {
5135 ldout(cct, 10) << " no new snap on " << *realm << dendl;
5136 }
5137 put_snap_realm(realm);
5138 }
5139
5140 if (realm_ret)
5141 *realm_ret = first_realm;
5142 else
5143 put_snap_realm(first_realm);
5144}
5145
11fdf7f2 5146void Client::handle_snap(const MConstRef<MClientSnap>& m)
7c673cae 5147{
11fdf7f2 5148 ldout(cct, 10) << __func__ << " " << *m << dendl;
7c673cae 5149 mds_rank_t mds = mds_rank_t(m->get_source().num());
f67539c2
TL
5150
5151 std::scoped_lock cl(client_lock);
20effc67 5152 auto session = _get_mds_session(mds, m->get_connection().get());
7c673cae 5153 if (!session) {
7c673cae
FG
5154 return;
5155 }
5156
20effc67 5157 got_mds_push(session.get());
7c673cae
FG
5158
5159 map<Inode*, SnapContext> to_move;
5160 SnapRealm *realm = 0;
5161
5162 if (m->head.op == CEPH_SNAP_OP_SPLIT) {
11fdf7f2 5163 ceph_assert(m->head.split);
11fdf7f2 5164 auto p = m->bl.cbegin();
1e59de90 5165 auto [info, _] = get_snap_realm_info(session.get(), p);
11fdf7f2 5166 ceph_assert(info.ino() == m->head.split);
7c673cae
FG
5167
5168 // flush, then move, ino's.
5169 realm = get_snap_realm(info.ino());
5170 ldout(cct, 10) << " splitting off " << *realm << dendl;
11fdf7f2
TL
5171 for (auto& ino : m->split_inos) {
5172 vinodeno_t vino(ino, CEPH_NOSNAP);
7c673cae
FG
5173 if (inode_map.count(vino)) {
5174 Inode *in = inode_map[vino];
5175 if (!in->snaprealm || in->snaprealm == realm)
5176 continue;
5177 if (in->snaprealm->created > info.created()) {
5178 ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
5179 << *in->snaprealm << dendl;
5180 continue;
5181 }
5182 ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;
5183
5184
5185 in->snaprealm_item.remove_myself();
5186 to_move[in] = in->snaprealm->get_snap_context();
5187 put_snap_realm(in->snaprealm);
5188 }
5189 }
5190
5191 // move child snaprealms, too
11fdf7f2
TL
5192 for (auto& child_realm : m->split_realms) {
5193 ldout(cct, 10) << "adjusting snaprealm " << child_realm << " parent" << dendl;
5194 SnapRealm *child = get_snap_realm_maybe(child_realm);
7c673cae
FG
5195 if (!child)
5196 continue;
5197 adjust_realm_parent(child, realm->ino);
5198 put_snap_realm(child);
5199 }
5200 }
5201
1e59de90 5202 update_snap_trace(session.get(), m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);
7c673cae
FG
5203
5204 if (realm) {
5205 for (auto p = to_move.begin(); p != to_move.end(); ++p) {
5206 Inode *in = p->first;
5207 in->snaprealm = realm;
5208 realm->inodes_with_caps.push_back(&in->snaprealm_item);
5209 realm->nref++;
5210 // queue for snap writeback
5211 if (has_new_snaps(p->second, realm->get_snap_context()))
5212 queue_cap_snap(in, p->second);
5213 }
5214 put_snap_realm(realm);
5215 }
7c673cae
FG
5216}
5217
11fdf7f2 5218void Client::handle_quota(const MConstRef<MClientQuota>& m)
7c673cae
FG
5219{
5220 mds_rank_t mds = mds_rank_t(m->get_source().num());
f67539c2
TL
5221
5222 std::scoped_lock cl(client_lock);
20effc67 5223 auto session = _get_mds_session(mds, m->get_connection().get());
7c673cae 5224 if (!session) {
7c673cae
FG
5225 return;
5226 }
5227
20effc67 5228 got_mds_push(session.get());
7c673cae 5229
11fdf7f2 5230 ldout(cct, 10) << __func__ << " " << *m << " from mds." << mds << dendl;
7c673cae
FG
5231
5232 vinodeno_t vino(m->ino, CEPH_NOSNAP);
5233 if (inode_map.count(vino)) {
5234 Inode *in = NULL;
5235 in = inode_map[vino];
5236
5237 if (in) {
5238 in->quota = m->quota;
5239 in->rstat = m->rstat;
5240 }
5241 }
7c673cae
FG
5242}
5243
11fdf7f2 5244void Client::handle_caps(const MConstRef<MClientCaps>& m)
7c673cae
FG
5245{
5246 mds_rank_t mds = mds_rank_t(m->get_source().num());
f67539c2
TL
5247
5248 std::scoped_lock cl(client_lock);
20effc67 5249 auto session = _get_mds_session(mds, m->get_connection().get());
7c673cae 5250 if (!session) {
7c673cae
FG
5251 return;
5252 }
5253
5254 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
5255 // Pause RADOS operations until we see the required epoch
5256 objecter->set_epoch_barrier(m->osd_epoch_barrier);
5257 }
5258
5259 if (m->osd_epoch_barrier > cap_epoch_barrier) {
5260 // Record the barrier so that we will transmit it to MDS when releasing
5261 set_cap_epoch_barrier(m->osd_epoch_barrier);
5262 }
5263
20effc67 5264 got_mds_push(session.get());
7c673cae 5265
11fdf7f2 5266 Inode *in;
7c673cae 5267 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
11fdf7f2
TL
5268 if (auto it = inode_map.find(vino); it != inode_map.end()) {
5269 in = it->second;
5270 } else {
7c673cae 5271 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
11fdf7f2 5272 ldout(cct, 5) << __func__ << " don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
7c673cae
FG
5273 session->enqueue_cap_release(
5274 m->get_ino(),
5275 m->get_cap_id(),
5276 m->get_seq(),
5277 m->get_mseq(),
5278 cap_epoch_barrier);
5279 } else {
11fdf7f2 5280 ldout(cct, 5) << __func__ << " don't have vino " << vino << ", dropping" << dendl;
7c673cae 5281 }
7c673cae
FG
5282
5283 // in case the mds is waiting on e.g. a revocation
5284 flush_cap_releases();
5285 return;
5286 }
5287
5288 switch (m->get_op()) {
20effc67
TL
5289 case CEPH_CAP_OP_EXPORT: return handle_cap_export(session.get(), in, m);
5290 case CEPH_CAP_OP_FLUSHSNAP_ACK: return handle_cap_flushsnap_ack(session.get(), in, m);
5291 case CEPH_CAP_OP_IMPORT: /* no return */ handle_cap_import(session.get(), in, m);
7c673cae
FG
5292 }
5293
11fdf7f2
TL
5294 if (auto it = in->caps.find(mds); it != in->caps.end()) {
5295 Cap &cap = in->caps.at(mds);
7c673cae 5296
11fdf7f2 5297 switch (m->get_op()) {
20effc67 5298 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session.get(), in, m);
11fdf7f2
TL
5299 case CEPH_CAP_OP_IMPORT:
5300 case CEPH_CAP_OP_REVOKE:
20effc67
TL
5301 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session.get(), in, &cap, m);
5302 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session.get(), in, &cap, m);
11fdf7f2
TL
5303 }
5304 } else {
5305 ldout(cct, 5) << __func__ << " don't have " << *in << " cap on mds." << mds << dendl;
5306 return;
7c673cae
FG
5307 }
5308}
5309
11fdf7f2 5310void Client::handle_cap_import(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae
FG
5311{
5312 mds_rank_t mds = session->mds_num;
5313
11fdf7f2 5314 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
7c673cae
FG
5315 << " IMPORT from mds." << mds << dendl;
5316
5317 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
5318 Cap *cap = NULL;
5319 UserPerm cap_perms;
11fdf7f2
TL
5320 if (auto it = in->caps.find(peer_mds); m->peer.cap_id && it != in->caps.end()) {
5321 cap = &it->second;
5322 cap_perms = cap->latest_perms;
7c673cae
FG
5323 }
5324
5325 // add/update it
5326 SnapRealm *realm = NULL;
1e59de90 5327 update_snap_trace(session, m->snapbl, &realm);
7c673cae 5328
1911f103
TL
5329 int issued = m->get_caps();
5330 int wanted = m->get_wanted();
7c673cae 5331 add_update_cap(in, session, m->get_cap_id(),
1911f103 5332 issued, wanted, m->get_seq(), m->get_mseq(),
a8e16298 5333 m->get_realm(), CEPH_CAP_FLAG_AUTH, cap_perms);
7c673cae
FG
5334
5335 if (cap && cap->cap_id == m->peer.cap_id) {
5336 remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
5337 }
5338
5339 if (realm)
5340 put_snap_realm(realm);
5341
eafe8130 5342 if (in->auth_cap && in->auth_cap->session == session) {
1911f103
TL
5343 if (!(wanted & CEPH_CAP_ANY_FILE_WR) ||
5344 in->requested_max_size > m->get_max_size()) {
5345 in->requested_max_size = 0;
5346 ldout(cct, 15) << "reset requested_max_size after cap import" << dendl;
5347 }
7c673cae 5348 // reflush any/all caps (if we are now the auth_cap)
eafe8130 5349 kick_flushing_caps(in, session);
7c673cae
FG
5350 }
5351}
5352
11fdf7f2 5353void Client::handle_cap_export(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae
FG
5354{
5355 mds_rank_t mds = session->mds_num;
5356
11fdf7f2 5357 ldout(cct, 5) << __func__ << " ino " << m->get_ino() << " mseq " << m->get_mseq()
7c673cae
FG
5358 << " EXPORT from mds." << mds << dendl;
5359
11fdf7f2
TL
5360 auto it = in->caps.find(mds);
5361 if (it != in->caps.end()) {
5362 Cap &cap = it->second;
5363 if (cap.cap_id == m->get_cap_id()) {
5364 if (m->peer.cap_id) {
5365 const auto peer_mds = mds_rank_t(m->peer.mds);
20effc67 5366 auto tsession = _get_or_open_mds_session(peer_mds);
11fdf7f2
TL
5367 auto it = in->caps.find(peer_mds);
5368 if (it != in->caps.end()) {
5369 Cap &tcap = it->second;
5370 if (tcap.cap_id == m->peer.cap_id &&
5371 ceph_seq_cmp(tcap.seq, m->peer.seq) < 0) {
5372 tcap.cap_id = m->peer.cap_id;
5373 tcap.seq = m->peer.seq - 1;
5374 tcap.issue_seq = tcap.seq;
5375 tcap.issued |= cap.issued;
5376 tcap.implemented |= cap.issued;
5377 if (&cap == in->auth_cap)
5378 in->auth_cap = &tcap;
5379 if (in->auth_cap == &tcap && in->flushing_cap_item.is_on_list())
20effc67 5380 adjust_session_flushing_caps(in, session, tsession.get());
11fdf7f2
TL
5381 }
5382 } else {
20effc67 5383 add_update_cap(in, tsession.get(), m->peer.cap_id, cap.issued, 0,
11fdf7f2
TL
5384 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
5385 &cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
5386 cap.latest_perms);
5387 }
7c673cae 5388 } else {
11fdf7f2
TL
5389 if (cap.wanted | cap.issued)
5390 in->flags |= I_CAP_DROPPED;
7c673cae 5391 }
7c673cae 5392
11fdf7f2
TL
5393 remove_cap(&cap, false);
5394 }
7c673cae 5395 }
7c673cae
FG
5396}
5397
11fdf7f2 5398void Client::handle_cap_trunc(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae
FG
5399{
5400 mds_rank_t mds = session->mds_num;
11fdf7f2 5401 ceph_assert(in->caps.count(mds));
7c673cae 5402
1e59de90
TL
5403 uint64_t size = m->get_size();
5404 if (in->is_fscrypt_enabled()) {
5405 size = std::stoll(std::string(std::rbegin(m->fscrypt_file),
5406 std::rend(m->fscrypt_file)));
5407 }
11fdf7f2 5408 ldout(cct, 10) << __func__ << " on ino " << *in
7c673cae
FG
5409 << " size " << in->size << " -> " << m->get_size()
5410 << dendl;
1e59de90 5411
1adf2230
AA
5412 int issued;
5413 in->caps_issued(&issued);
5414 issued |= in->caps_dirty();
1e59de90
TL
5415 update_inode_file_size(in, issued, size, m->get_truncate_seq(),
5416 m->get_truncate_size());
7c673cae
FG
5417}
5418
11fdf7f2 5419void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
7c673cae
FG
5420{
5421 ceph_tid_t flush_ack_tid = m->get_client_tid();
5422 int dirty = m->get_dirty();
5423 int cleaned = 0;
5424 int flushed = 0;
5425
11fdf7f2
TL
5426 auto it = in->flushing_cap_tids.begin();
5427 if (it->first < flush_ack_tid) {
5428 ldout(cct, 0) << __func__ << " mds." << session->mds_num
5429 << " got unexpected flush ack tid " << flush_ack_tid
5430 << " expected is " << it->first << dendl;
5431 }
5432 for (; it != in->flushing_cap_tids.end(); ) {
eafe8130
TL
5433 if (!it->second) {
5434 // cap snap
5435 ++it;
5436 continue;
5437 }
7c673cae
FG
5438 if (it->first == flush_ack_tid)
5439 cleaned = it->second;
5440 if (it->first <= flush_ack_tid) {
5441 session->flushing_caps_tids.erase(it->first);
5442 in->flushing_cap_tids.erase(it++);
5443 ++flushed;
5444 continue;
5445 }
5446 cleaned &= ~it->second;
5447 if (!cleaned)
5448 break;
5449 ++it;
5450 }
5451
11fdf7f2 5452 ldout(cct, 5) << __func__ << " mds." << session->mds_num
7c673cae
FG
5453 << " cleaned " << ccap_string(cleaned) << " on " << *in
5454 << " with " << ccap_string(dirty) << dendl;
5455
5456 if (flushed) {
5457 signal_cond_list(in->waitfor_caps);
5458 if (session->flushing_caps_tids.empty() ||
5459 *session->flushing_caps_tids.begin() > flush_ack_tid)
9f95a23c 5460 sync_cond.notify_all();
7c673cae
FG
5461 }
5462
5463 if (!dirty) {
5464 in->cap_dirtier_uid = -1;
5465 in->cap_dirtier_gid = -1;
5466 }
5467
5468 if (!cleaned) {
5469 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
5470 } else {
5471 if (in->flushing_caps) {
5472 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
5473 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
5474 in->flushing_caps &= ~cleaned;
5475 if (in->flushing_caps == 0) {
5476 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
5477 num_flushing_caps--;
eafe8130 5478 if (in->flushing_cap_tids.empty())
7c673cae
FG
5479 in->flushing_cap_item.remove_myself();
5480 }
5481 if (!in->caps_dirty())
5482 put_inode(in);
5483 }
5484 }
7c673cae
FG
5485}
5486
5487
11fdf7f2 5488void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, const MConstRef<MClientCaps>& m)
7c673cae 5489{
eafe8130 5490 ceph_tid_t flush_ack_tid = m->get_client_tid();
7c673cae 5491 mds_rank_t mds = session->mds_num;
11fdf7f2 5492 ceph_assert(in->caps.count(mds));
7c673cae
FG
5493 snapid_t follows = m->get_snap_follows();
5494
11fdf7f2
TL
5495 if (auto it = in->cap_snaps.find(follows); it != in->cap_snaps.end()) {
5496 auto& capsnap = it->second;
eafe8130
TL
5497 if (flush_ack_tid != capsnap.flush_tid) {
5498 ldout(cct, 10) << " tid " << flush_ack_tid << " != " << capsnap.flush_tid << dendl;
7c673cae 5499 } else {
eafe8130 5500 InodeRef tmp_ref(in);
11fdf7f2 5501 ldout(cct, 5) << __func__ << " mds." << mds << " flushed snap follows " << follows
7c673cae 5502 << " on " << *in << dendl;
7c673cae 5503 session->flushing_caps_tids.erase(capsnap.flush_tid);
eafe8130
TL
5504 in->flushing_cap_tids.erase(capsnap.flush_tid);
5505 if (in->flushing_caps == 0 && in->flushing_cap_tids.empty())
5506 in->flushing_cap_item.remove_myself();
11fdf7f2 5507 in->cap_snaps.erase(it);
eafe8130
TL
5508
5509 signal_cond_list(in->waitfor_caps);
5510 if (session->flushing_caps_tids.empty() ||
5511 *session->flushing_caps_tids.begin() > flush_ack_tid)
9f95a23c 5512 sync_cond.notify_all();
7c673cae
FG
5513 }
5514 } else {
11fdf7f2 5515 ldout(cct, 5) << __func__ << " DUP(?) mds." << mds << " flushed snap follows " << follows
7c673cae
FG
5516 << " on " << *in << dendl;
5517 // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back)
5518 }
7c673cae
FG
5519}
5520
5521class C_Client_DentryInvalidate : public Context {
5522private:
5523 Client *client;
5524 vinodeno_t dirino;
5525 vinodeno_t ino;
5526 string name;
5527public:
5528 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
5529 client(c), name(dn->name) {
5530 if (client->use_faked_inos()) {
5531 dirino.ino = dn->dir->parent_inode->faked_ino;
5532 if (del)
5533 ino.ino = dn->inode->faked_ino;
5534 } else {
5535 dirino = dn->dir->parent_inode->vino();
5536 if (del)
5537 ino = dn->inode->vino();
5538 }
5539 if (!del)
5540 ino.ino = inodeno_t();
5541 }
5542 void finish(int r) override {
5543 // _async_dentry_invalidate is responsible for its own locking
9f95a23c 5544 ceph_assert(ceph_mutex_is_not_locked_by_me(client->client_lock));
7c673cae
FG
5545 client->_async_dentry_invalidate(dirino, ino, name);
5546 }
5547};
5548
// Invoke the registered dentry-invalidate callback for dentry @name of
// directory @dirino (with @ino set when the target inode should be dropped).
// Runs on the finisher thread without client_lock; bails out if the client
// is no longer at least MOUNTING.
void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  ldout(cct, 10) << __func__ << " '" << name << "' ino " << ino
		 << " in dir " << dirino << dendl;
  dentry_invalidate_cb(callback_handle, dirino, ino, name.c_str(), name.length());
}
5559
// Queue an asynchronous dentry-invalidate notification for @dn (dropping its
// inode too when @del), but only if a callback is registered and the inode is
// still referenced by the kernel/application (ll_ref > 0).
void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
{
  if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
    async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
}
5565
// Try to drop every cached reference that pins 'in' so it can be trimmed.
// 'ref' tracks the remaining nref as we peel references away; once only the
// caller's reference would remain there is nothing left to unlink.
// If sched_inval is set, kernel/FUSE dcache invalidations are scheduled for
// dentries pointing at this inode.
void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
{
  int ref = in->get_nref();
  ldout(cct, 5) << __func__ << " in " << *in <<dendl;

  if (in->dir && !in->dir->dentries.empty()) {
    // Drop expireable child dentries of this directory.
    for (auto p = in->dir->dentries.begin();
	 p != in->dir->dentries.end(); ) {
      Dentry *dn = p->second;
      ++p;  // advance before unlink() may erase the current entry
      /* rmsnap removes whole subtree, need trim inodes recursively.
       * we don't need to invalidate dentries recursively. because
       * invalidating a directory dentry effectively invalidate
       * whole subtree */
      if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
	_try_to_trim_inode(dn->inode.get(), false);

      if (dn->lru_is_expireable())
	unlink(dn, true, false);  // keep dir, drop dentry
    }
    if (in->dir->dentries.empty()) {
      close_dir(in->dir);
      --ref;  // closing the dir released one reference on 'in'
    }
  }

  // An open ".snap" dir pins the inode; trim it too.
  if (ref > 1 && (in->flags & I_SNAPDIR_OPEN)) {
    InodeRef snapdir = open_snapdir(in);
    _try_to_trim_inode(snapdir.get(), false);
    --ref;
  }

  if (ref > 1) {
    // Still pinned: unlink the dentries that point at this inode.
    auto q = in->dentries.begin();
    while (q != in->dentries.end()) {
      Dentry *dn = *q;
      ++q;  // advance before unlink() invalidates the iterator
      if( in->ll_ref > 0 && sched_inval) {
	// FIXME: we play lots of unlink/link tricks when handling MDS replies,
	// so in->dentries doesn't always reflect the state of kernel's dcache.
	_schedule_invalidate_dentry_callback(dn, true);
      }
      unlink(dn, true, true);
    }
  }
}
5612
11fdf7f2 5613void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, const MConstRef<MClientCaps>& m)
7c673cae
FG
5614{
5615 mds_rank_t mds = session->mds_num;
5616 int used = get_caps_used(in);
5617 int wanted = in->caps_wanted();
a4b75251 5618 int flags = 0;
7c673cae 5619
a8e16298
TL
5620 const unsigned new_caps = m->get_caps();
5621 const bool was_stale = session->cap_gen > cap->gen;
11fdf7f2 5622 ldout(cct, 5) << __func__ << " on in " << m->get_ino()
7c673cae
FG
5623 << " mds." << mds << " seq " << m->get_seq()
5624 << " caps now " << ccap_string(new_caps)
a8e16298 5625 << " was " << ccap_string(cap->issued)
92f5a8d4 5626 << (was_stale ? " (stale)" : "") << dendl;
a8e16298
TL
5627
5628 if (was_stale)
5629 cap->issued = cap->implemented = CEPH_CAP_PIN;
7c673cae 5630 cap->seq = m->get_seq();
28e407b8 5631 cap->gen = session->cap_gen;
7c673cae 5632
11fdf7f2 5633 check_cap_issue(in, new_caps);
a8e16298 5634
7c673cae 5635 // update inode
1adf2230
AA
5636 int issued;
5637 in->caps_issued(&issued);
5638 issued |= in->caps_dirty();
7c673cae 5639
1adf2230
AA
5640 if ((new_caps & CEPH_CAP_AUTH_SHARED) &&
5641 !(issued & CEPH_CAP_AUTH_EXCL)) {
7c673cae
FG
5642 in->mode = m->head.mode;
5643 in->uid = m->head.uid;
5644 in->gid = m->head.gid;
5645 in->btime = m->btime;
5646 }
5647 bool deleted_inode = false;
1adf2230
AA
5648 if ((new_caps & CEPH_CAP_LINK_SHARED) &&
5649 !(issued & CEPH_CAP_LINK_EXCL)) {
7c673cae 5650 in->nlink = m->head.nlink;
20effc67 5651 if (in->nlink == 0)
7c673cae
FG
5652 deleted_inode = true;
5653 }
1adf2230 5654 if (!(issued & CEPH_CAP_XATTR_EXCL) &&
7c673cae
FG
5655 m->xattrbl.length() &&
5656 m->head.xattr_version > in->xattr_version) {
11fdf7f2
TL
5657 auto p = m->xattrbl.cbegin();
5658 decode(in->xattrs, p);
7c673cae
FG
5659 in->xattr_version = m->head.xattr_version;
5660 }
28e407b8
AA
5661
5662 if ((new_caps & CEPH_CAP_FILE_SHARED) && m->dirstat_is_valid()) {
5663 in->dirstat.nfiles = m->get_nfiles();
5664 in->dirstat.nsubdirs = m->get_nsubdirs();
5665 }
5666
1adf2230
AA
5667 if (new_caps & CEPH_CAP_ANY_RD) {
5668 update_inode_file_time(in, issued, m->get_time_warp_seq(),
5669 m->get_ctime(), m->get_mtime(), m->get_atime());
5670 }
5671
5672 if (new_caps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
5673 in->layout = m->get_layout();
5674 update_inode_file_size(in, issued, m->get_size(),
5675 m->get_truncate_seq(), m->get_truncate_size());
5676 }
5677
5678 if (m->inline_version > in->inline_version) {
5679 in->inline_data = m->inline_data;
5680 in->inline_version = m->inline_version;
5681 }
5682
5683 /* always take a newer change attr */
5684 if (m->get_change_attr() > in->change_attr)
5685 in->change_attr = m->get_change_attr();
7c673cae
FG
5686
5687 // max_size
5688 if (cap == in->auth_cap &&
1adf2230
AA
5689 (new_caps & CEPH_CAP_ANY_FILE_WR) &&
5690 (m->get_max_size() != in->max_size)) {
7c673cae
FG
5691 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5692 in->max_size = m->get_max_size();
5693 if (in->max_size > in->wanted_max_size) {
5694 in->wanted_max_size = 0;
5695 in->requested_max_size = 0;
5696 }
5697 }
5698
5699 bool check = false;
a8e16298
TL
5700 if ((was_stale || m->get_op() == CEPH_CAP_OP_IMPORT) &&
5701 (wanted & ~(cap->wanted | new_caps))) {
5702 // If mds is importing cap, prior cap messages that update 'wanted'
5703 // may get dropped by mds (migrate seq mismatch).
5704 //
5705 // We don't send cap message to update 'wanted' if what we want are
5706 // already issued. If mds revokes caps, cap message that releases caps
5707 // also tells mds what we want. But if caps got revoked by mds forcedly
5708 // (session stale). We may haven't told mds what we want.
7c673cae 5709 check = true;
a8e16298 5710 }
7c673cae 5711
7c673cae
FG
5712
5713 // update caps
a8e16298 5714 auto revoked = cap->issued & ~new_caps;
b32b8144
FG
5715 if (revoked) {
5716 ldout(cct, 10) << " revocation of " << ccap_string(revoked) << dendl;
7c673cae
FG
5717 cap->issued = new_caps;
5718 cap->implemented |= new_caps;
5719
b32b8144
FG
5720 // recall delegations if we're losing caps necessary for them
5721 if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_RD))
5722 in->recall_deleg(false);
5723 else if (revoked & ceph_deleg_caps_for_type(CEPH_DELEGATION_WR))
5724 in->recall_deleg(true);
5725
11fdf7f2
TL
5726 used = adjust_caps_used_for_lazyio(used, cap->issued, cap->implemented);
5727 if ((used & revoked & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO)) &&
28e407b8 5728 !_flush(in, new C_Client_FlushComplete(this, in))) {
7c673cae 5729 // waitin' for flush
11fdf7f2 5730 } else if (used & revoked & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) {
a4b75251
TL
5731 if (_release(in)) {
5732 check = true;
5733 flags = CHECK_CAPS_NODELAY;
5734 }
7c673cae
FG
5735 } else {
5736 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5737 check = true;
a4b75251 5738 flags = CHECK_CAPS_NODELAY;
7c673cae 5739 }
a8e16298
TL
5740 } else if (cap->issued == new_caps) {
5741 ldout(cct, 10) << " caps unchanged at " << ccap_string(cap->issued) << dendl;
7c673cae 5742 } else {
a8e16298 5743 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~cap->issued) << dendl;
7c673cae
FG
5744 cap->issued = new_caps;
5745 cap->implemented |= new_caps;
5746
5747 if (cap == in->auth_cap) {
5748 // non-auth MDS is revoking the newly grant caps ?
11fdf7f2
TL
5749 for (const auto &p : in->caps) {
5750 if (&p.second == cap)
7c673cae 5751 continue;
11fdf7f2 5752 if (p.second.implemented & ~p.second.issued & new_caps) {
7c673cae
FG
5753 check = true;
5754 break;
5755 }
5756 }
5757 }
5758 }
5759
5760 if (check)
a4b75251 5761 check_caps(in, flags);
7c673cae
FG
5762
5763 // wake up waiters
5764 if (new_caps)
5765 signal_cond_list(in->waitfor_caps);
5766
5767 // may drop inode's last ref
5768 if (deleted_inode)
5769 _try_to_trim_inode(in, true);
7c673cae
FG
5770}
5771
7c673cae
FG
// POSIX-style permission check for 'in' against 'perms' for the requested
// access bits ('want' is a MAY_* mask).  Returns 0 on success or a negative
// CEPHFS_* error.
int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
{
  if (perms.uid() == 0) {
    // For directories, DACs are overridable.
    // For files, Read/write DACs are always overridable but executable DACs are
    // overridable when there is at least one exec bit set
    if(!S_ISDIR(in->mode) && (want & MAY_EXEC) && !(in->mode & S_IXUGO))
      return -CEPHFS_EACCES;
    return 0;
  }

  // Non-owner with group bits present: consult POSIX ACLs first; a
  // -CEPHFS_EAGAIN result means "no ACL decision, fall through to mode bits".
  if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
    int ret = _posix_acl_permission(in, perms, want);
    if (ret != -CEPHFS_EAGAIN)
      return ret;
  }

  // check permissions before doing anything else
  if (!in->check_mode(perms, want))
    return -CEPHFS_EACCES;
  return 0;
}
5794
5795int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5796 const UserPerm& perms)
5797{
5798 int r = _getattr_for_perm(in, perms);
5799 if (r < 0)
5800 goto out;
5801
5802 r = 0;
5803 if (strncmp(name, "system.", 7) == 0) {
5804 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
f67539c2 5805 r = -CEPHFS_EPERM;
7c673cae
FG
5806 } else {
5807 r = inode_permission(in, perms, want);
5808 }
5809out:
1adf2230 5810 ldout(cct, 5) << __func__ << " " << in << " = " << r << dendl;
7c673cae
FG
5811 return r;
5812}
5813
20effc67 5814std::ostream& operator<<(std::ostream &out, const UserPerm& perm) {
7c673cae
FG
5815 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
5816 return out;
5817}
5818
// Permission check for a setattr request described by (stx, mask) on 'in'.
// Mirrors the kernel's setattr_prepare() rules: ownership changes require
// root or owner, mode changes may strip setgid, and timestamp changes need
// either ownership or write permission.  Returns 0 or -CEPHFS_*.
int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << " stx_mode: "
      << hex << stx->stx_mode << " mask:" << mask << dec << dendl;
  int r = _getattr_for_perm(in, perms);
  if (r < 0)
    goto out;

  if (mask & CEPH_SETATTR_SIZE) {
    // truncate/extend requires write access
    r = inode_permission(in, perms, MAY_WRITE);
    if (r < 0)
      goto out;
  }

  r = -CEPHFS_EPERM;
  if (mask & CEPH_SETATTR_UID) {
    // only root may chown; owner may "change" uid to itself
    if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
      goto out;
  }
  if (mask & CEPH_SETATTR_GID) {
    // owner may chgrp to any group it belongs to (or keep current gid)
    if (perms.uid() != 0 && (perms.uid() != in->uid ||
      	       	 	     (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
      goto out;
  }

  if (mask & CEPH_SETATTR_MODE) {
    uint32_t m = ~stx->stx_mode & in->mode; // mode bits removed
    ldout(cct, 20) << __func__ << " " << *in << " = " << hex << m << dec <<  dendl;
    if (perms.uid() != 0 && perms.uid() != in->uid &&
	/*
	 * Currently the kernel fuse and libfuse code is buggy and
	 * won't pass the ATTR_KILL_SUID/ATTR_KILL_SGID to ceph-fuse.
	 * But will just set the ATTR_MODE and at the same time by
	 * clearing the suid/sgid bits.
	 *
	 * Only allow unprivileged users to clear S_ISUID and S_ISGID.
	 */
	(m & ~(S_ISUID | S_ISGID)))
      goto out;

    // Non-root chmod by a user outside the file's group drops setgid,
    // matching POSIX behavior.
    gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
    if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
      stx->stx_mode &= ~S_ISGID;
  }

  if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
	      CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
    if (perms.uid() != 0 && perms.uid() != in->uid) {
      // Explicit (non-"now") timestamps need ownership; "set to now"
      // is allowed with mere write permission.
      int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
      if (!(mask & CEPH_SETATTR_MTIME_NOW))
	check_mask |= CEPH_SETATTR_MTIME;
      if (!(mask & CEPH_SETATTR_ATIME_NOW))
	check_mask |= CEPH_SETATTR_ATIME;
      if (check_mask & mask) {
	goto out;
      } else {
	r = inode_permission(in, perms, MAY_WRITE);
	if (r < 0)
	  goto out;
      }
    }
  }
  r = 0;
out:
  ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
  return r;
}
5887
5888int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5889{
181888fb 5890 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
5891 unsigned want = 0;
5892
5893 if ((flags & O_ACCMODE) == O_WRONLY)
5894 want = MAY_WRITE;
5895 else if ((flags & O_ACCMODE) == O_RDWR)
5896 want = MAY_READ | MAY_WRITE;
5897 else if ((flags & O_ACCMODE) == O_RDONLY)
5898 want = MAY_READ;
5899 if (flags & O_TRUNC)
5900 want |= MAY_WRITE;
5901
5902 int r = 0;
5903 switch (in->mode & S_IFMT) {
5904 case S_IFLNK:
f67539c2 5905 r = -CEPHFS_ELOOP;
7c673cae
FG
5906 goto out;
5907 case S_IFDIR:
5908 if (want & MAY_WRITE) {
f67539c2 5909 r = -CEPHFS_EISDIR;
7c673cae
FG
5910 goto out;
5911 }
5912 break;
5913 }
5914
5915 r = _getattr_for_perm(in, perms);
5916 if (r < 0)
5917 goto out;
5918
5919 r = inode_permission(in, perms, want);
5920out:
5921 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5922 return r;
5923}
5924
5925int Client::may_lookup(Inode *dir, const UserPerm& perms)
5926{
181888fb 5927 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5928 int r = _getattr_for_perm(dir, perms);
5929 if (r < 0)
5930 goto out;
5931
5932 r = inode_permission(dir, perms, MAY_EXEC);
5933out:
5934 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5935 return r;
5936}
5937
5938int Client::may_create(Inode *dir, const UserPerm& perms)
5939{
181888fb 5940 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
7c673cae
FG
5941 int r = _getattr_for_perm(dir, perms);
5942 if (r < 0)
5943 goto out;
5944
5945 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5946out:
5947 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5948 return r;
5949}
5950
5951int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5952{
181888fb 5953 ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl;
7c673cae
FG
5954 int r = _getattr_for_perm(dir, perms);
5955 if (r < 0)
5956 goto out;
5957
5958 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5959 if (r < 0)
5960 goto out;
5961
f67539c2 5962 /* 'name == NULL' means rmsnap w/o permission checks */
7c673cae
FG
5963 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5964 InodeRef otherin;
5965 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5966 if (r < 0)
5967 goto out;
5968 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
f67539c2 5969 r = -CEPHFS_EPERM;
7c673cae
FG
5970 }
5971out:
5972 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5973 return r;
5974}
5975
f67539c2
TL
5976int Client::may_delete(const char *relpath, const UserPerm& perms) {
5977 ldout(cct, 20) << __func__ << " " << relpath << "; " << perms << dendl;
5978
5979 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
5980 if (!mref_reader.is_state_satisfied())
1e59de90 5981 return -CEPHFS_ENOTCONN;
f67539c2
TL
5982
5983 filepath path(relpath);
5984 string name = path.last_dentry();
5985 path.pop_dentry();
5986 InodeRef dir;
5987
5988 std::scoped_lock lock(client_lock);
5989 int r = path_walk(path, &dir, perms);
5990 if (r < 0)
5991 return r;
5992 if (cct->_conf->client_permissions) {
5993 int r = may_delete(dir.get(), name.c_str(), perms);
5994 if (r < 0)
5995 return r;
5996 }
5997
5998 return 0;
5999}
6000
7c673cae
FG
6001int Client::may_hardlink(Inode *in, const UserPerm& perms)
6002{
181888fb 6003 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
7c673cae
FG
6004 int r = _getattr_for_perm(in, perms);
6005 if (r < 0)
6006 goto out;
6007
6008 if (perms.uid() == 0 || perms.uid() == in->uid) {
6009 r = 0;
6010 goto out;
6011 }
6012
f67539c2 6013 r = -CEPHFS_EPERM;
7c673cae
FG
6014 if (!S_ISREG(in->mode))
6015 goto out;
6016
6017 if (in->mode & S_ISUID)
6018 goto out;
6019
6020 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
6021 goto out;
6022
6023 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
6024out:
6025 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
6026 return r;
6027}
6028
6029int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
6030{
6031 int mask = CEPH_STAT_CAP_MODE;
6032 bool force = false;
6033 if (acl_type != NO_ACL) {
6034 mask |= CEPH_STAT_CAP_XATTR;
6035 force = in->xattr_version == 0;
6036 }
6037 return _getattr(in, mask, perms, force);
6038}
6039
6040vinodeno_t Client::_get_vino(Inode *in)
6041{
6042 /* The caller must hold the client lock */
6043 return vinodeno_t(in->ino, in->snapid);
6044}
6045
7c673cae
FG
/**
 * Resolve an MDS spec to a list of MDS daemon GIDs.
 *
 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
 * It may be '*' in which case it matches all GIDs.
 *
 * If no error is returned, the `targets` vector will be populated with at least
 * one MDS.
 */
int Client::resolve_mds(
    const std::string &mds_spec,
    std::vector<mds_gid_t> *targets)
{
  ceph_assert(fsmap);
  ceph_assert(targets != nullptr);

  // 1) try to parse as a role (rank or fs:rank)
  mds_role_t role;
  CachedStackStringStream css;
  int role_r = fsmap->parse_role(mds_spec, &role, *css);
  if (role_r == 0) {
    // We got a role, resolve it to a GID
    auto& info = fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank);
    ldout(cct, 10) << __func__ << ": resolved " << mds_spec << " to role '"
      << role << "' aka " << info.human_name() << dendl;
    targets->push_back(info.global_id);
    return 0;
  }

  // 2) try to parse as a bare integer GID
  std::string strtol_err;
  long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
  if (strtol_err.empty()) {
    // It is a possible GID
    const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
    if (fsmap->gid_exists(mds_gid)) {
      auto& info = fsmap->get_info_gid(mds_gid);
      ldout(cct, 10) << __func__ << ": validated gid " << mds_gid << " aka "
		     << info.human_name() << dendl;
      targets->push_back(mds_gid);
      return 0;
    } else {
      lderr(cct) << __func__ << ": gid " << mds_gid << " not in MDS map"
		 << dendl;
      lderr(cct) << "FSMap: " << *fsmap << dendl;
      return -CEPHFS_ENOENT;
    }
  } else if (mds_spec == "*") {
    // It is a wildcard: use all MDSs
    const auto& mds_info = fsmap->get_mds_info();

    ldout(cct, 10) << __func__ << ": resolving `*' to all MDS daemons" << dendl;
    if (mds_info.empty()) {
      lderr(cct) << __func__ << ": no MDS daemons found" << dendl;
      lderr(cct) << "FSMap: " << *fsmap << dendl;
      return -CEPHFS_ENOENT;
    }

    for (const auto& [gid, info] : mds_info) {
      ldout(cct, 10) << __func__ << ": appending " << info.human_name() << " to targets" << dendl;
      targets->push_back(gid);
    }
    return 0;
  } else {
    // It did not parse as an integer, it is not a wildcard, it must be a name
    const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
    if (mds_gid == mds_gid_t{0}) {
      lderr(cct) << __func__ << ": no MDS daemons found by name `" << mds_spec << "'" << dendl;
      lderr(cct) << "FSMap: " << *fsmap << dendl;
      return -CEPHFS_ENOENT;
    } else {
      auto& info = fsmap->get_info_gid(mds_gid);
      ldout(cct, 10) << __func__ << ": resolved name '" << mds_spec
		     << "' to " << info.human_name() << dendl;
      targets->push_back(mds_gid);
    }
    return 0;
  }
}
6123
6124
6125/**
6126 * Authenticate with mon and establish global ID
6127 */
6128int Client::authenticate()
6129{
9f95a23c 6130 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae
FG
6131
6132 if (monclient->is_authenticated()) {
6133 return 0;
6134 }
6135
9f95a23c 6136 client_lock.unlock();
2a845540 6137 int r = monclient->authenticate(std::chrono::duration<double>(mount_timeout).count());
9f95a23c 6138 client_lock.lock();
7c673cae
FG
6139 if (r < 0) {
6140 return r;
6141 }
6142
6143 whoami = monclient->get_global_id();
6144 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
6145
6146 return 0;
6147}
6148
// Fetch the latest FSMap (or the trimmed FSMapUser when 'user' is true)
// from the monitors and wait until our subscription has delivered at least
// that epoch.  Called with client_lock held; the lock is dropped around the
// blocking monitor version query.
int Client::fetch_fsmap(bool user)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  // Retrieve FSMap to enable looking up daemon addresses.  We need FSMap
  // rather than MDSMap because no one MDSMap contains all the daemons, and
  // a `tell` can address any daemon.
  version_t fsmap_latest;
  bs::error_code ec;
  do {
    client_lock.unlock();
    std::tie(fsmap_latest, std::ignore) =
      monclient->get_version("fsmap", ca::use_blocked[ec]);
    client_lock.lock();
  } while (ec == bs::errc::resource_unavailable_try_again);  // EAGAIN: retry

  if (ec) {
    lderr(cct) << "Failed to learn FSMap version: " << ec << dendl;
    return ceph::from_error_code(ec);
  }

  ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;

  if (user) {
    // Subscribe (one-shot) and block until handle_fsmap_user catches up.
    if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap_user);
    ceph_assert(fsmap_user->get_epoch() >= fsmap_latest);
  } else {
    if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
      monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
      monclient->renew_subs();
      wait_on_list(waiting_for_fsmap);
    }
    ceph_assert(fsmap);
    ceph_assert(fsmap->get_epoch() >= fsmap_latest);
  }
  ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
		 << fsmap_latest << dendl;
  return 0;
}
6193
/**
 * Send an administrative command to one or more MDS daemons.
 *
 * @mds_spec one of ID, rank, GID, "*"
 *
 * 'onfinish' fires once every targeted daemon has replied (replies are
 * gathered via C_GatherBuilder).  Laggy daemons are skipped; if all targets
 * are laggy the call fails with -CEPHFS_ENOENT.
 */
int Client::mds_command(
    const std::string &mds_spec,
    const vector<string>& cmd,
    const bufferlist& inbl,
    bufferlist *outbl,
    string *outs,
    Context *onfinish)
{
  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::unique_lock cl(client_lock);

  int r;
  r = authenticate();
  if (r < 0) {
    return r;
  }

  // Need an up-to-date FSMap to resolve specs and addresses.
  r = fetch_fsmap(false);
  if (r < 0) {
    return r;
  }

  // Look up MDS target(s) of the command
  std::vector<mds_gid_t> targets;
  r = resolve_mds(mds_spec, &targets);
  if (r < 0) {
    return r;
  }

  // If daemons are laggy, we won't send them commands.  If all
  // are laggy then we fail.
  std::vector<mds_gid_t> non_laggy;
  for (const auto& gid : targets) {
    const auto info = fsmap->get_info_gid(gid);
    if (!info.laggy()) {
      non_laggy.push_back(gid);
    }
  }
  if (non_laggy.size() == 0) {
    *outs = "All targeted MDS daemons are laggy";
    return -CEPHFS_ENOENT;
  }

  if (metadata.empty()) {
    // We are called on an unmounted client, so metadata
    // won't be initialized yet.
    populate_metadata("");
  }

  // Send commands to targets
  C_GatherBuilder gather(cct, onfinish);
  for (const auto& target_gid : non_laggy) {
    const auto info = fsmap->get_info_gid(target_gid);

    // Open a connection to the target MDS
    ConnectionRef conn = messenger->connect_to_mds(info.get_addrs());

    // Swap client_lock for command_lock while touching the command table;
    // the two locks are never held together here.
    cl.unlock();
    {
      std::scoped_lock cmd_lock(command_lock);
      // Generate MDSCommandOp state
      auto &op = command_table.start_command();

      op.on_finish = gather.new_sub();
      op.cmd = cmd;
      op.outbl = outbl;
      op.outs = outs;
      op.inbl = inbl;
      op.mds_gid = target_gid;
      op.con = conn;

      ldout(cct, 4) << __func__ << ": new command op to " << target_gid
		    << " tid=" << op.tid << cmd << dendl;

      // Construct and send MCommand
      MessageRef m = op.get_message(monclient->get_fsid());
      conn->send_message2(std::move(m));
    }
    cl.lock();
  }
  gather.activate();

  return 0;
}
6286
11fdf7f2 6287void Client::handle_command_reply(const MConstRef<MCommandReply>& m)
7c673cae
FG
6288{
6289 ceph_tid_t const tid = m->get_tid();
6290
6291 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
6292
f67539c2 6293 std::scoped_lock cmd_lock(command_lock);
7c673cae
FG
6294 if (!command_table.exists(tid)) {
6295 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
7c673cae
FG
6296 return;
6297 }
6298
6299 auto &op = command_table.get_command(tid);
6300 if (op.outbl) {
11fdf7f2 6301 *op.outbl = m->get_data();
7c673cae
FG
6302 }
6303 if (op.outs) {
6304 *op.outs = m->rs;
6305 }
6306
6307 if (op.on_finish) {
6308 op.on_finish->complete(m->r);
6309 }
6310
6311 command_table.erase(tid);
7c673cae
FG
6312}
6313
// -------------------
// MOUNT

// Authenticate and subscribe to the MDSMap for the requested filesystem.
// An empty fs_name falls back to the client_fs conf option (or the legacy
// client_mds_namespace); an empty result means "the default filesystem".
// Sets 'fscid' when a specific filesystem was named.
int Client::subscribe_mdsmap(const std::string &fs_name)
{
  int r = authenticate();
  if (r < 0) {
    lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  std::string resolved_fs_name;
  if (fs_name.empty()) {
    resolved_fs_name = cct->_conf.get_val<std::string>("client_fs");
    if (resolved_fs_name.empty())
      // Try the backwards compatibility fs name option
      resolved_fs_name = cct->_conf.get_val<std::string>("client_mds_namespace");
  } else {
    resolved_fs_name = fs_name;
  }

  std::string want = "mdsmap";
  if (!resolved_fs_name.empty()) {
    // Need the user FSMap to translate the fs name into a cluster id.
    r = fetch_fsmap(true);
    if (r < 0)
      return r;
    fscid = fsmap_user->get_fs_cid(resolved_fs_name);
    if (fscid == FS_CLUSTER_ID_NONE) {
      return -CEPHFS_ENOENT;
    }

    // Subscribe to the per-filesystem map, e.g. "mdsmap.<fscid>".
    std::ostringstream oss;
    oss << want << "." << fscid;
    want = oss.str();
  }
  ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;

  monclient->sub_want(want, 0, 0);
  monclient->renew_subs();

  return 0;
}
6356
// Mount the filesystem: subscribe to the MDSMap, optionally wait for an
// available MDS cluster, then walk up from mount_root to the root doing
// GETATTRs so quota/ancestor info is populated.  Idempotent: a concurrent
// or completed mount returns 0 immediately.
int Client::mount(const std::string &mount_root, const UserPerm& perms,
		  bool require_mds, const std::string &fs_name)
{
  ceph_assert(is_initialized());

  /*
   * To make sure that the _unmount() must wait until the mount()
   * is done.
   */
  RWRef_t mref_writer(mount_state, CLIENT_MOUNTING, false);
  if (!mref_writer.is_first_writer()) // already mounting or mounted
    return 0;

  std::unique_lock cl(client_lock);

  int r = subscribe_mdsmap(fs_name);
  if (r < 0) {
    lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
    return r;
  }

  start_tick_thread(); // start tick thread

  if (require_mds) {
    // Block until the MDS cluster is usable (or definitively not).
    while (1) {
      auto availability = mdsmap->is_cluster_available();
      if (availability == MDSMap::STUCK_UNAVAILABLE) {
	// Error out
	ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
	return CEPH_FUSE_NO_MDS_UP;
      } else if (availability == MDSMap::AVAILABLE) {
	// Continue to mount
	break;
      } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
	// Else, wait.  MDSMonitor will update the map to bring
	// us to a conclusion eventually.
	wait_on_list(waiting_for_mdsmap);
      } else {
	// Unexpected value!
	ceph_abort();
      }
    }
  }

  if(mdsmap->test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION)) {
    lderr(cct) << "connections cannot be made while"
                  " the flag refuse_client_session is set" << dendl;
    return -CEPHFS_EACCES;
  }

  populate_metadata(mount_root.empty() ? "/" : mount_root);

  // GETATTR the mount point, then each ancestor up to the root, so the
  // client learns the full path's inodes (needed e.g. for quota checks).
  filepath fp(CEPH_INO_ROOT);
  if (!mount_root.empty()) {
    fp = filepath(mount_root.c_str());
  }
  while (true) {
    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
    req->set_filepath(fp);
    req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
    int res = make_request(req, perms);
    if (res < 0) {
      if (res == -CEPHFS_EACCES && root) {
	// We already have the mount point itself; lacking access to an
	// ancestor is tolerable (quotas above us may not be enforced).
	ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
	break;
      }
      return res;
    }

    if (fp.depth())
      fp.pop_dentry();
    else
      break;
  }

  ceph_assert(root);
  _ll_get(root.get());

  // trace?
  if (!cct->_conf->client_trace.empty()) {
    traceout.open(cct->_conf->client_trace.c_str());
    if (traceout.is_open()) {
      ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
    } else {
      ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
    }
  }

  /*
  ldout(cct, 3) << "op: // client trace data structs" << dendl;
  ldout(cct, 3) << "op: struct stat st;" << dendl;
  ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
  ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
  ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
  ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
  ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
  ldout(cct, 3) << "op: int fd;" << dendl;
   */

  mref_writer.update_state(CLIENT_MOUNTED);
  return 0;
}
6459
// UNMOUNT

// Close every MDS session, waiting (bounded by client_shutdown_timeout)
// for the MDSs to acknowledge.  Sessions already rejected are simply
// dropped; sessions that never respond are force-closed on timeout.
void Client::_close_sessions()
{
  // Discard sessions the MDS already rejected — nothing to close there.
  for (auto it = mds_sessions.begin(); it != mds_sessions.end(); ) {
    if (it->second->state == MetaSession::STATE_REJECTED)
      mds_sessions.erase(it++);
    else
      ++it;
  }

  while (!mds_sessions.empty()) {
    // send session closes!
    for (auto &p : mds_sessions) {
      if (p.second->state != MetaSession::STATE_CLOSING) {
	_close_mds_session(p.second.get());
	mds_ranks_closing.insert(p.first);
      }
    }

    // wait for sessions to close
    double timo = cct->_conf.get_val<std::chrono::seconds>("client_shutdown_timeout").count();
    ldout(cct, 2) << "waiting for " << mds_ranks_closing.size() << " mds session(s) to close (timeout: "
                  << timo << "s)" << dendl;
    // Adopt the already-held client_lock into a unique_lock for the
    // condvar wait, then release ownership again without unlocking.
    std::unique_lock l{client_lock, std::adopt_lock};
    if (!timo) {
      // timeout of 0 means wait forever
      mount_cond.wait(l);
    } else if (!mount_cond.wait_for(l, ceph::make_timespan(timo), [this] { return mds_ranks_closing.empty(); })) {
      ldout(cct, 1) << mds_ranks_closing.size() << " mds(s) did not respond to session close -- timing out." << dendl;
      while (!mds_ranks_closing.empty()) {
	auto session = mds_sessions.at(*mds_ranks_closing.begin());
	// this prunes entry from mds_sessions and mds_ranks_closing
	_closed_mds_session(session.get(), -CEPHFS_ETIMEDOUT);
      }
    }

    mds_ranks_closing.clear();
    l.release();
  }
}
6500
522d829b
TL
6501void Client::flush_mdlog_sync(Inode *in)
6502{
6503 if (in->unsafe_ops.empty()) {
6504 return;
6505 }
6506
6507 std::set<mds_rank_t> anchor;
6508 for (auto &&p : in->unsafe_ops) {
6509 anchor.emplace(p->mds);
6510 }
6511 if (in->auth_cap) {
6512 anchor.emplace(in->auth_cap->session->mds_num);
6513 }
6514
6515 for (auto &rank : anchor) {
6516 auto session = &mds_sessions.at(rank);
20effc67 6517 flush_mdlog(session->get());
522d829b
TL
6518 }
6519}
6520
31f18b77
FG
6521void Client::flush_mdlog_sync()
6522{
522d829b 6523 if (mds_requests.empty())
31f18b77 6524 return;
11fdf7f2 6525 for (auto &p : mds_sessions) {
20effc67 6526 flush_mdlog(p.second.get());
31f18b77
FG
6527 }
6528}
6529
/**
 * Send a CEPH_SESSION_REQUEST_FLUSH_MDLOG message to one MDS,
 * asking it to flush its journal to disk.
 *
 * @param session the MDS session to message.
 */
void Client::flush_mdlog(MetaSession *session)
{
  // Only send this to Luminous or newer MDS daemons, older daemons
  // will crash if they see an unknown CEPH_SESSION_* value in this msg.
  const uint64_t features = session->con->get_features();
  if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
    auto m = make_message<MClientSession>(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
    session->con->send_message2(std::move(m));
  }
}
6540
6541
11fdf7f2
TL
/**
 * Abort all outstanding MDS requests with the given error and
 * force-close every MDS session.  Used when the mount is being
 * torn down abnormally (abort/blocklist).
 *
 * @param err negative errno propagated to aborted requests and
 *            to _closed_mds_session().
 */
void Client::_abort_mds_sessions(int err)
{
  for (auto p = mds_requests.begin(); p != mds_requests.end(); ) {
    auto req = p->second;
    // Advance before any request teardown so the iterator stays valid.
    ++p;
    // unsafe requests will be removed during close session below.
    if (req->got_unsafe)
      continue;

    req->abort(err);
    if (req->caller_cond) {
      req->kick = true;
      req->caller_cond->notify_all();
    }
  }

  // Process aborts on any requests that were on this waitlist.
  // Any requests that were on a waiting_for_open session waitlist
  // will get kicked during close session below.
  signal_cond_list(waiting_for_mdsmap);

  // Force-close all sessions.  _closed_mds_session() erases the entry
  // from mds_sessions, so re-fetch begin() every iteration.
  while(!mds_sessions.empty()) {
    auto session = mds_sessions.begin()->second;
    _closed_mds_session(session.get(), err);
  }
}
6569
/**
 * Tear down the mounted client.
 *
 * @param abort when true, abort outstanding MDS requests and cancel
 *              in-flight OSD writes instead of flushing; otherwise
 *              flush journals/caps and wait for everything to drain.
 *
 * Ordering here matters: state transition -> drain readers -> drain
 * write requests -> close files/dirs -> flush or purge cached data ->
 * drain the inode cache -> stop the tick thread -> close sessions.
 */
void Client::_unmount(bool abort)
{
  /*
   * We are unmounting the client.
   *
   * Just declare the state to STATE_UNMOUNTING to block and fail
   * any new comming "reader" and then try to wait all the in-flight
   * "readers" to finish.
   */
  RWRef_t mref_writer(mount_state, CLIENT_UNMOUNTING, false);
  if (!mref_writer.is_first_writer())
    return;
  mref_writer.wait_readers_done();

  std::unique_lock lock{client_lock};

  if (abort || blocklisted) {
    ldout(cct, 2) << "unmounting (" << (abort ? "abort)" : "blocklisted)") << dendl;
  } else {
    ldout(cct, 2) << "unmounting" << dendl;
  }

  deleg_timeout = 0;

  if (abort) {
    mount_aborted = true;
    // Abort all mds sessions
    _abort_mds_sessions(-CEPHFS_ENOTCONN);

    objecter->op_cancel_writes(-CEPHFS_ENOTCONN);
  } else {
    // flush the mdlog for pending requests, if any
    flush_mdlog_sync();
  }

  // Wait until no write requests remain in flight; reads are allowed
  // to finish on their own.
  mount_cond.wait(lock, [this] {
    // Only wait for write OPs
    for (auto& [tid, req] : mds_requests) {
      if (req->is_write()) {
        ldout(cct, 10) << "waiting for write request '" << tid
                       << "' to complete, currently there are "
                       << mds_requests.size()
                       << " outstanding read/write requests"
                       << dendl;
        return false;
      }
    }
    return true;
  });

  cwd.reset();
  root.reset();

  // clean up any unclosed files
  while (!fd_map.empty()) {
    Fh *fh = fd_map.begin()->second;
    fd_map.erase(fd_map.begin());
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
    _release_fh(fh);
  }

  // ... and any low-level (ll_*) handles the application leaked.
  while (!ll_unclosed_fh_set.empty()) {
    set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
    Fh *fh = *it;
    ll_unclosed_fh_set.erase(fh);
    ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
    _release_fh(fh);
  }

  while (!opened_dirs.empty()) {
    dir_result_t *dirp = *opened_dirs.begin();
    ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
    _closedir(dirp);
  }

  _ll_drop_pins();

  if (cct->_conf->client_oc) {
    // flush/release all buffered data
    std::list<InodeRef> anchor;
    for (auto& p : inode_map) {
      Inode *in = p.second;
      if (!in) {
        ldout(cct, 0) << "null inode_map entry ino " << p.first << dendl;
        ceph_assert(in);
      }

      // prevent inode from getting freed
      anchor.emplace_back(in);

      if (abort || blocklisted) {
        // No point flushing -- the cluster won't accept it.  Drop it.
        objectcacher->purge_set(&in->oset);
      } else if (!in->caps.empty()) {
        _release(in);
        _flush(in, new C_Client_FlushComplete(this, in));
      }
    }
  }

  if (abort || blocklisted) {
    // Dirty caps can never be written back now; discard them so the
    // inodes below can actually be released.
    for (auto &q : mds_sessions) {
      auto s = q.second;
      for (auto p = s->dirty_list.begin(); !p.end(); ) {
        Inode *in = *p;
        ++p;
        if (in->dirty_caps) {
          ldout(cct, 0) << " drop dirty caps on " << *in << dendl;
          in->mark_caps_clean();
          put_inode(in);
        }
      }
    }
  } else {
    flush_caps_sync();
    wait_sync_caps(last_flush_tid);
  }

  // empty lru cache
  trim_cache();

  delay_put_inodes();

  // Wait (logging every 5s) until all cached inodes are gone.
  while (lru.lru_get_size() > 0 ||
         !inode_map.empty()) {
    ldout(cct, 2) << "cache still has " << lru.lru_get_size()
                  << "+" << inode_map.size() << " items"
                  << ", waiting (for caps to release?)"
                  << dendl;

    if (auto r = mount_cond.wait_for(lock, ceph::make_timespan(5));
        r == std::cv_status::timeout) {
      dump_cache(NULL);
    }
  }
  ceph_assert(lru.lru_get_size() == 0);
  ceph_assert(inode_map.empty());

  // stop tracing
  if (!cct->_conf->client_trace.empty()) {
    ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
    traceout.close();
  }

  // stop the tick thread
  tick_thread_stopped = true;
  upkeep_cond.notify_one();

  _close_sessions();

  // release the global snapshot realm
  SnapRealm *global_realm = snap_realms[CEPH_INO_GLOBAL_SNAPREALM];
  if (global_realm) {
    ceph_assert(global_realm->nref == 1);
    put_snap_realm(global_realm);
  }

  mref_writer.update_state(CLIENT_UNMOUNTED);

  /*
   * Stop the remount_queue before clearing the mountpoint memory
   * to avoid possible use-after-free bug.
   */
  if (remount_cb) {
    ldout(cct, 10) << "unmount stopping remount finisher" << dendl;
    remount_finisher.wait_for_empty();
    remount_finisher.stop();
    remount_cb = nullptr;
  }

  ldout(cct, 2) << "unmounted." << dendl;
}
6741
b32b8144
FG
/// Gracefully unmount: flush journals/caps and drain all state.
void Client::unmount()
{
  _unmount(false);
}
6746
/// Abortive unmount: abandon in-flight requests and cached dirty state
/// instead of flushing (used when the cluster is unreachable).
void Client::abort_conn()
{
  _unmount(true);
}
6751
7c673cae
FG
/**
 * Send each session's batched cap-release message to its MDS (when
 * that MDS is in a state that can process it) and decrement the
 * pinned-caps counter by the number of caps released.
 */
void Client::flush_cap_releases()
{
  uint64_t nr_caps = 0;

  // send any cap releases
  for (auto &p : mds_sessions) {
    auto session = p.second;
    if (session->release && mdsmap->is_clientreplay_or_active_or_stopping(
          p.first)) {
      nr_caps += session->release->caps.size();
      if (cct->_conf->client_inject_release_failure) {
        // Test hook: count the caps but deliberately drop the message.
        ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
      } else {
        session->con->send_message2(std::move(session->release));
      }
      // Reset even on injected failure so the batch is not re-sent.
      session->release.reset();
    }
  }

  if (nr_caps > 0) {
    dec_pinned_icaps(nr_caps);
  }
}
6775
/**
 * Periodic cap upkeep: renew caps with all MDSes if more than a third
 * of the session timeout has elapsed since the last renewal, then push
 * out any batched cap releases.  Skipped entirely when the mount was
 * aborted or no MDS map has been received yet.
 *
 * Caller must hold client_lock.
 */
void Client::renew_and_flush_cap_releases()
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (!mount_aborted && mdsmap->get_epoch()) {
    // renew caps?
    auto el = ceph::coarse_mono_clock::now() - last_cap_renew;
    if (unlikely(utime_t(el) > mdsmap->get_session_timeout() / 3.0))
      renew_caps();

    flush_cap_releases();
  }
}
6789
/**
 * One iteration of periodic client upkeep, run from the tick thread:
 *  - time out the oldest request if mount() is taking too long,
 *  - renew caps and flush cap releases,
 *  - re-check inodes whose cap check was delayed,
 *  - collect/send metrics, trim caches,
 *  - auto-reconnect after a blocklist if configured.
 */
void Client::tick()
{
  ldout(cct, 20) << "tick" << dendl;

  auto now = ceph::coarse_mono_clock::now();

  /*
   * If the mount() is not finished
   */
  if (is_mounting() && !mds_requests.empty()) {
    // Requests are tid-ordered, so begin() is the oldest one.
    MetaRequest *req = mds_requests.begin()->second;

    if (req->created + mount_timeout < now) {
      req->abort(-CEPHFS_ETIMEDOUT);
      if (req->caller_cond) {
        req->kick = true;
        req->caller_cond->notify_all();
      }
      signal_cond_list(waiting_for_mdsmap);
      for (auto &p : mds_sessions) {
        signal_context_list(p.second->waiting_for_open);
      }
    }
  }

  renew_and_flush_cap_releases();

  // delayed caps
  xlist<Inode*>::iterator p = delayed_list.begin();
  while (!p.end()) {
    Inode *in = *p;
    ++p;
    // List is hold-time ordered; stop at the first not-yet-due entry
    // (unless aborting, in which case just drain the list).
    if (!mount_aborted && in->hold_caps_until > now)
      break;
    delayed_list.pop_front();
    if (!mount_aborted)
      check_caps(in, CHECK_CAPS_NODELAY);
  }

  if (!mount_aborted)
    collect_and_send_metrics();

  delay_put_inodes(is_unmounting());
  trim_cache(true);

  // After 30 minutes blocklisted, optionally reset the messenger and
  // try to re-establish stale sessions.
  if (blocklisted && (is_mounted() || is_unmounting()) &&
      last_auto_reconnect + std::chrono::seconds(30 * 60) < now &&
      cct->_conf.get_val<bool>("client_reconnect_stale")) {
    messenger->client_reset();
    fd_gen++; // invalidate open files
    blocklisted = false;
    _kick_stale_sessions();
    last_auto_reconnect = now;
  }
}
6845
f67539c2
TL
/**
 * Launch the "upkeep" thread that calls tick() roughly every
 * client_tick_interval seconds until tick_thread_stopped is set
 * (the thread is woken early via upkeep_cond on shutdown).
 */
void Client::start_tick_thread()
{
  upkeeper = std::thread([this]() {
    using time = ceph::coarse_mono_time;
    using sec = std::chrono::seconds;

    auto last_tick = time::min();

    std::unique_lock cl(client_lock);
    while (!tick_thread_stopped) {
      auto now = clock::now();
      auto since = now - last_tick;

      // client_debug_inject_tick_delay lets tests artificially stretch
      // the interval; use whichever is larger.
      auto t_interval = clock::duration(cct->_conf.get_val<sec>("client_tick_interval"));
      auto d_interval = clock::duration(cct->_conf.get_val<sec>("client_debug_inject_tick_delay"));

      auto interval = std::max(t_interval, d_interval);
      // Allow 10% slack so a slightly-early wakeup still ticks.
      if (likely(since >= interval*.90)) {
        tick();
        last_tick = clock::now();
      } else {
        // Woke up early: only sleep the remaining time.
        interval -= since;
      }

      ldout(cct, 20) << "upkeep thread waiting interval " << interval << dendl;
      if (!tick_thread_stopped)
        // wait_for releases client_lock while sleeping; tick() runs
        // with it held.
        upkeep_cond.wait_for(cl, interval);
    }
  });
}
6876
/**
 * Entry point for periodic metrics collection.  Caller must hold
 * client_lock.
 */
void Client::collect_and_send_metrics() {
  ldout(cct, 20) << __func__ << dendl;

  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  // right now, we only track and send global metrics. its sufficient
  // to send these metrics to MDS rank0.
  collect_and_send_global_metrics();
}
6886
/**
 * Build and send the client's global metrics (latencies, cap/dentry
 * hit rates, open files/inodes, io sizes) to MDS rank 0 in a single
 * MClientMetrics message.
 *
 * Each metric is included when either the client is configured to
 * always send (_collect_and_send_global_metrics) or rank 0 advertised
 * interest in it via mds_metric_flags.  Silently returns when there
 * is no rank-0 session or it lacks CEPHFS_FEATURE_METRIC_COLLECT.
 *
 * Caller must hold client_lock.
 */
void Client::collect_and_send_global_metrics() {
  ldout(cct, 20) << __func__ << dendl;
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (!have_open_session((mds_rank_t)0)) {
    ldout(cct, 5) << __func__ << ": no session with rank=0 -- not sending metric"
                  << dendl;
    return;
  }
  // Session is known open (checked above), so this only fetches it.
  auto session = _get_or_open_mds_session((mds_rank_t)0);
  if (!session->mds_features.test(CEPHFS_FEATURE_METRIC_COLLECT)) {
    ldout(cct, 5) << __func__ << ": rank=0 does not support metrics" << dendl;
    return;
  }

  ClientMetricMessage metric;
  std::vector<ClientMetricMessage> message;

  // read latency
  if (_collect_and_send_global_metrics ||
      session->mds_metric_flags.test(CLIENT_METRIC_TYPE_READ_LATENCY)) {
    metric = ClientMetricMessage(ReadLatencyPayload(logger->tget(l_c_read),
                                                    logger->tget(l_c_rd_avg),
                                                    logger->get(l_c_rd_sqsum),
                                                    nr_read_request));
    message.push_back(metric);
  }

  // write latency
  if (_collect_and_send_global_metrics ||
      session->mds_metric_flags.test(CLIENT_METRIC_TYPE_WRITE_LATENCY)) {
    metric = ClientMetricMessage(WriteLatencyPayload(logger->tget(l_c_wrlat),
                                                     logger->tget(l_c_wr_avg),
                                                     logger->get(l_c_wr_sqsum),
                                                     nr_write_request));
    message.push_back(metric);
  }

  // metadata latency
  if (_collect_and_send_global_metrics ||
      session->mds_metric_flags.test(CLIENT_METRIC_TYPE_METADATA_LATENCY)) {
    metric = ClientMetricMessage(MetadataLatencyPayload(logger->tget(l_c_lat),
                                                        logger->tget(l_c_md_avg),
                                                        logger->get(l_c_md_sqsum),
                                                        nr_metadata_request));
    message.push_back(metric);
  }

  // cap hit ratio -- nr_caps is unused right now
  if (_collect_and_send_global_metrics ||
      session->mds_metric_flags.test(CLIENT_METRIC_TYPE_CAP_INFO)) {
    auto [cap_hits, cap_misses] = get_cap_hit_rates();
    metric = ClientMetricMessage(CapInfoPayload(cap_hits, cap_misses, 0));
    message.push_back(metric);
  }

  // dentry lease hit ratio
  if (_collect_and_send_global_metrics ||
      session->mds_metric_flags.test(CLIENT_METRIC_TYPE_DENTRY_LEASE)) {
    auto [dlease_hits, dlease_misses, nr] = get_dlease_hit_rates();
    metric = ClientMetricMessage(DentryLeasePayload(dlease_hits, dlease_misses, nr));
    message.push_back(metric);
  }

  // opened files
  if (_collect_and_send_global_metrics ||
      session->mds_metric_flags.test(CLIENT_METRIC_TYPE_OPENED_FILES)) {
    auto [opened_files, total_inodes] = get_opened_files_rates();
    metric = ClientMetricMessage(OpenedFilesPayload(opened_files, total_inodes));
    message.push_back(metric);
  }

  // pinned i_caps
  if (_collect_and_send_global_metrics ||
      session->mds_metric_flags.test(CLIENT_METRIC_TYPE_PINNED_ICAPS)) {
    auto [pinned_icaps, total_inodes] = get_pinned_icaps_rates();
    metric = ClientMetricMessage(PinnedIcapsPayload(pinned_icaps, total_inodes));
    message.push_back(metric);
  }

  // opened inodes
  if (_collect_and_send_global_metrics ||
      session->mds_metric_flags.test(CLIENT_METRIC_TYPE_OPENED_INODES)) {
    auto [opened_inodes, total_inodes] = get_opened_inodes_rates();
    metric = ClientMetricMessage(OpenedInodesPayload(opened_inodes, total_inodes));
    message.push_back(metric);
  }

  // read io sizes
  if (_collect_and_send_global_metrics ||
      session->mds_metric_flags.test(CLIENT_METRIC_TYPE_READ_IO_SIZES)) {
    metric = ClientMetricMessage(ReadIoSizesPayload(total_read_ops,
                                                    total_read_size));
    message.push_back(metric);
  }

  // write io sizes
  if (_collect_and_send_global_metrics ||
      session->mds_metric_flags.test(CLIENT_METRIC_TYPE_WRITE_IO_SIZES)) {
    metric = ClientMetricMessage(WriteIoSizesPayload(total_write_ops,
                                                     total_write_size));
    message.push_back(metric);
  }

  session->con->send_message2(make_message<MClientMetrics>(std::move(message)));
}
6993
7c673cae
FG
/**
 * Record the renewal timestamp and request cap renewal from every MDS
 * that has at least rejoined the cluster.
 */
void Client::renew_caps()
{
  ldout(cct, 10) << "renew_caps()" << dendl;
  last_cap_renew = ceph::coarse_mono_clock::now();

  for (auto &p : mds_sessions) {
    ldout(cct, 15) << "renew_caps requesting from mds." << p.first << dendl;
    if (mdsmap->get_state(p.first) >= MDSMap::STATE_REJOIN)
      renew_caps(p.second.get());
  }
}
7005
/**
 * Send a RENEWCAPS request to one MDS.  The bumped cap_renew_seq lets
 * the reply handler match this request when updating cap TTLs.
 *
 * @param session the MDS session to renew caps with.
 */
void Client::renew_caps(MetaSession *session)
{
  ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
  session->last_cap_renew_request = ceph_clock_now();
  uint64_t seq = ++session->cap_renew_seq;
  session->con->send_message2(make_message<MClientSession>(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
}
7013
7014
7015// ===============================================================
7016// high level (POSIXy) interface
7017
/**
 * Issue a LOOKUP (or LOOKUPSNAP inside a snapdir) request to the MDS
 * for one name in a directory.
 *
 * @param dir    parent directory inode.
 * @param name   dentry name to look up.
 * @param mask   getattr caps mask to request on the target.
 * @param target out: resulting inode on success.
 * @param perms  caller credentials.
 * @return 0 on success, negative CEPHFS_* error otherwise.
 */
int Client::_do_lookup(Inode *dir, const string& name, int mask,
                       InodeRef *target, const UserPerm& perms)
{
  int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
  MetaRequest *req = new MetaRequest(op);
  filepath path;
  dir->make_nosnap_relative_path(path);
  path.push_dentry(name);
  req->set_filepath(path);
  req->set_inode(dir);
  // Debug option forces extra cap bits onto lookups to exercise the MDS.
  if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
    mask |= DEBUG_GETATTR_CAPS;
  req->head.args.getattr.mask = mask;

  ldout(cct, 10) << __func__ << " on " << path << dendl;

  int r = make_request(req, perms, target);
  ldout(cct, 10) << __func__ << " res is " << r << dendl;
  return r;
}
7038
f67539c2
TL
/**
 * Check whether a cached dentry's MDS lease is still valid:
 * the issuing MDS session must still exist, its cap TTL must not have
 * expired, and its cap generation must match the one recorded on the
 * dentry.  Updates the dentry-lease hit/miss counters as a side effect.
 *
 * Caller must hold client_lock.
 *
 * @param dn dentry to validate.
 * @return true if the lease is valid (counts as a dlease hit).
 */
bool Client::_dentry_valid(const Dentry *dn)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  // is dn lease valid?
  utime_t now = ceph_clock_now();
  if (dn->lease_mds >= 0 && dn->lease_ttl > now &&
      mds_sessions.count(dn->lease_mds)) {
    auto s = mds_sessions.at(dn->lease_mds);
    if (s->cap_ttl > now && s->cap_gen == dn->lease_gen) {
      dlease_hit();
      return true;
    }

    ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
                   << " vs lease_gen " << dn->lease_gen << dendl;
  }

  dlease_miss();
  return false;
}
7060
/**
 * Resolve one path component in a directory, preferring the local
 * dentry cache and falling back to an MDS lookup.
 *
 * Fast paths: "." and "..", the snapdir name, and cached dentries
 * covered by a valid dentry lease or by the directory's FILE_SHARED
 * cap.  A directory with I_COMPLETE lets us conclude ENOENT locally
 * without asking the MDS.  After a successful MDS lookup, we loop back
 * (relookup) once so the now-cached dentry can supply alternate_name.
 *
 * @param dir            parent directory inode.
 * @param dname          component name.
 * @param mask           caps to require/request on the target.
 * @param target         out: resolved inode.
 * @param perms          caller credentials.
 * @param alternate_name out (optional): dentry's alternate name.
 * @return 0 on success, negative CEPHFS_* error otherwise.
 */
int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
                    const UserPerm& perms, std::string* alternate_name)
{
  int r = 0;
  Dentry *dn = NULL;
  bool did_lookup_request = false;
  // can only request shared caps
  mask &= CEPH_CAP_ANY_SHARED | CEPH_STAT_RSTAT;

  if (dname == "..") {
    if (dir->dentries.empty()) {
      // No parent linkage cached: ask a random MDS for the parent.
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
      filepath path(dir->ino);
      req->set_filepath(path);

      InodeRef tmptarget;
      // NOTE(review): this inner r shadows the outer one, so a failed
      // LOOKUPPARENT still returns 0 with *target = dir (".." of an
      // unlinked/root-like dir resolves to itself). Presumably
      // intentional -- confirm before changing.
      int r = make_request(req, perms, &tmptarget, NULL, rand() % mdsmap->get_num_in_mds());

      if (r == 0) {
        *target = std::move(tmptarget);
        ldout(cct, 8) << __func__ << " found target " << (*target)->ino << dendl;
      } else {
        *target = dir;
      }
    }
    else
      *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
    goto done;
  }

  if (dname == ".") {
    *target = dir;
    goto done;
  }

  if (!dir->is_dir()) {
    r = -CEPHFS_ENOTDIR;
    goto done;
  }

  if (dname.length() > NAME_MAX) {
    r = -CEPHFS_ENAMETOOLONG;
    goto done;
  }

  if (dname == cct->_conf->client_snapdir &&
      dir->snapid == CEPH_NOSNAP) {
    *target = open_snapdir(dir);
    goto done;
  }

relookup:
  if (dir->dir &&
      dir->dir->dentries.count(dname)) {
    dn = dir->dir->dentries[dname];

    ldout(cct, 20) << __func__ << " have " << *dn << " from mds." << dn->lease_mds
                   << " ttl " << dn->lease_ttl << " seq " << dn->lease_seq << dendl;

    if (!dn->inode || dn->inode->caps_issued_mask(mask, true)) {
      if (_dentry_valid(dn)) {
        // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
        // make trim_caps() behave.
        dir->try_touch_cap(dn->lease_mds);
        goto hit_dn;
      }
      // dir shared caps?
      if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
        if (dn->cap_shared_gen == dir->shared_gen &&
            (!dn->inode || dn->inode->caps_issued_mask(mask, true)))
          goto hit_dn;
        // Negative dentry + complete dir contents => name cannot exist.
        if (!dn->inode && (dir->flags & I_COMPLETE)) {
          ldout(cct, 10) << __func__ << " concluded ENOENT locally for "
                         << *dir << " dn '" << dname << "'" << dendl;
          return -CEPHFS_ENOENT;
        }
      }
    } else {
      ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
    }
  } else {
    // can we conclude ENOENT locally?
    if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED, true) &&
        (dir->flags & I_COMPLETE)) {
      ldout(cct, 10) << __func__ << " concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
      return -CEPHFS_ENOENT;
    }
  }

  // After one MDS lookup, accept whatever the cache now says; do not
  // loop forever if the dentry still fails validation.
  if (did_lookup_request) {
    r = 0;
    goto done;
  }
  r = _do_lookup(dir, dname, mask, target, perms);
  did_lookup_request = true;
  if (r == 0) {
    /* complete lookup to get dentry for alternate_name */
    goto relookup;
  } else {
    goto done;
  }

 hit_dn:
  if (dn->inode) {
    *target = dn->inode;
    if (alternate_name)
      *alternate_name = dn->alternate_name;
  } else {
    r = -CEPHFS_ENOENT;
  }
  touch_dn(dn);
  goto done;

 done:
  if (r < 0)
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << r << dendl;
  else
    ldout(cct, 10) << __func__ << " " << *dir << " " << dname << " = " << **target << dendl;
  return r;
}
7181
1e59de90 7182Dentry *Client::get_or_create(Inode *dir, const char* name)
7c673cae
FG
7183{
7184 // lookup
11fdf7f2 7185 ldout(cct, 20) << __func__ << " " << *dir << " name " << name << dendl;
7c673cae 7186 dir->open_dir();
1e59de90
TL
7187 if (dir->dir->dentries.count(name))
7188 return dir->dir->dentries[name];
7189 else // otherwise link up a new one
7190 return link(dir->dir, name, NULL, NULL);
7c673cae
FG
7191}
7192
f67539c2
TL
/**
 * Public path-resolution entry point: resolve @path to a
 * walk_dentry_result under the client lock.
 *
 * @param path      path to resolve.
 * @param wdr       out: resolved inode + alternate name.
 * @param perms     caller credentials.
 * @param followsym whether to follow a trailing symlink.
 * @return 0 on success, -CEPHFS_ENOTCONN if not mounted, or a
 *         negative CEPHFS_* error from path_walk().
 */
int Client::walk(std::string_view path, walk_dentry_result* wdr, const UserPerm& perms, bool followsym)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 10) << __func__ << ": " << path << dendl;

  std::scoped_lock lock(client_lock);

  return path_walk(path, wdr, perms, followsym);
}
7205
/**
 * Convenience overload: resolve @origpath and return just the inode,
 * discarding the alternate name from the full walk_dentry_result.
 */
int Client::path_walk(const filepath& origpath, InodeRef *end,
                      const UserPerm& perms, bool followsym, int mask, InodeRef dirinode)
{
  walk_dentry_result wdr;
  int rc = path_walk(origpath, &wdr, perms, followsym, mask, dirinode);
  *end = std::move(wdr.in);
  return rc;
}
7214
b3b6e05e
TL
/**
 * Core path resolution: walk @origpath component by component from
 * root (absolute), cwd, or @dirinode, looking up each name and
 * expanding symlinks.
 *
 * Directory symlinks (non-final components) are always followed; a
 * trailing symlink is followed only when @followsym is true.  Symlink
 * expansion is bounded by MAXSYMLINKS (-CEPHFS_ELOOP beyond that).
 * When client_permissions is set, each traversed directory is checked
 * with may_lookup().  @mask caps are requested only on the final
 * component.
 *
 * Caller must hold client_lock.
 *
 * @return 0 on success (filling *result), negative CEPHFS_* error
 *         otherwise.
 */
int Client::path_walk(const filepath& origpath, walk_dentry_result* result, const UserPerm& perms,
                      bool followsym, int mask, InodeRef dirinode)
{
  filepath path = origpath;
  InodeRef cur;
  std::string alternate_name;
  if (origpath.absolute())
    cur = root;
  else if (!dirinode)
    cur = cwd;
  else {
    cur = dirinode;
  }
  ceph_assert(cur);

  ldout(cct, 20) << __func__ << " cur=" << *cur << dendl;
  ldout(cct, 10) << __func__ << " " << path << dendl;

  int symlinks = 0;

  unsigned i=0;
  while (i < path.depth() && cur) {
    int caps = 0;
    const string &dname = path[i];
    ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
    ldout(cct, 20) << " (path is " << path << ")" << dendl;
    InodeRef next;
    if (cct->_conf->client_permissions) {
      int r = may_lookup(cur.get(), perms);
      if (r < 0)
	return r;
      caps = CEPH_CAP_AUTH_SHARED;
    }

    /* Get extra requested caps on the last component */
    if (i == (path.depth() - 1))
      caps |= mask;
    int r = _lookup(cur.get(), dname, caps, &next, perms, &alternate_name);
    if (r < 0)
      return r;
    // only follow trailing symlink if followsym. always follow
    // 'directory' symlinks.
    if (next && next->is_symlink()) {
      symlinks++;
      ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
      if (symlinks > MAXSYMLINKS) {
	return -CEPHFS_ELOOP;
      }

      if (i < path.depth() - 1) {
	// dir symlink
	// replace consumed components of path with symlink dir target
	filepath resolved(next->symlink.c_str());
	resolved.append(path.postfixpath(i + 1));
	path = resolved;
	i = 0;
	if (next->symlink[0] == '/') {
	  cur = root;
	}
	continue;
      } else if (followsym) {
	if (next->symlink[0] == '/') {
	  path = next->symlink.c_str();
	  i = 0;
	  // reset position
	  cur = root;
	} else {
	  filepath more(next->symlink.c_str());
	  // we need to remove the symlink component from off of the path
	  // before adding the target that the symlink points to. remain
	  // at the same position in the path.
	  path.pop_dentry();
	  path.append(more);
	}
	continue;
      }
    }
    cur.swap(next);
    i++;
  }
  if (!cur)
    return -CEPHFS_ENOENT;
  if (result) {
    result->in = std::move(cur);
    result->alternate_name = std::move(alternate_name);
  }
  return 0;
}
7303
7304
7305// namespace ops
7306
/**
 * Create a hard link at @relpath pointing to @relexisting.
 *
 * Refuses to link directories (EPERM) and to create "/" (EEXIST).
 * When client_permissions is set, checks may_hardlink() on the source
 * and may_create() on the destination directory.
 *
 * @param relexisting    existing path (symlinks followed).
 * @param relpath        new link path.
 * @param perm           caller credentials.
 * @param alternate_name alternate name to attach to the new dentry.
 * @return 0 on success, negative CEPHFS_* error otherwise.
 */
int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm, std::string alternate_name)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "link" << std::endl;
  tout(cct) << relexisting << std::endl;
  tout(cct) << relpath << std::endl;

  filepath existing(relexisting);

  InodeRef in, dir;

  std::scoped_lock lock(client_lock);
  int r = path_walk(existing, &in, perm, true);
  if (r < 0)
    return r;
  if (std::string(relpath) == "/") {
    r = -CEPHFS_EEXIST;
    return r;
  }
  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();

  r = path_walk(path, &dir, perm, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    if (S_ISDIR(in->mode)) {
      // POSIX forbids hard links to directories.
      r = -CEPHFS_EPERM;
      return r;
    }
    r = may_hardlink(in.get(), perm);
    if (r < 0)
      return r;
    r = may_create(dir.get(), perm);
    if (r < 0)
      return r;
  }
  r = _link(in.get(), dir.get(), name.c_str(), perm, std::move(alternate_name));
  return r;
}
7351
/// Remove a non-directory entry; equivalent to unlinkat() relative to
/// the CWD with no flags.
int Client::unlink(const char *relpath, const UserPerm& perm)
{
  return unlinkat(CEPHFS_AT_FDCWD, relpath, 0, perm);
}
7356
/**
 * Remove the entry @relpath relative to @dirfd (or the CWD for
 * CEPHFS_AT_FDCWD).  With AT_REMOVEDIR in @flags this removes a
 * directory (rmdir semantics), otherwise a non-directory (unlink).
 *
 * "/" itself cannot be removed: EBUSY for a directory, EISDIR
 * otherwise.  When client_permissions is set, may_delete() is checked
 * on the parent directory.
 *
 * @return 0 on success, negative CEPHFS_* error otherwise.
 */
int Client::unlinkat(int dirfd, const char *relpath, int flags, const UserPerm& perm)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << flags << std::endl;

  if (std::string(relpath) == "/") {
    return flags & AT_REMOVEDIR ? -CEPHFS_EBUSY : -CEPHFS_EISDIR;
  }

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;

  std::scoped_lock lock(client_lock);

  InodeRef dirinode;
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  // Walk to the parent directory (symlinks followed), then operate on
  // the final component by name.
  r = path_walk(path, &dir, perm, true, 0, dirinode);
  if (r < 0) {
    return r;
  }
  if (cct->_conf->client_permissions) {
    r = may_delete(dir.get(), name.c_str(), perm);
    if (r < 0) {
      return r;
    }
  }
  if (flags & AT_REMOVEDIR) {
    r = _rmdir(dir.get(), name.c_str(), perm);
  } else {
    r = _unlink(dir.get(), name.c_str(), perm);
  }
  return r;
}
7403
f67539c2 7404int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm, std::string alternate_name)
7c673cae 7405{
f67539c2
TL
7406 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7407 if (!mref_reader.is_state_satisfied())
7408 return -CEPHFS_ENOTCONN;
7409
11fdf7f2 7410 tout(cct) << __func__ << std::endl;
7c673cae
FG
7411 tout(cct) << relfrom << std::endl;
7412 tout(cct) << relto << std::endl;
7413
7414 if (std::string(relfrom) == "/" || std::string(relto) == "/")
f67539c2 7415 return -CEPHFS_EBUSY;
7c673cae
FG
7416
7417 filepath from(relfrom);
7418 filepath to(relto);
7419 string fromname = from.last_dentry();
7420 from.pop_dentry();
7421 string toname = to.last_dentry();
7422 to.pop_dentry();
7423
7424 InodeRef fromdir, todir;
f67539c2
TL
7425
7426 std::scoped_lock lock(client_lock);
7c673cae
FG
7427 int r = path_walk(from, &fromdir, perm);
7428 if (r < 0)
7429 goto out;
7430 r = path_walk(to, &todir, perm);
7431 if (r < 0)
7432 goto out;
7433
7434 if (cct->_conf->client_permissions) {
7435 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
7436 if (r < 0)
7437 return r;
7438 r = may_delete(todir.get(), toname.c_str(), perm);
f67539c2 7439 if (r < 0 && r != -CEPHFS_ENOENT)
7c673cae
FG
7440 return r;
7441 }
f67539c2 7442 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm, std::move(alternate_name));
7c673cae
FG
7443out:
7444 return r;
7445}
7446
7447// dirs
7448
f67539c2 7449int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm, std::string alternate_name)
b3b6e05e
TL
7450{
7451 return mkdirat(CEPHFS_AT_FDCWD, relpath, mode, perm, alternate_name);
7452}
7453
/**
 * Create directory @relpath (mode @mode) relative to @dirfd, or the
 * CWD for CEPHFS_AT_FDCWD.
 *
 * "/" already exists (EEXIST).  When client_permissions is set,
 * may_create() is checked on the parent directory.
 *
 * @param alternate_name alternate name for the new dentry.
 * @return 0 on success, negative CEPHFS_* error otherwise.
 */
int Client::mkdirat(int dirfd, const char *relpath, mode_t mode, const UserPerm& perm,
                    std::string alternate_name)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;
  ldout(cct, 10) << __func__ << ": " << relpath << dendl;

  if (std::string(relpath) == "/") {
    return -CEPHFS_EEXIST;
  }

  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;

  std::scoped_lock lock(client_lock);

  InodeRef dirinode;
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  // Walk to the parent directory, then create the final component.
  r = path_walk(path, &dir, perm, true, 0, dirinode);
  if (r < 0) {
    return r;
  }
  if (cct->_conf->client_permissions) {
    r = may_create(dir.get(), perm);
    if (r < 0) {
      return r;
    }
  }
  return _mkdir(dir.get(), name.c_str(), mode, perm, 0, {}, std::move(alternate_name));
}
7496
// mkdir -p equivalent: create every missing directory along 'relpath'
// (relative to the CWD). First walks as far as the path already exists,
// then creates the remaining components one level at a time.
// Returns 0 on success or a negative CEPHFS_* error.
int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;

  //get through existing parts of path
  filepath path(relpath);
  unsigned int i;
  int r = 0, caps = 0;
  InodeRef cur, next;

  std::scoped_lock lock(client_lock);
  cur = cwd;
  for (i=0; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_lookup(cur.get(), perms);
      if (r < 0)
        break;
      // Request AUTH_SHARED caps so subsequent may_create() checks are valid.
      caps = CEPH_CAP_AUTH_SHARED;
    }
    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
    if (r < 0)
      break;
    cur.swap(next);
  }
  // Only ENOENT means "stop here and start creating"; any other error
  // (or full success, r == 0) is returned to the caller as-is.
  if (r!=-CEPHFS_ENOENT) return r;
  ldout(cct, 20) << __func__ << " got through " << i << " directories on path " << relpath << dendl;
  //make new directory at each level
  for (; i<path.depth(); ++i) {
    if (cct->_conf->client_permissions) {
      r = may_create(cur.get(), perms);
      if (r < 0)
        return r;
    }
    //make new dir
    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);

    //check proper creation/existence
    // EEXIST on an intermediate component is fine (raced with another
    // creator): fall back to looking the component up instead.
    if(-CEPHFS_EEXIST == r && i < path.depth() - 1) {
      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
    }
    if (r < 0)
      return r;
    //move to new dir and continue
    cur.swap(next);
    ldout(cct, 20) << __func__ << ": successfully created directory "
                   << filepath(cur->ino).get_path() << dendl;
  }
  return 0;
}
7553
// Remove the directory at 'relpath' (relative to the CWD).
// Thin wrapper over unlinkat() with AT_REMOVEDIR, anchored at CEPHFS_AT_FDCWD.
int Client::rmdir(const char *relpath, const UserPerm& perms)
{
  return unlinkat(CEPHFS_AT_FDCWD, relpath, AT_REMOVEDIR, perms);
}
7558
7559int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
f67539c2
TL
7560{
7561 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
7562 if (!mref_reader.is_state_satisfied())
7563 return -CEPHFS_ENOTCONN;
7564
11fdf7f2 7565 tout(cct) << __func__ << std::endl;
7c673cae
FG
7566 tout(cct) << relpath << std::endl;
7567 tout(cct) << mode << std::endl;
7568 tout(cct) << rdev << std::endl;
7569
7570 if (std::string(relpath) == "/")
f67539c2 7571 return -CEPHFS_EEXIST;
7c673cae
FG
7572
7573 filepath path(relpath);
7574 string name = path.last_dentry();
7575 path.pop_dentry();
7576 InodeRef dir;
f67539c2
TL
7577
7578 std::scoped_lock lock(client_lock);
7c673cae
FG
7579 int r = path_walk(path, &dir, perms);
7580 if (r < 0)
7581 return r;
7582 if (cct->_conf->client_permissions) {
7583 int r = may_create(dir.get(), perms);
7584 if (r < 0)
7585 return r;
7586 }
7587 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
7588}
7589
7590// symlinks
7591
f67539c2 7592int Client::symlink(const char *target, const char *relpath, const UserPerm& perms, std::string alternate_name)
b3b6e05e
TL
7593{
7594 return symlinkat(target, CEPHFS_AT_FDCWD, relpath, perms, alternate_name);
7595}
7596
// Create a symlink pointing at 'target', located at 'relpath' resolved
// relative to the inode behind 'dirfd' (or the CWD for CEPHFS_AT_FDCWD).
// Returns 0 on success or a negative CEPHFS_* error:
//   -CEPHFS_ENOTCONN if the client is not mounted,
//   -CEPHFS_EEXIST   if the path is exactly "/",
//   or any error from fd lookup, path walk, permission check, or _symlink.
int Client::symlinkat(const char *target, int dirfd, const char *relpath, const UserPerm& perms,
                      std::string alternate_name)
{
  // Guard against use while unmounted/unmounting.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << target << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;

  if (std::string(relpath) == "/") {
    return -CEPHFS_EEXIST;
  }

  // Split off the final component: walk to the parent, then link 'name'.
  filepath path(relpath);
  string name = path.last_dentry();
  path.pop_dentry();
  InodeRef dir;

  std::scoped_lock lock(client_lock);

  // Resolve dirfd to its inode under client_lock.
  InodeRef dirinode;
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }
  r = path_walk(path, &dir, perms, true, 0, dirinode);
  if (r < 0) {
    return r;
  }
  // Client-side permission enforcement is optional (client_permissions conf).
  if (cct->_conf->client_permissions) {
    int r = may_create(dir.get(), perms);
    if (r < 0) {
      return r;
    }
  }
  return _symlink(dir.get(), name.c_str(), target, perms, std::move(alternate_name));
}
7638
// Read the target of the symlink at 'relpath' (relative to the CWD) into
// 'buf' (at most 'size' bytes, not NUL-terminated).
// Thin wrapper over readlinkat() anchored at CEPHFS_AT_FDCWD.
int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
{
  return readlinkat(CEPHFS_AT_FDCWD, relpath, buf, size, perms);
}
7643
// Read the target of the symlink at 'relpath', resolved relative to the
// inode behind 'dirfd' (or the CWD for CEPHFS_AT_FDCWD), into 'buf'.
// At most 'size' bytes are written; the result is not NUL-terminated.
// Returns the number of bytes copied, or a negative CEPHFS_* error.
int Client::readlinkat(int dirfd, const char *relpath, char *buf, loff_t size, const UserPerm& perms) {
  // Guard against use while unmounted/unmounting.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;

  InodeRef dirinode;
  std::scoped_lock lock(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  InodeRef in;
  filepath path(relpath);
  // followsym=false: we want the symlink inode itself, not its target.
  r = path_walk(path, &in, perms, false, 0, dirinode);
  if (r < 0) {
    return r;
  }

  return _readlink(in.get(), buf, size);
}
7670
7671int Client::_readlink(Inode *in, char *buf, size_t size)
7672{
7673 if (!in->is_symlink())
f67539c2 7674 return -CEPHFS_EINVAL;
7c673cae
FG
7675
7676 // copy into buf (at most size bytes)
7677 int r = in->symlink.length();
7678 if (r > (int)size)
7679 r = size;
7680 memcpy(buf, in->symlink.c_str(), r);
7681 return r;
7682}
7683
7684
7685// inode stuff
7686
// Fetch inode attributes covered by 'mask'. If the client already holds
// caps covering the mask (touching them to keep them warm), the cached
// attributes are considered valid and no request is sent — unless 'force'
// is set, in which case a CEPH_MDS_OP_GETATTR is always issued to the MDS.
// Returns 0 on success or a negative error from make_request().
int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
{
  bool yes = in->caps_issued_mask(mask, true);

  ldout(cct, 10) << __func__ << " mask " << ccap_string(mask) << " issued=" << yes << dendl;
  if (yes && !force)
    return 0;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);
  req->head.args.getattr.mask = mask;

  int res = make_request(req, perms);
  ldout(cct, 10) << __func__ << " result=" << res << dendl;
  return res;
}
7706
1d09f67e
TL
7707int Client::_getvxattr(
7708 Inode *in,
7709 const UserPerm& perms,
7710 const char *xattr_name,
7711 ssize_t size,
7712 void *value,
7713 mds_rank_t rank)
7714{
7715 if (!xattr_name || strlen(xattr_name) <= 0 || strlen(xattr_name) > 255) {
7716 return -CEPHFS_ENODATA;
7717 }
7718
7719 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETVXATTR);
7720 filepath path;
7721 in->make_nosnap_relative_path(path);
7722 req->set_filepath(path);
7723 req->set_inode(in);
7724 req->set_string2(xattr_name);
7725
7726 bufferlist bl;
39ae355f
TL
7727 int res = make_request(req, perms, nullptr, nullptr, rank, &bl,
7728 CEPHFS_FEATURE_OP_GETVXATTR);
1d09f67e
TL
7729 ldout(cct, 10) << __func__ << " result=" << res << dendl;
7730
7731 if (res < 0) {
39ae355f
TL
7732 if (res == -CEPHFS_EOPNOTSUPP) {
7733 return -CEPHFS_ENODATA;
7734 }
1d09f67e
TL
7735 return res;
7736 }
7737
7738 std::string buf;
7739 auto p = bl.cbegin();
7740
7741 DECODE_START(1, p);
7742 decode(buf, p);
7743 DECODE_FINISH(p);
7744
7745 ssize_t len = buf.length();
7746
7747 res = len; // refer to man getxattr(2) for output buffer size == 0
7748
7749 if (size > 0) {
7750 if (len > size) {
7751 res = -CEPHFS_ERANGE; // insufficient output buffer space
7752 } else {
7753 memcpy(value, buf.c_str(), len);
7754 }
7755 }
7756 return res;
7757}
7758
// Core setattr implementation. For each attribute bit in 'mask', either:
//   - apply the change purely locally and dirty the corresponding cap
//     (possible when we hold the relevant *_EXCL cap),
//   - queue it into a CEPH_MDS_OP_SETATTR request (recording which caps
//     to drop in 'inode_drop'), or
//   - drop the bit entirely when the value would not actually change.
// If every bit is satisfied locally, no MDS request is sent at all.
//
// 'aux' carries the fscrypt_auth/fscrypt_file payload and is required
// (and only meaningful) for CEPH_SETATTR_FSCRYPT_AUTH / _FSCRYPT_FILE,
// which are mutually exclusive in a single call.
// 'inp' (optional) receives the post-request inode ref from make_request.
// Returns 0 on success or a negative CEPHFS_* error.
int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
                        const UserPerm& perms, InodeRef *inp,
                        std::vector<uint8_t>* aux)
{
  int issued = in->caps_issued();
  union ceph_mds_request_args args;
  bool kill_sguid = false;
  int inode_drop = 0;
  size_t auxsize = 0;

  if (aux)
    auxsize = aux->size();

  ldout(cct, 10) << __func__ << " mask " << mask << " issued " <<
    ccap_string(issued) << " aux size " << auxsize << dendl;

  // Snapshots are read-only.
  if (in->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  // Growing a file must not push the containing quota over its byte limit.
  if ((mask & CEPH_SETATTR_SIZE) &&
      (uint64_t)stx->stx_size > in->size &&
      is_quota_bytes_exceeded(in, (uint64_t)stx->stx_size - in->size,
                              perms)) {
    return -CEPHFS_EDQUOT;
  }

  // Can't set fscrypt_auth and file at the same time!
  if ((mask & (CEPH_SETATTR_FSCRYPT_AUTH|CEPH_SETATTR_FSCRYPT_FILE)) ==
      (CEPH_SETATTR_FSCRYPT_AUTH|CEPH_SETATTR_FSCRYPT_FILE))
    return -CEPHFS_EINVAL;

  // The fscrypt bits require the 'aux' payload.
  if (!aux && (mask & (CEPH_SETATTR_FSCRYPT_AUTH|CEPH_SETATTR_FSCRYPT_FILE)))
    return -CEPHFS_EINVAL;

  memset(&args, 0, sizeof(args));

  // make the change locally?
  if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
      (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
    ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
                   << " != cap dirtier " << in->cap_dirtier_uid << ":"
                   << in->cap_dirtier_gid << ", forcing sync setattr"
                   << dendl;
    /*
     * This works because we implicitly flush the caps as part of the
     * request, so the cap update check will happen with the writeback
     * cap context, and then the setattr check will happen with the
     * caller's context.
     *
     * In reality this pattern is likely pretty rare (different users
     * setattr'ing the same file). If that turns out not to be the
     * case later, we can build a more complex pipelined cap writeback
     * infrastructure...
     */
    mask |= CEPH_SETATTR_CTIME;
  }

  if (!mask) {
    // caller just needs us to bump the ctime
    in->ctime = ceph_clock_now();
    in->cap_dirtier_uid = perms.uid();
    in->cap_dirtier_gid = perms.gid();
    if (issued & CEPH_CAP_AUTH_EXCL)
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
    else if (issued & CEPH_CAP_FILE_EXCL)
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
    else if (issued & CEPH_CAP_XATTR_EXCL)
      in->mark_caps_dirty(CEPH_CAP_XATTR_EXCL);
    else
      mask |= CEPH_SETATTR_CTIME;
  }

  // Only honor KILL_SGUID locally when we hold AUTH_EXCL.
  if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
    kill_sguid = !!(mask & CEPH_SETATTR_KILL_SGUID);
  }

  if (mask & CEPH_SETATTR_UID) {
    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      // AUTH_EXCL: apply locally and dirty the cap.
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->uid = stx->stx_uid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_UID;
      kill_sguid = true;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
               in->uid != stx->stx_uid) {
      // Value unknown or actually changing: ask the MDS.
      args.setattr.uid = stx->stx_uid;
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      // No-op change; drop the bit.
      mask &= ~CEPH_SETATTR_UID;
    }
  }

  if (mask & CEPH_SETATTR_GID) {
    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->gid = stx->stx_gid;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_GID;
      kill_sguid = true;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
               in->gid != stx->stx_gid) {
      args.setattr.gid = stx->stx_gid;
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_GID;
    }
  }

  if (mask & CEPH_SETATTR_MODE) {
    ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      // Only the permission bits (07777) change; the type bits are kept.
      in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_MODE;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
               in->mode != stx->stx_mode) {
      args.setattr.mode = stx->stx_mode;
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_MODE;
    }
  } else if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL) && S_ISREG(in->mode)) {
    // No explicit mode change, but a write/chown may require clearing
    // setuid/setgid on a regular file (holding AUTH_EXCL, done locally).
    if (kill_sguid && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
      in->mode &= ~(S_ISUID|S_ISGID);
    } else {
      if (mask & CEPH_SETATTR_KILL_SUID) {
        in->mode &= ~S_ISUID;
      }
      if (mask & CEPH_SETATTR_KILL_SGID) {
        in->mode &= ~S_ISGID;
      }
    }
    mask &= ~(CEPH_SETATTR_KILL_SGUID|CEPH_SETATTR_KILL_SUID|CEPH_SETATTR_KILL_SGID);
    in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
  }

  if (mask & CEPH_SETATTR_BTIME) {
    ldout(cct,10) << "changing btime to " << in->btime << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->btime = utime_t(stx->stx_btime);
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_BTIME;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
               in->btime != utime_t(stx->stx_btime)) {
      args.setattr.btime = utime_t(stx->stx_btime);
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_BTIME;
    }
  }

  if (mask & CEPH_SETATTR_FSCRYPT_AUTH) {
    ldout(cct,10) << "resetting cached fscrypt_auth field. size now "
                  << in->fscrypt_auth.size() << dendl;

    if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->fscrypt_auth = *aux;
      in->mark_caps_dirty(CEPH_CAP_AUTH_EXCL);
      mask &= ~CEPH_SETATTR_FSCRYPT_AUTH;
    } else if (!in->caps_issued_mask(CEPH_CAP_AUTH_SHARED) ||
               in->fscrypt_auth != *aux) {
      // Payload travels in req->fscrypt_auth below, not in 'args'.
      inode_drop |= CEPH_CAP_AUTH_SHARED;
    } else {
      mask &= ~CEPH_SETATTR_FSCRYPT_AUTH;
    }
  }

  if (mask & CEPH_SETATTR_SIZE) {
    if ((uint64_t)stx->stx_size >= mdsmap->get_max_filesize()) {
      //too big!
      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
      return -CEPHFS_EFBIG;
    }

    ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
    // Growing (or keeping) the size with FILE_EXCL can be done locally,
    // unless setuid/setgid also has to be cleared (MDS handles that).
    if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL) &&
        !(mask & CEPH_SETATTR_KILL_SGUID) &&
        stx->stx_size >= in->size) {
      if (stx->stx_size > in->size) {
        in->size = in->reported_size = stx->stx_size;
        in->cap_dirtier_uid = perms.uid();
        in->cap_dirtier_gid = perms.gid();
        in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
        mask &= ~(CEPH_SETATTR_SIZE);
        mask |= CEPH_SETATTR_MTIME;
      } else {
        // ignore it when size doesn't change
        mask &= ~(CEPH_SETATTR_SIZE);
      }
    } else {
      args.setattr.size = stx->stx_size;
      inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
                    CEPH_CAP_FILE_WR;
    }
  }

  if (mask & CEPH_SETATTR_FSCRYPT_FILE) {
    ldout(cct,10) << "resetting cached fscrypt_file field. size now "
                  << in->fscrypt_file.size() << dendl;

    if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->fscrypt_file = *aux;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~CEPH_SETATTR_FSCRYPT_FILE;
    } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
               in->fscrypt_file != *aux) {
      inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR;
    } else {
      mask &= ~CEPH_SETATTR_FSCRYPT_FILE;
    }
  }

  if (mask & CEPH_SETATTR_MTIME) {
    if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
      in->mtime = utime_t(stx->stx_mtime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~CEPH_SETATTR_MTIME;
    } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) &&
               utime_t(stx->stx_mtime) > in->mtime) {
      // With only FILE_WR we may still move mtime forward locally.
      in->mtime = utime_t(stx->stx_mtime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      mask &= ~CEPH_SETATTR_MTIME;
    } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
               in->mtime != utime_t(stx->stx_mtime)) {
      args.setattr.mtime = utime_t(stx->stx_mtime);
      inode_drop |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
                    CEPH_CAP_FILE_WR;
    } else {
      mask &= ~CEPH_SETATTR_MTIME;
    }
  }

  if (mask & CEPH_SETATTR_ATIME) {
    if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
      in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->time_warp_seq++;
      in->mark_caps_dirty(CEPH_CAP_FILE_EXCL);
      mask &= ~CEPH_SETATTR_ATIME;
    } else if (in->caps_issued_mask(CEPH_CAP_FILE_WR) &&
               utime_t(stx->stx_atime) > in->atime) {
      in->atime = utime_t(stx->stx_atime);
      in->ctime = ceph_clock_now();
      in->cap_dirtier_uid = perms.uid();
      in->cap_dirtier_gid = perms.gid();
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      mask &= ~CEPH_SETATTR_ATIME;
    } else if (!in->caps_issued_mask(CEPH_CAP_FILE_SHARED) ||
               in->atime != utime_t(stx->stx_atime)) {
      args.setattr.atime = utime_t(stx->stx_atime);
      inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
                    CEPH_CAP_FILE_WR;
    } else {
      mask &= ~CEPH_SETATTR_ATIME;
    }
  }

  if (!mask) {
    // Everything was handled locally (or dropped): no MDS round trip.
    in->change_attr++;
    // Keep the cached .snap dir's attributes in sync with its parent.
    if (in->is_dir() && in->snapid == CEPH_NOSNAP) {
      vinodeno_t vino(in->ino, CEPH_SNAPDIR);
      if (inode_map.count(vino)) {
        refresh_snapdir_attrs(inode_map[vino], in);
      }
    }
    return 0;
  }

  // Send the remaining attribute changes to the MDS.
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);

  filepath path;

  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args = args;
  req->inode_drop = inode_drop;
  if (mask & CEPH_SETATTR_FSCRYPT_AUTH) {
    req->fscrypt_auth = *aux;
  } else if (mask & CEPH_SETATTR_FSCRYPT_FILE) {
    req->fscrypt_file = *aux;
  }
  req->head.args.setattr.mask = mask;
  req->regetattr_mask = mask;

  int res = make_request(req, perms, inp);
  ldout(cct, 10) << "_setattr result=" << res << dendl;
  return res;
}
8080
8081/* Note that we only care about attrs that setattr cares about */
8082void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
8083{
8084 stx->stx_size = st->st_size;
8085 stx->stx_mode = st->st_mode;
8086 stx->stx_uid = st->st_uid;
8087 stx->stx_gid = st->st_gid;
11fdf7f2
TL
8088#ifdef __APPLE__
8089 stx->stx_mtime = st->st_mtimespec;
8090 stx->stx_atime = st->st_atimespec;
f67539c2
TL
8091#elif __WIN32
8092 stx->stx_mtime.tv_sec = st->st_mtime;
1e59de90 8093 stx->stx_mtime.tv_nsec = 0;
f67539c2 8094 stx->stx_atime.tv_sec = st->st_atime;
1e59de90 8095 stx->stx_atime.tv_nsec = 0;
11fdf7f2 8096#else
7c673cae
FG
8097 stx->stx_mtime = st->st_mtim;
8098 stx->stx_atime = st->st_atim;
11fdf7f2 8099#endif
7c673cae
FG
8100}
8101
// Internal setattr on a raw Inode*: augments a size change with the
// setuid/setgid-clearing bits, performs the change via _do_setattr(), and
// then reapplies POSIX ACL mode adjustments when the mode was changed.
// Returns 0 on success or a negative CEPHFS_* error.
int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
                       const UserPerm& perms, InodeRef *inp)
{
  // Truncation clears setuid/setgid like a write would (POSIX semantics).
  if (mask & CEPH_SETATTR_SIZE) {
    mask |= clear_suid_sgid(in, perms, true);
  }

  int ret = _do_setattr(in, stx, mask, perms, inp);
  if (ret < 0)
    return ret;
  if (mask & CEPH_SETATTR_MODE)
    ret = _posix_acl_chmod(in, stx->stx_mode, perms);
  return ret;
}
8116
// Setattr entry point taking an InodeRef: restricts 'mask' to the
// externally settable attribute bits, optionally enforces client-side
// permission checks, then delegates to __setattrx().
int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
                      const UserPerm& perms)
{
  // Drop any bits callers may not set directly (e.g. internal KILL_* bits).
  mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
           CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
           CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
           CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
  if (cct->_conf->client_permissions) {
    int r = may_setattr(in.get(), stx, mask, perms);
    if (r < 0)
      return r;
  }
  return __setattrx(in.get(), stx, mask, perms);
}
8131
8132int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
8133 const UserPerm& perms)
8134{
8135 struct ceph_statx stx;
8136
8137 stat_to_statx(attr, &stx);
8138 mask &= ~CEPH_SETATTR_BTIME;
181888fb
FG
8139
8140 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
8141 mask &= ~CEPH_SETATTR_UID;
8142 }
8143 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
8144 mask &= ~CEPH_SETATTR_GID;
8145 }
8146
7c673cae
FG
8147 return _setattrx(in, &stx, mask, perms);
8148}
8149
// Public setattr by path (follows symlinks): walk to the inode and apply
// the struct stat based attribute changes selected by 'mask'.
// Returns 0 on success or a negative CEPHFS_* error.
int Client::setattr(const char *relpath, struct stat *attr, int mask,
                    const UserPerm& perms)
{
  // Guard against use while unmounted/unmounting.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mask << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  return _setattr(in, attr, mask, perms);
}
8170
// Public statx-flavored setattr by path. AT_SYMLINK_NOFOLLOW in 'flags'
// makes the change apply to a symlink itself rather than its target.
// Returns 0 on success or a negative CEPHFS_* error.
int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
                     const UserPerm& perms, int flags)
{
  // Guard against use while unmounted/unmounting.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mask << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
  if (r < 0)
    return r;
  return _setattrx(in, stx, mask, perms);
}
8191
// setattr on an open file descriptor (struct stat flavor).
// Returns 0 on success, -CEPHFS_EBADF for an invalid fd or an O_PATH fd
// (which grants no attribute-changing rights, mirroring Linux), or a
// negative CEPHFS_* error from the underlying _setattr().
int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
{
  // Guard against use while unmounted/unmounting.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mask << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  return _setattr(f->inode, attr, mask, perms);
}
8212
// setattr on an open file descriptor (ceph_statx flavor).
// Same error behavior as fsetattr(): -CEPHFS_EBADF for an invalid or
// O_PATH fd, otherwise the result of _setattrx().
int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
{
  // Guard against use while unmounted/unmounting.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mask << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  return _setattrx(f->inode, stx, mask, perms);
}
8233
// stat(2) equivalent: walk 'relpath' (following symlinks), refresh the
// attributes selected by 'mask' from the MDS if needed, and fill 'stbuf'
// (plus optional directory fragment stats in 'dirstat').
// Returns the caps-issued value from fill_stat() via _getattr()'s 0, or a
// negative CEPHFS_* error.
int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
                 frag_info_t *dirstat, int mask)
{
  // Guard against use while unmounted/unmounting.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  tout(cct) << "stat" << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms, true, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
8261
8262unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
8263{
8264 unsigned mask = 0;
8265
2a845540
TL
8266 /* The AT_STATX_FORCE_SYNC is always in higher priority than AT_STATX_DONT_SYNC. */
8267 if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_DONT_SYNC)
7c673cae
FG
8268 goto out;
8269
2a845540 8270 /* Always set PIN to distinguish from AT_STATX_DONT_SYNC case */
7c673cae
FG
8271 mask |= CEPH_CAP_PIN;
8272 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
8273 mask |= CEPH_CAP_AUTH_SHARED;
8274 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
8275 mask |= CEPH_CAP_LINK_SHARED;
adb31ebb 8276 if (want & (CEPH_STATX_NLINK|CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
7c673cae
FG
8277 mask |= CEPH_CAP_FILE_SHARED;
8278 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
8279 mask |= CEPH_CAP_XATTR_SHARED;
8280out:
8281 return mask;
8282}
8283
// statx(2) equivalent by path (relative to the CWD).
// Thin wrapper over statxat() anchored at CEPHFS_AT_FDCWD.
int Client::statx(const char *relpath, struct ceph_statx *stx,
                  const UserPerm& perms,
                  unsigned int want, unsigned int flags)
{
  return statxat(CEPHFS_AT_FDCWD, relpath, stx, perms, want, flags);
}
8290
// lstat(2) equivalent: like stat() but does NOT follow a trailing symlink,
// so the attributes describe the link itself.
// Returns 0-or-positive from the final _getattr/fill_stat path, or a
// negative CEPHFS_* error.
int Client::lstat(const char *relpath, struct stat *stbuf,
                  const UserPerm& perms, frag_info_t *dirstat, int mask)
{
  // Guard against use while unmounted/unmounting.
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  ldout(cct, 3) << __func__ << " enter (relpath " << relpath << " mask " << mask << ")" << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  // don't follow symlinks
  int r = path_walk(path, &in, perms, false, mask);
  if (r < 0)
    return r;
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }
  fill_stat(in, stbuf, dirstat);
  ldout(cct, 3) << __func__ << " exit (relpath " << relpath << " mask " << mask << ")" << dendl;
  return r;
}
8319
// Populate a struct stat from the cached state of 'in'. Also copies the
// directory fragment stats / recursive stats out through the optional
// 'dirstat'/'rstat' pointers. Returns the caps currently issued on the
// inode, so callers can judge how fresh these values are.
int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
                 << " mode 0" << oct << in->mode << dec
                 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
  memset(st, 0, sizeof(struct stat));
  if (use_faked_inos())
    st->st_ino = in->faked_ino;
  else
    st->st_ino = in->ino;
  // The snapid doubles as the device number so snapshots stat distinctly.
  st->st_dev = in->snapid;
  st->st_mode = in->mode;
  st->st_rdev = in->rdev;
  if (in->is_dir()) {
    // Synthesize directory link counts from subdir counts; any other
    // cached nlink value for a dir indicates corrupt state.
    switch (in->nlink) {
      case 0:
        st->st_nlink = 0; /* dir is unlinked */
        break;
      case 1:
        st->st_nlink = 1 /* parent dentry */
                       + 1 /* <dir>/. */
                       + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
        break;
      default:
        ceph_abort();
    }
  } else {
    st->st_nlink = in->nlink;
  }
  st->st_uid = in->uid;
  st->st_gid = in->gid;
  // Report the later of ctime/mtime as ctime.
  if (in->ctime > in->mtime) {
    stat_set_ctime_sec(st, in->ctime.sec());
    stat_set_ctime_nsec(st, in->ctime.nsec());
  } else {
    stat_set_ctime_sec(st, in->mtime.sec());
    stat_set_ctime_nsec(st, in->mtime.nsec());
  }
  stat_set_atime_sec(st, in->atime.sec());
  stat_set_atime_nsec(st, in->atime.nsec());
  stat_set_mtime_sec(st, in->mtime.sec());
  stat_set_mtime_nsec(st, in->mtime.nsec());
  if (in->is_dir()) {
    // Directory size: recursive bytes (configurable), snapshot count for
    // a .snap dir, or the number of entries.
    if (cct->_conf->client_dirsize_rbytes) {
      st->st_size = in->rstat.rbytes;
    } else if (in->snapid == CEPH_SNAPDIR) {
      SnapRealm *realm = get_snap_realm_maybe(in->vino().ino);
      if (realm) {
        st->st_size = realm->my_snaps.size();
        put_snap_realm(realm);
      }
    } else {
      st->st_size = in->dirstat.size();
    }
// The Windows "stat" structure provides just a subset of the fields that are
// available on Linux.
#ifndef _WIN32
    st->st_blocks = 1;
#endif
  } else {
    st->st_size = in->size;
#ifndef _WIN32
    st->st_blocks = (in->size + 511) >> 9;
#endif
  }
#ifndef _WIN32
  st->st_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);
#endif

  if (dirstat)
    *dirstat = in->dirstat;
  if (rstat)
    *rstat = in->rstat;

  return in->caps_issued();
}
8396
// Populate a ceph_statx from the cached state of 'in'. 'mask' is the cap
// mask that was actually satisfied (see statx_to_mask()); only fields
// whose guarding caps are present get filled, and stx_mask records which
// statx fields are valid. A zero mask (AT_STATX_DONT_SYNC) reports every
// cached field.
void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
{
  ldout(cct, 10) << __func__ << " on " << in->ino << " snap/dev" << in->snapid
                 << " mode 0" << oct << in->mode << dec
                 << " mtime " << in->mtime << " ctime " << in->ctime << " change_attr " << in->change_attr << dendl;
  memset(stx, 0, sizeof(struct ceph_statx));

  /*
   * If mask is 0, then the caller set AT_STATX_DONT_SYNC. Reset the mask
   * so that all bits are set.
   */
  if (!mask)
    mask = ~0;

  /* These are always considered to be available */
  stx->stx_dev = in->snapid;
  stx->stx_blksize = std::max<uint32_t>(in->layout.stripe_unit, 4096);

  /* Type bits are always set, even when CEPH_STATX_MODE is not */
  stx->stx_mode = S_IFMT & in->mode;
  stx->stx_ino = use_faked_inos() ? in->faked_ino : (uint64_t)in->ino;
  stx->stx_rdev = in->rdev;
  stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);

  if (mask & CEPH_CAP_AUTH_SHARED) {
    stx->stx_uid = in->uid;
    stx->stx_gid = in->gid;
    stx->stx_mode = in->mode;
    in->btime.to_timespec(&stx->stx_btime);
    stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
  }

  if (mask & CEPH_CAP_LINK_SHARED) {
    // Same synthesized directory link count as fill_stat().
    if (in->is_dir()) {
      switch (in->nlink) {
        case 0:
          stx->stx_nlink = 0; /* dir is unlinked */
          break;
        case 1:
          stx->stx_nlink = 1 /* parent dentry */
                           + 1 /* <dir>/. */
                           + in->dirstat.nsubdirs; /* include <dir>/. self-reference */
          break;
        default:
          ceph_abort();
      }
    } else {
      stx->stx_nlink = in->nlink;
    }
    stx->stx_mask |= CEPH_STATX_NLINK;
  }

  if (mask & CEPH_CAP_FILE_SHARED) {

    in->atime.to_timespec(&stx->stx_atime);
    in->mtime.to_timespec(&stx->stx_mtime);

    if (in->is_dir()) {
      // Directory size: recursive bytes (configurable), snapshot count
      // for a .snap dir, or the number of entries.
      if (cct->_conf->client_dirsize_rbytes) {
        stx->stx_size = in->rstat.rbytes;
      } else if (in->snapid == CEPH_SNAPDIR) {
        SnapRealm *realm = get_snap_realm_maybe(in->vino().ino);
        if (realm) {
          stx->stx_size = realm->my_snaps.size();
          put_snap_realm(realm);
        }
      } else {
        stx->stx_size = in->dirstat.size();
      }
      stx->stx_blocks = 1;
    } else {
      stx->stx_size = in->size;
      stx->stx_blocks = (in->size + 511) >> 9;
    }
    stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
                      CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
  }

  /* Change time and change_attr both require all shared caps to view */
  if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
    stx->stx_version = in->change_attr;
    if (in->ctime > in->mtime)
      in->ctime.to_timespec(&stx->stx_ctime);
    else
      in->mtime.to_timespec(&stx->stx_ctime);
    stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
  }

}
8486
// Mark dentry @dn as recently used in the client's dentry LRU.
void Client::touch_dn(Dentry *dn)
{
  lru.lru_touch(dn);
}
8491
// chmod(2) equivalent: resolve @relpath relative to the current working
// directory (following symlinks) and set its permission bits.
int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  return chmodat(CEPHFS_AT_FDCWD, relpath, mode, 0, perms);
}
8496
// fchmod(2) equivalent: set permission bits on an already-open file handle.
// Returns -CEPHFS_ENOTCONN if not mounted, -CEPHFS_EBADF for an unknown fd
// or an O_PATH handle (which does not permit metadata changes).
int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << mode << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
}
8519
b3b6e05e
TL
// fchmodat(2) equivalent: resolve @relpath relative to the directory open
// at @dirfd (or the CWD for CEPHFS_AT_FDCWD) and set its permission bits.
// AT_SYMLINK_NOFOLLOW in @flags makes the final path component not be
// dereferenced if it is a symlink.
int Client::chmodat(int dirfd, const char *relpath, mode_t mode, int flags,
                    const UserPerm& perms) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << flags << std::endl;

  filepath path(relpath);
  InodeRef in;
  InodeRef dirinode;

  std::scoped_lock lock(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
  if (r < 0) {
    return r;
  }
  struct stat attr;
  attr.st_mode = mode;
  return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
}
8551
b3b6e05e
TL
// lchmod(2) equivalent: like chmod() but does not follow a trailing symlink.
int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
{
  return chmodat(CEPHFS_AT_FDCWD, relpath, mode, AT_SYMLINK_NOFOLLOW, perms);
}
8556
7c673cae
FG
// chown(2) equivalent: resolve @relpath relative to the CWD (following
// symlinks) and change its owner and/or group.
int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
		  const UserPerm& perms)
{
  return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, 0, perms);
}
8562
// fchown(2) equivalent: change owner/group on an open file handle.
// Following the POSIX convention, a (uid_t)-1 / (gid_t)-1 argument means
// "leave that id unchanged"; the setattr mask is built accordingly.
int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << new_uid << std::endl;
  tout(cct) << new_gid << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  // O_PATH handles do not permit metadata changes
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  struct stat attr;
  attr.st_uid = new_uid;
  attr.st_gid = new_gid;
  int mask = 0;
  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
  return _setattr(f->inode, &attr, mask, perms);
}
8590
// lchown(2) equivalent: like chown() but does not follow a trailing symlink.
int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
		   const UserPerm& perms)
{
  return chownat(CEPHFS_AT_FDCWD, relpath, new_uid, new_gid, AT_SYMLINK_NOFOLLOW, perms);
}
8596
8597int Client::chownat(int dirfd, const char *relpath, uid_t new_uid, gid_t new_gid,
8598 int flags, const UserPerm& perms) {
f67539c2 8599 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
b3b6e05e 8600 if (!mref_reader.is_state_satisfied()) {
f67539c2 8601 return -CEPHFS_ENOTCONN;
b3b6e05e 8602 }
f67539c2 8603
11fdf7f2 8604 tout(cct) << __func__ << std::endl;
b3b6e05e 8605 tout(cct) << dirfd << std::endl;
7c673cae
FG
8606 tout(cct) << relpath << std::endl;
8607 tout(cct) << new_uid << std::endl;
8608 tout(cct) << new_gid << std::endl;
b3b6e05e 8609 tout(cct) << flags << std::endl;
181888fb 8610
7c673cae
FG
8611 filepath path(relpath);
8612 InodeRef in;
b3b6e05e 8613 InodeRef dirinode;
f67539c2
TL
8614
8615 std::scoped_lock lock(client_lock);
b3b6e05e
TL
8616 int r = get_fd_inode(dirfd, &dirinode);
8617 if (r < 0) {
7c673cae 8618 return r;
b3b6e05e
TL
8619 }
8620
8621 r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
8622 if (r < 0) {
8623 return r;
8624 }
7c673cae
FG
8625 struct stat attr;
8626 attr.st_uid = new_uid;
8627 attr.st_gid = new_gid;
b3b6e05e 8628 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7c673cae
FG
8629}
8630
11fdf7f2
TL
// Copy @atime/@mtime into the platform-appropriate atime/mtime fields of
// *attr via the stat_set_* accessor helpers.
static void attr_set_atime_and_mtime(struct stat *attr,
                                     const utime_t &atime,
                                     const utime_t &mtime)
{
  stat_set_atime_sec(attr, atime.tv.tv_sec);
  stat_set_atime_nsec(attr, atime.tv.tv_nsec);
  stat_set_mtime_sec(attr, mtime.tv.tv_sec);
  stat_set_mtime_nsec(attr, mtime.tv.tv_nsec);
}
8640
// For [l]utime(), invoke the timeval variant, as the timespec
// variants are not yet implemented. For futime[s](), invoke
// the timespec variant.
7c673cae
FG
8644int Client::utime(const char *relpath, struct utimbuf *buf,
8645 const UserPerm& perms)
8646{
11fdf7f2
TL
8647 struct timeval tv[2];
8648 tv[0].tv_sec = buf->actime;
8649 tv[0].tv_usec = 0;
8650 tv[1].tv_sec = buf->modtime;
8651 tv[1].tv_usec = 0;
8652
8653 return utimes(relpath, tv, perms);
8654}
8655
8656int Client::lutime(const char *relpath, struct utimbuf *buf,
8657 const UserPerm& perms)
8658{
8659 struct timeval tv[2];
8660 tv[0].tv_sec = buf->actime;
8661 tv[0].tv_usec = 0;
8662 tv[1].tv_sec = buf->modtime;
8663 tv[1].tv_usec = 0;
8664
8665 return lutimes(relpath, tv, perms);
8666}
8667
8668int Client::futime(int fd, struct utimbuf *buf, const UserPerm& perms)
8669{
8670 struct timespec ts[2];
8671 ts[0].tv_sec = buf->actime;
8672 ts[0].tv_nsec = 0;
8673 ts[1].tv_sec = buf->modtime;
8674 ts[1].tv_nsec = 0;
8675
8676 return futimens(fd, ts, perms);
8677}
8678
// utimes(2) equivalent: set atime/mtime (microsecond resolution) on
// @relpath, following a trailing symlink.
int Client::utimes(const char *relpath, struct timeval times[2],
                   const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
	    << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  struct ceph_statx attr;
  utime_t(times[0]).to_timespec(&attr.stx_atime);
  utime_t(times[1]).to_timespec(&attr.stx_mtime);

  return _setattrx(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8706
11fdf7f2
TL
// lutimes(2) equivalent: like utimes() but does not follow a trailing
// symlink (path_walk is called with followsym=false).
int Client::lutimes(const char *relpath, struct timeval times[2],
                    const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_usec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_usec
	    << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  struct ceph_statx attr;
  utime_t(times[0]).to_timespec(&attr.stx_atime);
  utime_t(times[1]).to_timespec(&attr.stx_mtime);

  return _setattrx(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8734
11fdf7f2
TL
8735int Client::futimes(int fd, struct timeval times[2], const UserPerm& perms)
8736{
8737 struct timespec ts[2];
8738 ts[0].tv_sec = times[0].tv_sec;
8739 ts[0].tv_nsec = times[0].tv_usec * 1000;
8740 ts[1].tv_sec = times[1].tv_sec;
8741 ts[1].tv_nsec = times[1].tv_usec * 1000;
8742
8743 return futimens(fd, ts, perms);
8744}
8745
// futimens(2) equivalent: set atime/mtime (nanosecond resolution) on an
// open file handle. Returns -CEPHFS_EBADF for an unknown fd or an O_PATH
// handle.
int Client::futimens(int fd, struct timespec times[2], const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
	    << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  struct ceph_statx attr;
  utime_t(times[0]).to_timespec(&attr.stx_atime);
  utime_t(times[1]).to_timespec(&attr.stx_mtime);

  return _setattrx(f->inode, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8773
b3b6e05e
TL
// utimensat(2) equivalent: set atime/mtime (nanosecond resolution) on
// @relpath resolved relative to @dirfd (or the CWD for CEPHFS_AT_FDCWD).
// AT_SYMLINK_NOFOLLOW in @flags prevents dereferencing a trailing symlink.
int Client::utimensat(int dirfd, const char *relpath, struct timespec times[2], int flags,
                      const UserPerm& perms) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;
  tout(cct) << "atime: " << times[0].tv_sec << "." << times[0].tv_nsec
	    << std::endl;
  tout(cct) << "mtime: " << times[1].tv_sec << "." << times[1].tv_nsec
	    << std::endl;
  tout(cct) << flags << std::endl;

  filepath path(relpath);
  InodeRef in;
  InodeRef dirinode;

  std::scoped_lock lock(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

#if defined(__linux__) && defined(O_PATH)
  // NOTE(review): this tests the AT_* @flags argument against the open(2)
  // flag O_PATH, which lives in a different flag namespace — for callers
  // passing only AT_* flags this check looks unreachable; confirm the
  // intended semantics against the libcephfs API.
  if (flags & O_PATH) {
    return -CEPHFS_EBADF;
  }
#endif

  r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), 0, dirinode);
  if (r < 0) {
    return r;
  }
  struct ceph_statx attr;
  utime_t(times[0]).to_timespec(&attr.stx_atime);
  utime_t(times[1]).to_timespec(&attr.stx_mtime);

  return _setattrx(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
}
8816
7c673cae
FG
// flock(2) equivalent: apply or remove an advisory lock on an open file
// handle. @owner identifies the lock owner; the real work happens in
// _flock().
int Client::flock(int fd, int operation, uint64_t owner)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << operation << std::endl;
  tout(cct) << owner << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;

  return _flock(f, operation, owner);
}
8835
// opendir(3) equivalent: resolve @relpath (following symlinks), optionally
// enforce open-for-read permission, and allocate a dir_result_t into
// *dirpp on success.
int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  if (cct->_conf->client_permissions) {
    int r = may_open(in.get(), O_RDONLY, perms);
    if (r < 0)
      return r;
  }
  r = _opendir(in.get(), dirpp, perms);
  /* if ENOTDIR, *dirpp is an uninitialized pointer and it's very dangerous to access its value */
  if (r != -CEPHFS_ENOTDIR)
    tout(cct) << (uintptr_t)*dirpp << std::endl;
  return r;
}
8863
b3b6e05e
TL
// fdopendir(3) equivalent: open a directory stream on the inode behind an
// already-open @dirfd, optionally enforcing open-for-read permission.
int Client::fdopendir(int dirfd, dir_result_t **dirpp, const UserPerm &perms) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << std::endl;
  tout(cct) << dirfd << std::endl;

  InodeRef dirinode;
  std::scoped_lock locker(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  if (cct->_conf->client_permissions) {
    r = may_open(dirinode.get(), O_RDONLY, perms);
    if (r < 0) {
      return r;
    }
  }
  r = _opendir(dirinode.get(), dirpp, perms);
  /* if ENOTDIR, *dirpp is an uninitialized pointer and it's very dangerous to access its value */
  if (r != -CEPHFS_ENOTDIR) {
    tout(cct) << (uintptr_t)*dirpp << std::endl;
  }
  return r;
}
8893
7c673cae
FG
// Allocate a dir_result_t for directory inode @in, register it in
// opened_dirs, and hand it back via *dirpp. Fails with -CEPHFS_ENOTDIR
// (leaving *dirpp untouched) if @in is not a directory.
int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
{
  if (!in->is_dir())
    return -CEPHFS_ENOTDIR;
  *dirpp = new dir_result_t(in, perms);
  opened_dirs.insert(*dirpp);
  ldout(cct, 8) << __func__ << "(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
  return 0;
}
8903
8904
// closedir(3) equivalent: tear down a directory stream previously returned
// by opendir()/fdopendir(). Always returns 0.
int Client::closedir(dir_result_t *dir)
{
  tout(cct) << __func__ << std::endl;
  tout(cct) << (uintptr_t)dir << std::endl;

  ldout(cct, 3) << __func__ << "(" << dir << ") = 0" << dendl;
  std::scoped_lock lock(client_lock);
  _closedir(dir);
  return 0;
}
8915
// Internal teardown for a directory stream: drop the inode reference,
// release any buffered readdir entries, deregister from opened_dirs, and
// free @dirp. Caller must hold client_lock.
void Client::_closedir(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << "(" << dirp << ")" << dendl;

  if (dirp->inode) {
    ldout(cct, 10) << __func__ << " detaching inode " << dirp->inode << dendl;
    dirp->inode.reset();
  }
  _readdir_drop_dirp_buffer(dirp);
  opened_dirs.erase(dirp);
  delete dirp;
}
8928
// rewinddir(3) equivalent: discard any buffered entries and reset the
// stream position to the beginning. Silently does nothing if not mounted.
void Client::rewinddir(dir_result_t *dirp)
{
  ldout(cct, 3) << __func__ << "(" << dirp << ")" << dendl;

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  std::scoped_lock lock(client_lock);
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  _readdir_drop_dirp_buffer(d);
  d->reset();
}
8942
// telldir(3) equivalent: report the current stream offset.
// NOTE(review): unlike the sibling dir operations, this reads d->offset
// without taking client_lock or checking the mount state — presumably a
// racy-but-benign read; confirm before relying on it.
loff_t Client::telldir(dir_result_t *dirp)
{
  dir_result_t *d = static_cast<dir_result_t*>(dirp);
  ldout(cct, 3) << __func__ << "(" << dirp << ") = " << d->offset << dendl;
  return d->offset;
}
8949
// seekdir(3) equivalent: reposition the directory stream to @offset
// (a value previously returned by telldir()). Depending on the seek
// direction and the fragment encoding of the offset, the buffered
// entries may need to be dropped and the stream reset.
void Client::seekdir(dir_result_t *dirp, loff_t offset)
{
  ldout(cct, 3) << __func__ << "(" << dirp << ", " << offset << ")" << dendl;

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return;

  std::scoped_lock lock(client_lock);

  if (offset == dirp->offset)
    return;

  if (offset > dirp->offset)
    dirp->release_count = 0;   // bump if we do a forward seek
  else
    dirp->ordered_count = 0;   // disable filling readdir cache

  if (dirp->hash_order()) {
    // hash-ordered streams can only seek forward without restarting
    if (dirp->offset > offset) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  } else {
    // restart when rewinding to 0, crossing into a different fragment,
    // or moving backwards within the buffered fragment
    if (offset == 0 ||
	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
      _readdir_drop_dirp_buffer(dirp);
      dirp->reset();
    }
  }

  dirp->offset = offset;
}
8984
8985
8986//struct dirent {
8987// ino_t d_ino; /* inode number */
8988// off_t d_off; /* offset to the next dirent */
8989// unsigned short d_reclen; /* length of this record */
8990// unsigned char d_type; /* type of file */
8991// char d_name[256]; /* filename */
8992//};
// Fill a struct dirent for entry @name of file type @type (S_IF* bits)
// and inode @ino; @next_off is the offset of the following entry.
// The name is truncated to 255 characters and always NUL-terminated.
void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
{
  strncpy(de->d_name, name, 255);
  de->d_name[255] = '\0';
#if !defined(__CYGWIN__) && !(defined(_WIN32))
  de->d_ino = ino;
#if !defined(__APPLE__) && !defined(__FreeBSD__)
  de->d_off = next_off;
#endif
  // d_reclen is set to a dummy value of 1, not the real record length —
  // NOTE(review): callers apparently don't consume it; confirm.
  de->d_reclen = 1;
  de->d_type = IFTODT(type);
  ldout(cct, 10) << __func__ << " '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
	   << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
#endif
}
9008
// Advance @dirp to the next directory fragment, or mark the stream at end
// if the current fragment is the rightmost one.
void Client::_readdir_next_frag(dir_result_t *dirp)
{
  frag_t fg = dirp->buffer_frag;

  if (fg.is_rightmost()) {
    ldout(cct, 10) << __func__ << " advance from " << fg << " to END" << dendl;
    dirp->set_end();
    return;
  }

  // advance
  fg = fg.next();
  ldout(cct, 10) << __func__ << " advance from " << dirp->buffer_frag << " to " << fg << dendl;

  if (dirp->hash_order()) {
    // keep last_name; only move the offset forward, never backward
    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
    if (dirp->offset < new_offset) // don't decrease offset
      dirp->offset = new_offset;
  } else {
    dirp->last_name.clear();
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    _readdir_rechoose_frag(dirp);
  }
}
9034
// Re-map the stream's current fragment through the inode's dirfragtree;
// if the tree says the fragment has been split/merged, restart reading at
// the mapped fragment. No-op for hash-ordered streams.
void Client::_readdir_rechoose_frag(dir_result_t *dirp)
{
  ceph_assert(dirp->inode);

  if (dirp->hash_order())
    return;

  frag_t cur = frag_t(dirp->offset_high());
  frag_t fg = dirp->inode->dirfragtree[cur.value()];
  if (fg != cur) {
    ldout(cct, 10) << __func__ << " frag " << cur << " maps to " << fg << dendl;
    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
    dirp->last_name.clear();
    dirp->next_offset = 2;
  }
}
9051
// Discard all buffered (fetched-but-unconsumed) entries on @dirp.
void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
{
  ldout(cct, 10) << __func__ << " " << dirp << dendl;
  dirp->buffer.clear();
}
9057
// Fetch one fragment's worth of directory entries from the MDS into
// dirp->buffer (CEPH_MDS_OP_READDIR, or LSSNAP for the .snap
// pseudo-directory). On -CEPHFS_EAGAIN the fragment is rechosen and the
// fetch retried; any other error marks the stream at end.
// NOTE(review): the EAGAIN retry is recursive with no visible depth bound
// from here — presumably bounded by dirfragtree convergence; confirm.
int Client::_readdir_get_frag(dir_result_t *dirp)
{
  ceph_assert(dirp);
  ceph_assert(dirp->inode);

  // get the current frag.
  frag_t fg;
  if (dirp->hash_order())
    fg = dirp->inode->dirfragtree[dirp->offset_high()];
  else
    fg = frag_t(dirp->offset_high());

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " fg " << fg
		 << " offset " << hex << dirp->offset << dec << dendl;

  int op = CEPH_MDS_OP_READDIR;
  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
    op = CEPH_MDS_OP_LSSNAP;

  InodeRef& diri = dirp->inode;

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  diri->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(diri.get());
  req->head.args.readdir.frag = fg;
  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
  if (dirp->last_name.length()) {
    // continue this fragment after the last entry already returned
    req->path2.set_path(dirp->last_name);
  } else if (dirp->hash_order()) {
    req->head.args.readdir.offset_hash = dirp->offset_high();
  }
  req->dirp = dirp;

  bufferlist dirbl;
  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);

  if (res == -CEPHFS_EAGAIN) {
    ldout(cct, 10) << __func__ << " got EAGAIN, retrying" << dendl;
    _readdir_rechoose_frag(dirp);
    return _readdir_get_frag(dirp);
  }

  if (res == 0) {
    ldout(cct, 10) << __func__ << " " << dirp << " got frag " << dirp->buffer_frag
		   << " size " << dirp->buffer.size() << dendl;
  } else {
    ldout(cct, 10) << __func__ << " got error " << res << ", setting end flag" << dendl;
    dirp->set_end();
  }

  return res;
}
9112
// Comparator for std::lower_bound over a readdir cache: orders cached
// dentries by their fragment-aware directory offset.
struct dentry_off_lt {
  bool operator()(const Dentry* dn, int64_t off) const {
    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
  }
};
9118
/*
 * Serve readdir entries out of the client's cached, complete-and-ordered
 * Dir, invoking @cb once per entry (with client_lock dropped around the
 * callback). Returns 0 at end of directory, a positive value if the
 * callback asked to stop, a negative error, or -CEPHFS_EAGAIN if the
 * cache became unusable mid-walk (caller falls back to fetching from the
 * MDS).
 */
int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
			      int caps, bool getref)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino
	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
	   << dendl;
  Dir *dir = dirp->inode->dir;

  if (!dir) {
    ldout(cct, 10) << " dir is empty" << dendl;
    dirp->set_end();
    return 0;
  }

  // resume at the first cached dentry at/after the stream offset
  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
						  dir->readdir_cache.end(),
						  dirp->offset, dentry_off_lt());

  string dn_name;
  while (true) {
    int mask = caps;
    // the cache is only trustworthy while the dir stays complete+ordered
    if (!dirp->inode->is_complete_and_ordered())
      return -CEPHFS_EAGAIN;
    if (pd == dir->readdir_cache.end())
      break;
    Dentry *dn = *pd;
    if (dn->inode == NULL) {
      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }
    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
      ++pd;
      continue;
    }

    int idx = pd - dir->readdir_cache.begin();
    if (dn->inode->is_dir()) {
      mask |= CEPH_STAT_RSTAT;
    }
    int r = _getattr(dn->inode, mask, dirp->perms);
    if (r < 0)
      return r;

    // the content of readdir_cache may change after _getattr(), so pd may be an invalid iterator
    pd = dir->readdir_cache.begin() + idx;
    if (pd >= dir->readdir_cache.end() || *pd != dn)
      return -CEPHFS_EAGAIN;

    struct ceph_statx stx;
    struct dirent de;
    fill_statx(dn->inode, caps, &stx);

    uint64_t next_off = dn->offset + 1;
    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
    ++pd;
    if (pd == dir->readdir_cache.end())
      next_off = dir_result_t::END;

    Inode *in = NULL;
    if (getref) {
      // hand the callback a pinned inode reference
      in = dn->inode.get();
      _ll_get(in);
    }

    dn_name = dn->name; // fill in name while we have lock

    client_lock.unlock();
    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
    client_lock.lock();
    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
		   << " = " << r << dendl;
    if (r < 0) {
      return r;
    }

    dirp->offset = next_off;
    if (dirp->at_end())
      dirp->next_offset = 2;
    else
      dirp->next_offset = dirp->offset_low();
    dirp->last_name = dn_name; // we successfully returned this one; update!
    dirp->release_count = 0; // last_name no longer matches cache index
    if (r > 0)
      return r;
  }

  ldout(cct, 10) << __func__ << " " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
  dirp->set_end();
  return 0;
}
9212
/*
 * Core readdir driver: iterate directory @d, invoking @cb for each entry
 * (client_lock is dropped around every callback invocation). Synthesizes
 * "." and ".." first, then serves from the dentry cache when the directory
 * is complete+ordered and the needed caps are held, otherwise fetches
 * fragments from the MDS.
 *
 * Returns 0 at end of directory, the callback's positive value if it asked
 * to stop, or a negative error.
 */
int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
			 unsigned want, unsigned flags, bool getref)
{
  int caps = statx_to_mask(flags, want);

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::unique_lock cl(client_lock);

  dir_result_t *dirp = static_cast<dir_result_t*>(d);

  ldout(cct, 10) << __func__ << " " << *dirp->inode << " offset " << hex << dirp->offset
		 << dec << " at_end=" << dirp->at_end()
		 << " hash_order=" << dirp->hash_order() << dendl;

  struct dirent de;
  struct ceph_statx stx;
  memset(&de, 0, sizeof(de));
  memset(&stx, 0, sizeof(stx));

  InodeRef& diri = dirp->inode;

  if (dirp->at_end())
    return 0;

  // offset 0: synthesize "."
  if (dirp->offset == 0) {
    ldout(cct, 15) << " including ." << dendl;
    ceph_assert(diri->dentries.size() < 2); // can't have multiple hard-links to a dir
    uint64_t next_off = 1;

    int r;
    r = _getattr(diri, caps | CEPH_STAT_RSTAT, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(diri, caps, &stx);
    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = diri.get();
      _ll_get(inode);
    }

    cl.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    cl.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }
  // offset 1: synthesize ".." (the dir itself when it has no parent dentry)
  if (dirp->offset == 1) {
    ldout(cct, 15) << " including .." << dendl;
    uint64_t next_off = 2;
    InodeRef in;
    if (diri->dentries.empty())
      in = diri;
    else
      in = diri->get_first_parent()->dir->parent_inode;

    int r;
    r = _getattr(in, caps | CEPH_STAT_RSTAT, dirp->perms);
    if (r < 0)
      return r;

    fill_statx(in, caps, &stx);
    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);

    Inode *inode = NULL;
    if (getref) {
      inode = in.get();
      _ll_get(inode);
    }

    cl.unlock();
    r = cb(p, &de, &stx, next_off, inode);
    cl.lock();
    if (r < 0)
      return r;

    dirp->offset = next_off;
    if (r > 0)
      return r;
  }

  // can we read from our cache?
  ldout(cct, 10) << "offset " << hex << dirp->offset << dec
	   << " snapid " << dirp->inode->snapid << " (complete && ordered) "
	   << dirp->inode->is_complete_and_ordered()
	   << " issued " << ccap_string(dirp->inode->caps_issued())
	   << dendl;
  if (dirp->inode->snapid != CEPH_SNAPDIR &&
      dirp->inode->is_complete_and_ordered() &&
      dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED, true)) {
    int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
    if (err != -CEPHFS_EAGAIN)
      return err;
    // EAGAIN: cache went stale mid-walk; fall through to the MDS path
  }

  while (1) {
    if (dirp->at_end())
      return 0;

    bool check_caps = true;
    if (!dirp->is_cached()) {
      int r = _readdir_get_frag(dirp);
      if (r)
	return r;
      // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
      // different than the requested one. (our dirfragtree was outdated)
      check_caps = false;
    }
    frag_t fg = dirp->buffer_frag;

    ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
		   << " offset " << hex << dirp->offset << dendl;

    for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
				    dirp->offset, dir_result_t::dentry_off_lt());
	 it != dirp->buffer.end();
	 ++it) {
      dir_result_t::dentry &entry = *it;

      uint64_t next_off = entry.offset + 1;

      int r;
      if (check_caps) {
	int mask = caps;
	if(entry.inode->is_dir()){
	  mask |= CEPH_STAT_RSTAT;
	}
	r = _getattr(entry.inode, mask, dirp->perms);
	if (r < 0)
	  return r;
      }

      fill_statx(entry.inode, caps, &stx);
      fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);

      Inode *inode = NULL;
      if (getref) {
	inode = entry.inode.get();
	_ll_get(inode);
      }

      cl.unlock();
      r = cb(p, &de, &stx, next_off, inode); // _next_ offset
      cl.lock();

      ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
		     << " = " << r << dendl;
      if (r < 0)
	return r;

      dirp->offset = next_off;
      if (r > 0)
	return r;
    }

    if (dirp->next_offset > 2) {
      ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
      _readdir_drop_dirp_buffer(dirp);
      continue;  // more!
    }

    if (!fg.is_rightmost()) {
      // next frag!
      _readdir_next_frag(dirp);
      continue;
    }

    // whole directory consumed; if nothing changed under us while we were
    // reading, mark the in-memory dir complete (and ordered, if possible)
    if (diri->shared_gen == dirp->start_shared_gen &&
	diri->dir_release_count == dirp->release_count) {
      if (diri->dir_ordered_count == dirp->ordered_count) {
	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
	if (diri->dir) {
	  ceph_assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
	  diri->dir->readdir_cache.resize(dirp->cache_index);
	}
	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
      } else {
	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
	diri->flags |= I_COMPLETE;
      }
    }

    dirp->set_end();
    return 0;
  }
  ceph_abort();
  return 0;
}
9410
9411
9412int Client::readdir_r(dir_result_t *d, struct dirent *de)
9413{
9414 return readdirplus_r(d, de, 0, 0, 0, NULL);
9415}
9416
9417/*
9418 * readdirplus_r
9419 *
9420 * returns
9421 * 1 if we got a dirent
9422 * 0 for end of directory
9423 * <0 on error
9424 */
9425
9426struct single_readdir {
9427 struct dirent *de;
9428 struct ceph_statx *stx;
9429 Inode *inode;
9430 bool full;
9431};
9432
9433static int _readdir_single_dirent_cb(void *p, struct dirent *de,
9434 struct ceph_statx *stx, off_t off,
9435 Inode *in)
9436{
9437 single_readdir *c = static_cast<single_readdir *>(p);
9438
9439 if (c->full)
9440 return -1; // already filled this dirent
9441
9442 *c->de = *de;
9443 if (c->stx)
9444 *c->stx = *stx;
9445 c->inode = in;
9446 c->full = true;
9447 return 1;
9448}
9449
9450struct dirent *Client::readdir(dir_result_t *d)
9451{
9452 int ret;
f91f0fd5 9453 auto& de = d->de;
7c673cae
FG
9454 single_readdir sr;
9455 sr.de = &de;
9456 sr.stx = NULL;
9457 sr.inode = NULL;
9458 sr.full = false;
9459
9460 // our callback fills the dirent and sets sr.full=true on first
9461 // call, and returns -1 the second time around.
9462 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
9463 if (ret < -1) {
9464 errno = -ret; // this sucks.
9465 return (dirent *) NULL;
9466 }
9467 if (sr.full) {
9468 return &de;
9469 }
9470 return (dirent *) NULL;
9471}
9472
9473int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
9474 struct ceph_statx *stx, unsigned want,
9475 unsigned flags, Inode **out)
9476{
9477 single_readdir sr;
9478 sr.de = de;
9479 sr.stx = stx;
9480 sr.inode = NULL;
9481 sr.full = false;
9482
9483 // our callback fills the dirent and sets sr.full=true on first
9484 // call, and returns -1 the second time around.
9485 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
9486 if (r < -1)
9487 return r;
9488 if (out)
9489 *out = sr.inode;
9490 if (sr.full)
9491 return 1;
9492 return 0;
9493}
9494
9495
9496/* getdents */
/*
 * Accumulator for _getdents(): packs entries into a caller-supplied
 * buffer, either as whole dirents or as NUL-terminated names.
 */
struct getdents_result {
  char *buf;     // destination buffer
  int buflen;    // total capacity of buf
  int pos;       // bytes written so far
  bool fullent;  // true: copy whole dirents; false: names only
};
9503
9504static int _readdir_getdent_cb(void *p, struct dirent *de,
9505 struct ceph_statx *stx, off_t off, Inode *in)
9506{
9507 struct getdents_result *c = static_cast<getdents_result *>(p);
9508
9509 int dlen;
9510 if (c->fullent)
9511 dlen = sizeof(*de);
9512 else
9513 dlen = strlen(de->d_name) + 1;
9514
9515 if (c->pos + dlen > c->buflen)
9516 return -1; // doesn't fit
9517
9518 if (c->fullent) {
9519 memcpy(c->buf + c->pos, de, sizeof(*de));
9520 } else {
9521 memcpy(c->buf + c->pos, de->d_name, dlen);
9522 }
9523 c->pos += dlen;
9524 return 0;
9525}
9526
9527int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
9528{
9529 getdents_result gr;
9530 gr.buf = buf;
9531 gr.buflen = buflen;
9532 gr.fullent = fullent;
9533 gr.pos = 0;
9534
9535 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
9536
9537 if (r < 0) { // some error
9538 if (r == -1) { // buffer ran out of space
9539 if (gr.pos) { // but we got some entries already!
9540 return gr.pos;
9541 } // or we need a larger buffer
f67539c2 9542 return -CEPHFS_ERANGE;
7c673cae
FG
9543 } else { // actual error, return it
9544 return r;
9545 }
9546 }
9547 return gr.pos;
9548}
9549
9550
9551/* getdir */
9552struct getdir_result {
9553 list<string> *contents;
9554 int num;
9555};
9556
9557static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
9558{
9559 getdir_result *r = static_cast<getdir_result *>(p);
9560
9561 r->contents->push_back(de->d_name);
9562 r->num++;
9563 return 0;
9564}
9565
9566int Client::getdir(const char *relpath, list<string>& contents,
9567 const UserPerm& perms)
9568{
9569 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
f67539c2
TL
9570 tout(cct) << "getdir" << std::endl;
9571 tout(cct) << relpath << std::endl;
7c673cae
FG
9572
9573 dir_result_t *d;
9574 int r = opendir(relpath, &d, perms);
9575 if (r < 0)
9576 return r;
9577
9578 getdir_result gr;
9579 gr.contents = &contents;
9580 gr.num = 0;
9581 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
9582
9583 closedir(d);
9584
9585 if (r < 0)
9586 return r;
9587 return gr.num;
9588}
9589
9590
9591/****** file i/o **********/
f67539c2 9592
b3b6e05e 9593// common parts for open and openat. call with client_lock locked.
20effc67 9594int Client::create_and_open(int dirfd, const char *relpath, int flags,
b3b6e05e
TL
9595 const UserPerm& perms, mode_t mode, int stripe_unit,
9596 int stripe_count, int object_size, const char *data_pool,
9597 std::string alternate_name) {
9598 ceph_assert(ceph_mutex_is_locked(client_lock));
f91f0fd5 9599 int cflags = ceph_flags_sys2wire(flags);
f91f0fd5 9600 tout(cct) << cflags << std::endl;
7c673cae
FG
9601
9602 Fh *fh = NULL;
9603
9604#if defined(__linux__) && defined(O_PATH)
9605 /* When the O_PATH is being specified, others flags than O_DIRECTORY
9606 * and O_NOFOLLOW are ignored. Please refer do_entry_open() function
9607 * in kernel (fs/open.c). */
9608 if (flags & O_PATH)
9609 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
9610#endif
9611
9612 filepath path(relpath);
9613 InodeRef in;
9614 bool created = false;
9615 /* O_CREATE with O_EXCL enforces O_NOFOLLOW. */
9616 bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
f91f0fd5
TL
9617 int mask = ceph_caps_for_mode(ceph_flags_to_mode(cflags));
9618
b3b6e05e 9619 InodeRef dirinode = nullptr;
20effc67
TL
9620 int r = get_fd_inode(dirfd, &dirinode);
9621 if (r < 0) {
9622 return r;
b3b6e05e 9623 }
7c673cae 9624
20effc67 9625 r = path_walk(path, &in, perms, followsym, mask, dirinode);
7c673cae 9626 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
f67539c2 9627 return -CEPHFS_EEXIST;
7c673cae
FG
9628
9629#if defined(__linux__) && defined(O_PATH)
9630 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
9631#else
b3b6e05e 9632 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
7c673cae 9633#endif
f67539c2 9634 return -CEPHFS_ELOOP;
7c673cae 9635
f67539c2 9636 if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
7c673cae
FG
9637 filepath dirpath = path;
9638 string dname = dirpath.last_dentry();
9639 dirpath.pop_dentry();
9640 InodeRef dir;
9641 r = path_walk(dirpath, &dir, perms, true,
b3b6e05e
TL
9642 cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0, dirinode);
9643 if (r < 0) {
7c673cae 9644 goto out;
b3b6e05e 9645 }
7c673cae
FG
9646 if (cct->_conf->client_permissions) {
9647 r = may_create(dir.get(), perms);
9648 if (r < 0)
b3b6e05e 9649 goto out;
7c673cae
FG
9650 }
9651 r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
f67539c2
TL
9652 stripe_count, object_size, data_pool, &created, perms,
9653 std::move(alternate_name));
7c673cae
FG
9654 }
9655 if (r < 0)
9656 goto out;
9657
9658 if (!created) {
9659 // posix says we can only check permissions of existing files
9660 if (cct->_conf->client_permissions) {
9661 r = may_open(in.get(), flags, perms);
9662 if (r < 0)
b3b6e05e 9663 goto out;
7c673cae
FG
9664 }
9665 }
9666
9667 if (!fh)
9668 r = _open(in.get(), flags, mode, &fh, perms);
9669 if (r >= 0) {
9670 // allocate a integer file descriptor
11fdf7f2 9671 ceph_assert(fh);
7c673cae 9672 r = get_fd();
11fdf7f2 9673 ceph_assert(fd_map.count(r) == 0);
7c673cae
FG
9674 fd_map[r] = fh;
9675 }
9676
9677 out:
b3b6e05e
TL
9678 return r;
9679}
9680
9681int Client::open(const char *relpath, int flags, const UserPerm& perms,
9682 mode_t mode, int stripe_unit, int stripe_count,
9683 int object_size, const char *data_pool, std::string alternate_name)
9684{
9685 return openat(CEPHFS_AT_FDCWD, relpath, flags, perms, mode, stripe_unit,
9686 stripe_count, object_size, data_pool, alternate_name);
9687}
9688
b3b6e05e
TL
9689int Client::openat(int dirfd, const char *relpath, int flags, const UserPerm& perms,
9690 mode_t mode, int stripe_unit, int stripe_count, int object_size,
9691 const char *data_pool, std::string alternate_name) {
9692 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9693 if (!mref_reader.is_state_satisfied()) {
9694 return -CEPHFS_ENOTCONN;
9695 }
9696
9697 ldout(cct, 3) << "openat enter(" << relpath << ")" << dendl;
9698 tout(cct) << dirfd << std::endl;
9699 tout(cct) << relpath << std::endl;
9700 tout(cct) << flags << std::endl;
9701 tout(cct) << mode << std::endl;
9702
9703 std::scoped_lock locker(client_lock);
9704 int r = create_and_open(dirfd, relpath, flags, perms, mode, stripe_unit, stripe_count,
9705 object_size, data_pool, alternate_name);
9706
7c673cae 9707 tout(cct) << r << std::endl;
b3b6e05e 9708 ldout(cct, 3) << "openat exit(" << relpath << ")" << dendl;
7c673cae
FG
9709 return r;
9710}
9711
7c673cae
FG
9712int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
9713 const UserPerm& perms)
9714{
11fdf7f2 9715 ldout(cct, 3) << __func__ << " enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
7c673cae 9716
f67539c2
TL
9717 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9718 if (!mref_reader.is_state_satisfied())
9719 return -CEPHFS_ENOTCONN;
181888fb 9720
f67539c2 9721 std::scoped_lock lock(client_lock);
7c673cae
FG
9722 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
9723 filepath path(ino);
9724 req->set_filepath(path);
9725
9726 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
9727 char f[30];
9728 sprintf(f, "%u", h);
9729 filepath path2(dirino);
9730 path2.push_dentry(string(f));
9731 req->set_filepath2(path2);
9732
9733 int r = make_request(req, perms, NULL, NULL,
9734 rand() % mdsmap->get_num_in_mds());
11fdf7f2 9735 ldout(cct, 3) << __func__ << " exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
7c673cae
FG
9736 return r;
9737}
9738
9739
/**
 * Load an inode into the local cache.
 *
 * If the inode pointer is non-NULL, also take a reference on the
 * resulting Inode object as part of the same operation, so that the
 * caller can safely assume the inode is still present after return.
 */
f67539c2 9747int Client::_lookup_vino(vinodeno_t vino, const UserPerm& perms, Inode **inode)
7c673cae 9748{
f67539c2 9749 ldout(cct, 8) << __func__ << " enter(" << vino << ")" << dendl;
7c673cae 9750
f67539c2
TL
9751 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9752 if (!mref_reader.is_state_satisfied())
9753 return -CEPHFS_ENOTCONN;
181888fb 9754
b3b6e05e
TL
9755 if (is_reserved_vino(vino))
9756 return -CEPHFS_ESTALE;
9757
7c673cae 9758 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
f67539c2 9759 filepath path(vino.ino);
7c673cae
FG
9760 req->set_filepath(path);
9761
f67539c2
TL
9762 /*
9763 * The MDS expects either a "real" snapid here or 0. The special value
9764 * carveouts for the snapid are all at the end of the range so we can
9765 * just look for any snapid below this value.
9766 */
9767 if (vino.snapid < CEPH_NOSNAP)
9768 req->head.args.lookupino.snapid = vino.snapid;
9769
7c673cae
FG
9770 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
9771 if (r == 0 && inode != NULL) {
7c673cae 9772 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
11fdf7f2 9773 ceph_assert(p != inode_map.end());
7c673cae
FG
9774 *inode = p->second;
9775 _ll_get(*inode);
9776 }
f67539c2 9777 ldout(cct, 8) << __func__ << " exit(" << vino << ") = " << r << dendl;
7c673cae
FG
9778 return r;
9779}
9780
1adf2230
AA
9781int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
9782{
f67539c2
TL
9783 vinodeno_t vino(ino, CEPH_NOSNAP);
9784 std::scoped_lock lock(client_lock);
9785 return _lookup_vino(vino, perms, inode);
1adf2230 9786}
7c673cae
FG
9787
9788/**
9789 * Find the parent inode of `ino` and insert it into
9790 * our cache. Conditionally also set `parent` to a referenced
9791 * Inode* if caller provides non-NULL value.
9792 */
1adf2230 9793int Client::_lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
7c673cae 9794{
11fdf7f2 9795 ldout(cct, 8) << __func__ << " enter(" << ino->ino << ")" << dendl;
7c673cae 9796
7c673cae
FG
9797 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
9798 filepath path(ino->ino);
9799 req->set_filepath(path);
9800
9801 InodeRef target;
9802 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
9803 // Give caller a reference to the parent ino if they provided a pointer.
9804 if (parent != NULL) {
9805 if (r == 0) {
9806 *parent = target.get();
9807 _ll_get(*parent);
11fdf7f2 9808 ldout(cct, 8) << __func__ << " found parent " << (*parent)->ino << dendl;
7c673cae
FG
9809 } else {
9810 *parent = NULL;
9811 }
9812 }
11fdf7f2 9813 ldout(cct, 8) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
7c673cae
FG
9814 return r;
9815}
9816
7c673cae
FG
9817/**
9818 * Populate the parent dentry for `ino`, provided it is
9819 * a child of `parent`.
9820 */
1adf2230 9821int Client::_lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
7c673cae 9822{
11fdf7f2
TL
9823 ceph_assert(parent->is_dir());
9824 ldout(cct, 3) << __func__ << " enter(" << ino->ino << ")" << dendl;
7c673cae 9825
f67539c2
TL
9826 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
9827 if (!mref_reader.is_state_satisfied())
9828 return -CEPHFS_ENOTCONN;
181888fb 9829
7c673cae
FG
9830 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
9831 req->set_filepath2(filepath(parent->ino));
9832 req->set_filepath(filepath(ino->ino));
9833 req->set_inode(ino);
9834
9835 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
11fdf7f2 9836 ldout(cct, 3) << __func__ << " exit(" << ino->ino << ") = " << r << dendl;
7c673cae
FG
9837 return r;
9838}
9839
1adf2230
AA
9840int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
9841{
f67539c2 9842 std::scoped_lock lock(client_lock);
1adf2230
AA
9843 return _lookup_name(ino, parent, perms);
9844}
7c673cae 9845
11fdf7f2 9846Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
7c673cae 9847{
11fdf7f2 9848 ceph_assert(in);
f6b5b4d7 9849 Fh *f = new Fh(in, flags, cmode, fd_gen, perms);
7c673cae 9850
11fdf7f2 9851 ldout(cct, 10) << __func__ << " " << in->ino << " mode " << cmode << dendl;
7c673cae
FG
9852
9853 if (in->snapid != CEPH_NOSNAP) {
9854 in->snap_cap_refs++;
9855 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
9856 << ccap_string(in->caps_issued()) << dendl;
9857 }
9858
11fdf7f2 9859 const auto& conf = cct->_conf;
7c673cae
FG
9860 f->readahead.set_trigger_requests(1);
9861 f->readahead.set_min_readahead_size(conf->client_readahead_min);
9862 uint64_t max_readahead = Readahead::NO_LIMIT;
9863 if (conf->client_readahead_max_bytes) {
11fdf7f2 9864 max_readahead = std::min(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
7c673cae
FG
9865 }
9866 if (conf->client_readahead_max_periods) {
11fdf7f2 9867 max_readahead = std::min(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
7c673cae
FG
9868 }
9869 f->readahead.set_max_readahead_size(max_readahead);
9870 vector<uint64_t> alignments;
9871 alignments.push_back(in->layout.get_period());
9872 alignments.push_back(in->layout.stripe_unit);
9873 f->readahead.set_alignments(alignments);
9874
9875 return f;
9876}
9877
9878int Client::_release_fh(Fh *f)
9879{
9880 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
9881 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
9882 Inode *in = f->inode.get();
11fdf7f2 9883 ldout(cct, 8) << __func__ << " " << f << " mode " << f->mode << " on " << *in << dendl;
7c673cae 9884
b32b8144
FG
9885 in->unset_deleg(f);
9886
7c673cae
FG
9887 if (in->snapid == CEPH_NOSNAP) {
9888 if (in->put_open_ref(f->mode)) {
9889 _flush(in, new C_Client_FlushComplete(this, in));
9890 check_caps(in, 0);
9891 }
9892 } else {
11fdf7f2 9893 ceph_assert(in->snap_cap_refs > 0);
7c673cae
FG
9894 in->snap_cap_refs--;
9895 }
9896
9897 _release_filelocks(f);
9898
9899 // Finally, read any async err (i.e. from flushes)
9900 int err = f->take_async_err();
9901 if (err != 0) {
11fdf7f2 9902 ldout(cct, 1) << __func__ << " " << f << " on inode " << *in << " caught async_err = "
7c673cae
FG
9903 << cpp_strerror(err) << dendl;
9904 } else {
11fdf7f2 9905 ldout(cct, 10) << __func__ << " " << f << " on inode " << *in << " no async_err state" << dendl;
7c673cae
FG
9906 }
9907
9908 _put_fh(f);
9909
9910 return err;
9911}
9912
9913void Client::_put_fh(Fh *f)
9914{
9915 int left = f->put();
9916 if (!left) {
9917 delete f;
9918 }
9919}
9920
9921int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
9922 const UserPerm& perms)
9923{
9924 if (in->snapid != CEPH_NOSNAP &&
9925 (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
f67539c2 9926 return -CEPHFS_EROFS;
7c673cae
FG
9927 }
9928
9929 // use normalized flags to generate cmode
11fdf7f2
TL
9930 int cflags = ceph_flags_sys2wire(flags);
9931 if (cct->_conf.get_val<bool>("client_force_lazyio"))
9932 cflags |= CEPH_O_LAZY;
9933
9934 int cmode = ceph_flags_to_mode(cflags);
7c673cae
FG
9935 int want = ceph_caps_for_mode(cmode);
9936 int result = 0;
9937
9938 in->get_open_ref(cmode); // make note of pending open, since it effects _wanted_ caps.
9939
b32b8144 9940 if ((flags & O_TRUNC) == 0 && in->caps_issued_mask(want)) {
7c673cae
FG
9941 // update wanted?
9942 check_caps(in, CHECK_CAPS_NODELAY);
9943 } else {
b32b8144 9944
7c673cae
FG
9945 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
9946 filepath path;
9947 in->make_nosnap_relative_path(path);
9948 req->set_filepath(path);
11fdf7f2 9949 req->head.args.open.flags = cflags & ~CEPH_O_CREAT;
7c673cae
FG
9950 req->head.args.open.mode = mode;
9951 req->head.args.open.pool = -1;
9952 if (cct->_conf->client_debug_getattr_caps)
9953 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
9954 else
9955 req->head.args.open.mask = 0;
9956 req->head.args.open.old_size = in->size; // for O_TRUNC
9957 req->set_inode(in);
9958 result = make_request(req, perms);
b32b8144
FG
9959
9960 /*
9961 * NFS expects that delegations will be broken on a conflicting open,
9962 * not just when there is actual conflicting access to the file. SMB leases
9963 * and oplocks also have similar semantics.
9964 *
9965 * Ensure that clients that have delegations enabled will wait on minimal
9966 * caps during open, just to ensure that other clients holding delegations
9967 * return theirs first.
9968 */
9969 if (deleg_timeout && result == 0) {
9970 int need = 0, have;
9971
9972 if (cmode & CEPH_FILE_MODE_WR)
9973 need |= CEPH_CAP_FILE_WR;
9974 if (cmode & CEPH_FILE_MODE_RD)
9975 need |= CEPH_CAP_FILE_RD;
9976
f6b5b4d7
TL
9977 Fh fh(in, flags, cmode, fd_gen, perms);
9978 result = get_caps(&fh, need, want, &have, -1);
b32b8144 9979 if (result < 0) {
1adf2230 9980 ldout(cct, 8) << "Unable to get caps after open of inode " << *in <<
b32b8144
FG
9981 " . Denying open: " <<
9982 cpp_strerror(result) << dendl;
b32b8144
FG
9983 } else {
9984 put_cap_ref(in, need);
9985 }
9986 }
7c673cae
FG
9987 }
9988
9989 // success?
9990 if (result >= 0) {
9991 if (fhp)
9992 *fhp = _create_fh(in, flags, cmode, perms);
9993 } else {
9994 in->put_open_ref(cmode);
9995 }
9996
9997 trim_cache();
9998
9999 return result;
10000}
10001
10002int Client::_renew_caps(Inode *in)
10003{
10004 int wanted = in->caps_file_wanted();
10005 if (in->is_any_caps() &&
10006 ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
10007 check_caps(in, CHECK_CAPS_NODELAY);
10008 return 0;
10009 }
10010
10011 int flags = 0;
10012 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
10013 flags = O_RDWR;
10014 else if (wanted & CEPH_CAP_FILE_RD)
10015 flags = O_RDONLY;
10016 else if (wanted & CEPH_CAP_FILE_WR)
10017 flags = O_WRONLY;
10018
10019 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
10020 filepath path;
10021 in->make_nosnap_relative_path(path);
10022 req->set_filepath(path);
10023 req->head.args.open.flags = flags;
10024 req->head.args.open.pool = -1;
10025 if (cct->_conf->client_debug_getattr_caps)
10026 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
10027 else
10028 req->head.args.open.mask = 0;
10029 req->set_inode(in);
10030
10031 // duplicate in case Cap goes away; not sure if that race is a concern?
10032 const UserPerm *pperm = in->get_best_perms();
10033 UserPerm perms;
10034 if (pperm != NULL)
10035 perms = *pperm;
10036 int ret = make_request(req, perms);
10037 return ret;
10038}
10039
b3b6e05e 10040int Client::_close(int fd)
7c673cae
FG
10041{
10042 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
7c673cae
FG
10043 tout(cct) << "close" << std::endl;
10044 tout(cct) << fd << std::endl;
10045
10046 Fh *fh = get_filehandle(fd);
10047 if (!fh)
f67539c2 10048 return -CEPHFS_EBADF;
7c673cae
FG
10049 int err = _release_fh(fh);
10050 fd_map.erase(fd);
10051 put_fd(fd);
10052 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
10053 return err;
10054}
10055
b3b6e05e
TL
10056int Client::close(int fd) {
10057 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10058 if (!mref_reader.is_state_satisfied())
10059 return -CEPHFS_ENOTCONN;
10060
10061 std::scoped_lock lock(client_lock);
10062 return _close(fd);
10063}
7c673cae
FG
10064
10065// ------------
10066// read, write
10067
10068loff_t Client::lseek(int fd, loff_t offset, int whence)
10069{
f67539c2
TL
10070 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10071 if (!mref_reader.is_state_satisfied())
10072 return -CEPHFS_ENOTCONN;
10073
7c673cae
FG
10074 tout(cct) << "lseek" << std::endl;
10075 tout(cct) << fd << std::endl;
10076 tout(cct) << offset << std::endl;
10077 tout(cct) << whence << std::endl;
10078
f67539c2 10079 std::scoped_lock lock(client_lock);
7c673cae
FG
10080 Fh *f = get_filehandle(fd);
10081 if (!f)
f67539c2 10082 return -CEPHFS_EBADF;
7c673cae
FG
10083#if defined(__linux__) && defined(O_PATH)
10084 if (f->flags & O_PATH)
f67539c2 10085 return -CEPHFS_EBADF;
7c673cae
FG
10086#endif
10087 return _lseek(f, offset, whence);
10088}
10089
10090loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
10091{
10092 Inode *in = f->inode.get();
9f95a23c 10093 bool whence_check = false;
11fdf7f2 10094 loff_t pos = -1;
7c673cae 10095
9f95a23c
TL
10096 switch (whence) {
10097 case SEEK_END:
10098 whence_check = true;
10099 break;
10100
10101#ifdef SEEK_DATA
10102 case SEEK_DATA:
10103 whence_check = true;
10104 break;
10105#endif
10106
10107#ifdef SEEK_HOLE
10108 case SEEK_HOLE:
10109 whence_check = true;
10110 break;
10111#endif
10112 }
10113
10114 if (whence_check) {
10115 int r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
10116 if (r < 0)
92f5a8d4 10117 return r;
92f5a8d4
TL
10118 }
10119
7c673cae
FG
10120 switch (whence) {
10121 case SEEK_SET:
11fdf7f2 10122 pos = offset;
7c673cae
FG
10123 break;
10124
10125 case SEEK_CUR:
92f5a8d4 10126 pos = f->pos + offset;
7c673cae
FG
10127 break;
10128
10129 case SEEK_END:
11fdf7f2 10130 pos = in->size + offset;
7c673cae
FG
10131 break;
10132
9f95a23c 10133#ifdef SEEK_DATA
92f5a8d4 10134 case SEEK_DATA:
9f95a23c 10135 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
f67539c2 10136 return -CEPHFS_ENXIO;
92f5a8d4
TL
10137 pos = offset;
10138 break;
9f95a23c 10139#endif
92f5a8d4 10140
9f95a23c 10141#ifdef SEEK_HOLE
92f5a8d4 10142 case SEEK_HOLE:
9f95a23c 10143 if (offset < 0 || static_cast<uint64_t>(offset) >= in->size)
f67539c2 10144 return -CEPHFS_ENXIO;
9f95a23c 10145 pos = in->size;
92f5a8d4 10146 break;
9f95a23c 10147#endif
92f5a8d4 10148
7c673cae 10149 default:
92f5a8d4 10150 ldout(cct, 1) << __func__ << ": invalid whence value " << whence << dendl;
f67539c2 10151 return -CEPHFS_EINVAL;
7c673cae
FG
10152 }
10153
11fdf7f2 10154 if (pos < 0) {
f67539c2 10155 return -CEPHFS_EINVAL;
11fdf7f2
TL
10156 } else {
10157 f->pos = pos;
10158 }
10159
1adf2230 10160 ldout(cct, 8) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
7c673cae
FG
10161 return f->pos;
10162}
10163
10164
10165void Client::lock_fh_pos(Fh *f)
10166{
11fdf7f2 10167 ldout(cct, 10) << __func__ << " " << f << dendl;
7c673cae
FG
10168
10169 if (f->pos_locked || !f->pos_waiters.empty()) {
9f95a23c 10170 ceph::condition_variable cond;
7c673cae 10171 f->pos_waiters.push_back(&cond);
11fdf7f2 10172 ldout(cct, 10) << __func__ << " BLOCKING on " << f << dendl;
9f95a23c
TL
10173 std::unique_lock l{client_lock, std::adopt_lock};
10174 cond.wait(l, [f, me=&cond] {
10175 return !f->pos_locked && f->pos_waiters.front() == me;
10176 });
10177 l.release();
11fdf7f2
TL
10178 ldout(cct, 10) << __func__ << " UNBLOCKING on " << f << dendl;
10179 ceph_assert(f->pos_waiters.front() == &cond);
7c673cae
FG
10180 f->pos_waiters.pop_front();
10181 }
10182
10183 f->pos_locked = true;
10184}
10185
10186void Client::unlock_fh_pos(Fh *f)
10187{
f67539c2
TL
10188 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10189
11fdf7f2 10190 ldout(cct, 10) << __func__ << " " << f << dendl;
7c673cae 10191 f->pos_locked = false;
f67539c2
TL
10192 if (!f->pos_waiters.empty()) {
10193 // only wake up the oldest waiter
10194 auto cond = f->pos_waiters.front();
10195 cond->notify_one();
10196 }
7c673cae
FG
10197}
10198
10199int Client::uninline_data(Inode *in, Context *onfinish)
10200{
10201 if (!in->inline_data.length()) {
10202 onfinish->complete(0);
10203 return 0;
10204 }
10205
10206 char oid_buf[32];
10207 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
10208 object_t oid = oid_buf;
10209
10210 ObjectOperation create_ops;
10211 create_ops.create(false);
10212
10213 objecter->mutate(oid,
10214 OSDMap::file_to_object_locator(in->layout),
10215 create_ops,
10216 in->snaprealm->get_snap_context(),
10217 ceph::real_clock::now(),
10218 0,
10219 NULL);
10220
10221 bufferlist inline_version_bl;
11fdf7f2 10222 encode(in->inline_version, inline_version_bl);
7c673cae
FG
10223
10224 ObjectOperation uninline_ops;
10225 uninline_ops.cmpxattr("inline_version",
10226 CEPH_OSD_CMPXATTR_OP_GT,
10227 CEPH_OSD_CMPXATTR_MODE_U64,
10228 inline_version_bl);
10229 bufferlist inline_data = in->inline_data;
10230 uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
10231 uninline_ops.setxattr("inline_version", stringify(in->inline_version));
10232
10233 objecter->mutate(oid,
10234 OSDMap::file_to_object_locator(in->layout),
10235 uninline_ops,
10236 in->snaprealm->get_snap_context(),
10237 ceph::real_clock::now(),
10238 0,
10239 onfinish);
10240
10241 return 0;
10242}
10243
10244//
10245
10246// blocking osd interface
10247
10248int Client::read(int fd, char *buf, loff_t size, loff_t offset)
10249{
f67539c2
TL
10250 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10251 if (!mref_reader.is_state_satisfied())
10252 return -CEPHFS_ENOTCONN;
10253
7c673cae
FG
10254 tout(cct) << "read" << std::endl;
10255 tout(cct) << fd << std::endl;
10256 tout(cct) << size << std::endl;
10257 tout(cct) << offset << std::endl;
10258
f67539c2 10259 std::unique_lock lock(client_lock);
7c673cae
FG
10260 Fh *f = get_filehandle(fd);
10261 if (!f)
f67539c2 10262 return -CEPHFS_EBADF;
7c673cae
FG
10263#if defined(__linux__) && defined(O_PATH)
10264 if (f->flags & O_PATH)
f67539c2 10265 return -CEPHFS_EBADF;
7c673cae
FG
10266#endif
10267 bufferlist bl;
11fdf7f2
TL
10268 /* We can't return bytes written larger than INT_MAX, clamp size to that */
10269 size = std::min(size, (loff_t)INT_MAX);
7c673cae
FG
10270 int r = _read(f, offset, size, &bl);
10271 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
10272 if (r >= 0) {
f6b5b4d7 10273 lock.unlock();
9f95a23c 10274 bl.begin().copy(bl.length(), buf);
7c673cae
FG
10275 r = bl.length();
10276 }
10277 return r;
10278}
10279
10280int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
10281{
10282 if (iovcnt < 0)
f67539c2 10283 return -CEPHFS_EINVAL;
7c673cae
FG
10284 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
10285}
10286
11fdf7f2 10287int64_t Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
7c673cae 10288{
f67539c2
TL
10289 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
10290
11fdf7f2
TL
10291 int want, have = 0;
10292 bool movepos = false;
adb31ebb 10293 int64_t rc = 0;
11fdf7f2 10294 const auto& conf = cct->_conf;
7c673cae 10295 Inode *in = f->inode.get();
11fdf7f2
TL
10296 utime_t lat;
10297 utime_t start = ceph_clock_now();
7c673cae
FG
10298
10299 if ((f->mode & CEPH_FILE_MODE_RD) == 0)
f67539c2 10300 return -CEPHFS_EBADF;
7c673cae
FG
10301 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
10302
7c673cae
FG
10303 if (offset < 0) {
10304 lock_fh_pos(f);
10305 offset = f->pos;
10306 movepos = true;
10307 }
10308 loff_t start_pos = offset;
10309
10310 if (in->inline_version == 0) {
adb31ebb 10311 auto r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
c07f9fc5 10312 if (r < 0) {
adb31ebb 10313 rc = r;
11fdf7f2 10314 goto done;
c07f9fc5 10315 }
11fdf7f2 10316 ceph_assert(in->inline_version > 0);
7c673cae
FG
10317 }
10318
10319retry:
11fdf7f2
TL
10320 if (f->mode & CEPH_FILE_MODE_LAZY)
10321 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
10322 else
10323 want = CEPH_CAP_FILE_CACHE;
adb31ebb
TL
10324 {
10325 auto r = get_caps(f, CEPH_CAP_FILE_RD, want, &have, -1);
10326 if (r < 0) {
10327 rc = r;
10328 goto done;
10329 }
c07f9fc5 10330 }
7c673cae 10331 if (f->flags & O_DIRECT)
11fdf7f2 10332 have &= ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO);
7c673cae
FG
10333
10334 if (in->inline_version < CEPH_INLINE_NONE) {
39ae355f
TL
10335 uint32_t len = in->inline_data.length();
10336 uint64_t endoff = offset + size;
10337 if (endoff > in->size)
10338 endoff = in->size;
10339
10340 if (offset < len) {
10341 if (endoff <= len) {
10342 bl->substr_of(in->inline_data, offset, endoff - offset);
11fdf7f2 10343 } else {
39ae355f
TL
10344 bl->substr_of(in->inline_data, offset, len - offset);
10345 bl->append_zero(endoff - len);
7c673cae 10346 }
39ae355f
TL
10347 rc = endoff - offset;
10348 } else if ((uint64_t)offset < endoff) {
10349 bl->append_zero(endoff - offset);
10350 rc = endoff - offset;
10351 } else {
10352 rc = 0;
7c673cae 10353 }
39ae355f 10354 goto success;
7c673cae
FG
10355 }
10356
10357 if (!conf->client_debug_force_sync_read &&
11fdf7f2
TL
10358 conf->client_oc &&
10359 (have & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO))) {
7c673cae
FG
10360
10361 if (f->flags & O_RSYNC) {
10362 _flush_range(in, offset, size);
10363 }
adb31ebb
TL
10364 rc = _read_async(f, offset, size, bl);
10365 if (rc < 0)
7c673cae
FG
10366 goto done;
10367 } else {
10368 if (f->flags & O_DIRECT)
10369 _flush_range(in, offset, size);
10370
10371 bool checkeof = false;
adb31ebb
TL
10372 rc = _read_sync(f, offset, size, bl, &checkeof);
10373 if (rc < 0)
7c673cae
FG
10374 goto done;
10375 if (checkeof) {
adb31ebb
TL
10376 offset += rc;
10377 size -= rc;
7c673cae
FG
10378
10379 put_cap_ref(in, CEPH_CAP_FILE_RD);
10380 have = 0;
10381 // reverify size
adb31ebb
TL
10382 {
10383 auto r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
10384 if (r < 0) {
10385 rc = r;
10386 goto done;
10387 }
10388 }
7c673cae
FG
10389
10390 // eof? short read.
10391 if ((uint64_t)offset < in->size)
10392 goto retry;
10393 }
10394 }
10395
10396success:
adb31ebb 10397 ceph_assert(rc >= 0);
a4b75251 10398 update_read_io_size(bl->length());
7c673cae
FG
10399 if (movepos) {
10400 // adjust fd pos
adb31ebb 10401 f->pos = start_pos + rc;
7c673cae 10402 }
11fdf7f2
TL
10403
10404 lat = ceph_clock_now();
10405 lat -= start;
2a845540
TL
10406
10407 ++nr_read_request;
10408 update_io_stat_read(lat);
7c673cae
FG
10409
10410done:
10411 // done!
11fdf7f2 10412 if (have) {
7c673cae 10413 put_cap_ref(in, CEPH_CAP_FILE_RD);
11fdf7f2
TL
10414 }
10415 if (movepos) {
10416 unlock_fh_pos(f);
10417 }
adb31ebb 10418 return rc;
7c673cae
FG
10419}
10420
10421Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
10422 client(c), f(f) {
10423 f->get();
10424 f->readahead.inc_pending();
10425}
10426
// Release what the constructor took: the pending-readahead count and the
// Fh reference (via _put_fh, which may free the handle).
Client::C_Readahead::~C_Readahead() {
  f->readahead.dec_pending();
  client->_put_fh(f);
}
10431
// Completion callback for a background readahead read.
// Drops the RD|CACHE cap references taken when the readahead was initiated
// (see _read_async) and, on a successful read of r bytes, feeds r into the
// read-size metrics.
void Client::C_Readahead::finish(int r) {
  lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
  client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
  if (r > 0) {
    client->update_read_io_size(r);
  }
}
10439
/**
 * Read through the object cacher, possibly blocking, and optionally kick
 * off a background readahead.
 *
 * @param f   open file handle (caller holds a FILE_RD cap reference)
 * @param off absolute file offset
 * @param len requested byte count (trimmed to EOF below)
 * @param bl  output buffer; data is appended here
 * @return bytes read (>= 0) or negative error code
 *
 * Precondition: client_lock held. The lock is dropped while waiting for the
 * cacher to fill the buffer and re-taken afterwards.
 */
int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  const auto& conf = cct->_conf;
  Inode *in = f->inode.get();

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // trim read based on file size?
  if (off >= in->size)
    return 0;
  if (len == 0)
    return 0;
  if (off + len > in->size) {
    len = in->size - off;
  }

  ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
                 << " max_bytes=" << f->readahead.get_max_readahead_size()
                 << " max_periods=" << conf->client_readahead_max_periods << dendl;

  // read (and possibly block)
  int r = 0;
  C_SaferCond onfinish("Client::_read_async flock");
  r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
                              off, len, bl, 0, &onfinish);
  if (r == 0) {
    // Cache miss: pin FILE_CACHE while we block without client_lock so the
    // cap cannot be revoked mid-read.
    get_cap_ref(in, CEPH_CAP_FILE_CACHE);
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_CACHE);
    update_read_io_size(bl->length());
  }

  if(f->readahead.get_min_readahead_size() > 0) {
    pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
    if (readahead_extent.second > 0) {
      ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
                     << " (caller wants " << off << "~" << len << ")" << dendl;
      // Fire-and-forget read into the cache (NULL destination buffer); the
      // C_Readahead context releases the cap refs taken below when it fires.
      Context *onfinish2 = new C_Readahead(this, f);
      int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
                                       readahead_extent.first, readahead_extent.second,
                                       NULL, 0, onfinish2);
      if (r2 == 0) {
        ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
        get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
      } else {
        // Data already cached: the context will never fire, delete it here.
        ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
        delete onfinish2;
      }
    }
  }

  return r;
}
10497
/**
 * Synchronous (uncached) read straight from the OSDs via the Filer.
 *
 * @param f        open file handle
 * @param off      absolute file offset
 * @param len      requested byte count
 * @param bl       output buffer; data (and EOF-zero padding) appended here
 * @param checkeof out: set to true when a short read ended before the
 *                 locally-known EOF, telling the caller (_read) to re-verify
 *                 the file size with the MDS and retry
 * @return bytes accumulated (>= 0) or negative error code
 *
 * Precondition: client_lock held; it is dropped around each OSD wait.
 */
int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
                       bool *checkeof)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  Inode *in = f->inode.get();
  uint64_t pos = off;
  int left = len;
  int read = 0;

  ldout(cct, 10) << __func__ << " " << *in << " " << off << "~" << len << dendl;

  // 0 success, 1 continue and < 0 error happen.
  // NOTE: captures pos/left/read/bl by reference and mutates them; it runs
  // while client_lock is NOT held (see the loop below).
  auto wait_and_copy = [&](C_SaferCond &onfinish, bufferlist &tbl, int wanted) {
    int r = onfinish.wait();

    // if we get ENOENT from OSD, assume 0 bytes returned
    if (r == -CEPHFS_ENOENT)
      r = 0;
    if (r < 0)
      return r;

    if (tbl.length()) {
      r = tbl.length();

      read += r;
      pos += r;
      left -= r;
      bl->claim_append(tbl);
    }
    // short read?
    if (r >= 0 && r < wanted) {
      if (pos < in->size) {
        // zero up to known EOF
        int64_t some = in->size - pos;
        if (some > left)
          some = left;
        auto z = buffer::ptr_node::create(some);
        z->zero();
        bl->push_back(std::move(z));
        read += some;
        pos += some;
        left -= some;
        if (left == 0)
          return 0;
      }

      // Ended before our cached idea of EOF was reached; caller must
      // re-check the authoritative size and possibly retry.
      *checkeof = true;
      return 0;
    }
    return 1;
  };

  while (left > 0) {
    C_SaferCond onfinish("Client::_read_sync flock");
    bufferlist tbl;

    int wanted = left;
    filer->read_trunc(in->ino, &in->layout, in->snapid,
                      pos, left, &tbl, 0,
                      in->truncate_size, in->truncate_seq,
                      &onfinish);
    // Drop the lock while blocked on the OSD reply.
    client_lock.unlock();
    int r = wait_and_copy(onfinish, tbl, wanted);
    client_lock.lock();
    if (!r)
      return read;
    if (r < 0)
      return r;
  }
  return read;
}
10570
10571int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
10572{
f67539c2
TL
10573 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10574 if (!mref_reader.is_state_satisfied())
10575 return -CEPHFS_ENOTCONN;
10576
7c673cae
FG
10577 tout(cct) << "write" << std::endl;
10578 tout(cct) << fd << std::endl;
10579 tout(cct) << size << std::endl;
10580 tout(cct) << offset << std::endl;
10581
f67539c2 10582 std::scoped_lock lock(client_lock);
7c673cae
FG
10583 Fh *fh = get_filehandle(fd);
10584 if (!fh)
f67539c2 10585 return -CEPHFS_EBADF;
7c673cae
FG
10586#if defined(__linux__) && defined(O_PATH)
10587 if (fh->flags & O_PATH)
f67539c2 10588 return -CEPHFS_EBADF;
7c673cae 10589#endif
11fdf7f2
TL
10590 /* We can't return bytes written larger than INT_MAX, clamp size to that */
10591 size = std::min(size, (loff_t)INT_MAX);
10592 int r = _write(fh, offset, size, buf, NULL, false);
7c673cae
FG
10593 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
10594 return r;
10595}
10596
10597int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
10598{
10599 if (iovcnt < 0)
f67539c2 10600 return -CEPHFS_EINVAL;
7c673cae
FG
10601 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
10602}
10603
/**
 * Core of preadv/pwritev, called with client_lock already held.
 *
 * @param fh           open file handle
 * @param iov/iovcnt   scatter/gather list
 * @param offset       absolute file offset
 * @param write        true for pwritev semantics, false for preadv
 * @param clamp_to_int clamp total length to INT_MAX for callers whose
 *                     return type is a 32-bit int
 * @return bytes transferred or negative CEPHFS error
 */
int64_t Client::_preadv_pwritev_locked(Fh *fh, const struct iovec *iov,
                                       unsigned iovcnt, int64_t offset,
                                       bool write, bool clamp_to_int)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

#if defined(__linux__) && defined(O_PATH)
  if (fh->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  loff_t totallen = 0;
  for (unsigned i = 0; i < iovcnt; i++) {
    totallen += iov[i].iov_len;
  }

  /*
   * Some of the API functions take 64-bit size values, but only return
   * 32-bit signed integers. Clamp the I/O sizes in those functions so that
   * we don't do I/Os larger than the values we can return.
   */
  if (clamp_to_int) {
    totallen = std::min(totallen, (loff_t)INT_MAX);
  }
  if (write) {
    // _write consumes the iovec directly; no intermediate copy here.
    int64_t w = _write(fh, offset, totallen, NULL, iov, iovcnt);
    ldout(cct, 3) << "pwritev(" << fh << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
    return w;
  } else {
    bufferlist bl;
    int64_t r = _read(fh, offset, totallen, &bl);
    ldout(cct, 3) << "preadv(" << fh << ", " << offset << ") = " << r << dendl;
    if (r <= 0)
      return r;

    // Scatter the read data into the caller's iovecs without holding the
    // client lock; bl is local so nothing else can touch it.
    client_lock.unlock();
    auto iter = bl.cbegin();
    for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
      /*
       * This piece of code aims to handle the case that bufferlist
       * does not have enough data to fill in the iov
       */
      const auto round_size = std::min<unsigned>(resid, iov[j].iov_len);
      iter.copy(round_size, reinterpret_cast<char*>(iov[j].iov_base));
      resid -= round_size;
      /* iter is self-updating */
    }
    client_lock.lock();
    return r;
  }
}
10654
11fdf7f2
TL
10655int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
10656{
f67539c2
TL
10657 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
10658 if (!mref_reader.is_state_satisfied())
10659 return -CEPHFS_ENOTCONN;
10660
11fdf7f2
TL
10661 tout(cct) << fd << std::endl;
10662 tout(cct) << offset << std::endl;
10663
20effc67 10664 std::scoped_lock cl(client_lock);
11fdf7f2
TL
10665 Fh *fh = get_filehandle(fd);
10666 if (!fh)
f67539c2 10667 return -CEPHFS_EBADF;
20effc67 10668 return _preadv_pwritev_locked(fh, iov, iovcnt, offset, write, true);
11fdf7f2
TL
10669}
10670
/**
 * Core write path. Exactly one of (buf) or (iov, iovcnt) supplies the data.
 *
 * @param f      open file handle (must be opened writeable)
 * @param offset absolute offset, or negative to use/advance the fd position
 *               (honouring O_APPEND)
 * @param size   byte count
 * @return bytes written (== size on success) or negative CEPHFS error
 *
 * Precondition: client_lock held. Holds a CEPH_CAP_FILE_WR reference from
 * the successful get_caps() until the final put_cap_ref() at the end; the
 * goto success/done labels keep that accounting in one place.
 */
int64_t Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
	               const struct iovec *iov, int iovcnt)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  uint64_t fpos = 0;
  Inode *in = f->inode.get();

  // Refuse writes past the configured max file size unless the file is
  // already larger than that.
  if ( (uint64_t)(offset+size) > mdsmap->get_max_filesize() && //exceeds config
       (uint64_t)(offset+size) > in->size ) { //exceeds filesize
    return -CEPHFS_EFBIG;
  }
  //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;

  if (objecter->osdmap_pool_full(in->layout.pool_id)) {
    return -CEPHFS_ENOSPC;
  }

  ceph_assert(in->snapid == CEPH_NOSNAP);

  // was Fh opened as writeable?
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -CEPHFS_EBADF;

  // use/adjust fd pos?
  if (offset < 0) {
    lock_fh_pos(f);
    /*
     * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
     * change out from under us.
     */
    if (f->flags & O_APPEND) {
      auto r = _lseek(f, 0, SEEK_END);
      if (r < 0) {
        unlock_fh_pos(f);
        return r;
      }
    }
    offset = f->pos;
    // Remember the position to install after a successful write.
    fpos = offset+size;
    unlock_fh_pos(f);
  }

  // check quota
  uint64_t endoff = offset + size;
  if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
						   f->actor_perms)) {
    return -CEPHFS_EDQUOT;
  }

  //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;

  ldout(cct, 10) << "cur file size is " << in->size << dendl;

  // time it.
  utime_t start = ceph_clock_now();

  // Make sure we know the inline-data state before deciding how to write.
  if (in->inline_version == 0) {
    int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
    if (r < 0)
      return r;
    ceph_assert(in->inline_version > 0);
  }

  // copy into fresh buffer (since our write may be resub, async)
  bufferlist bl;
  if (buf) {
    if (size > 0)
      bl.append(buf, size);
  } else if (iov){
    for (int i = 0; i < iovcnt; i++) {
      if (iov[i].iov_len > 0) {
        bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
      }
    }
  }

  utime_t lat;
  uint64_t totalwritten;
  int want, have;
  if (f->mode & CEPH_FILE_MODE_LAZY)
    want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
  else
    want = CEPH_CAP_FILE_BUFFER;
  // On success this takes FILE_WR and AUTH_SHARED references; FILE_WR is
  // released at the end of the function (or on the error paths below).
  int r = get_caps(f, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, want, &have, endoff);
  if (r < 0)
    return r;

  put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
  if (size > 0) {
    // POSIX: a successful write by a non-owner clears setuid/setgid.
    r = clear_suid_sgid(in, f->actor_perms);
    if (r < 0) {
      put_cap_ref(in, CEPH_CAP_FILE_WR);
      return r;
    }
  }

  if (f->flags & O_DIRECT)
    have &= ~(CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO);

  ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;

  std::unique_ptr<C_SaferCond> onuninline = nullptr;

  if (in->inline_version < CEPH_INLINE_NONE) {
    if (endoff > cct->_conf->client_max_inline_size ||
        endoff > CEPH_INLINE_MAX_SIZE ||
        !(have & CEPH_CAP_FILE_BUFFER)) {
      // Write no longer fits inline: kick off conversion to normal objects
      // and wait for it at "done".
      onuninline.reset(new C_SaferCond("Client::_write_uninline_data flock"));
      uninline_data(in, onuninline.get());
    } else {
      // Patch the write into the cached inline data blob.
      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      uint32_t len = in->inline_data.length();

      if (endoff < len)
        in->inline_data.begin(endoff).copy(len - endoff, bl); // XXX

      if (offset < len)
        in->inline_data.splice(offset, len - offset);
      else if (offset > len)
        in->inline_data.append_zero(offset - len);

      in->inline_data.append(bl);
      in->inline_version++;

      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      goto success;
    }
  }

  if (cct->_conf->client_oc &&
      (have & (CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO))) {
    // do buffered write
    if (!in->oset.dirty_or_tx)
      get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);

    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    // async, caching, non-blocking.
    r = objectcacher->file_write(&in->oset, &in->layout,
				 in->snaprealm->get_snap_context(),
				 offset, size, bl, ceph::real_clock::now(),
				 0);
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    if (r < 0)
      goto done;

    // flush cached write if O_SYNC is set on file fh
    // O_DSYNC == O_SYNC on linux < 2.6.33
    // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
    if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
      _flush_range(in, offset, size);
    }
  } else {
    if (f->flags & O_DIRECT)
      _flush_range(in, offset, size);

    // simple, non-atomic sync write
    C_SaferCond onfinish("Client::_write flock");
    get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

    filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
		       offset, size, bl, ceph::real_clock::now(), 0,
		       in->truncate_size, in->truncate_seq,
		       &onfinish);
    // Drop the lock while blocked on the OSD ack.
    client_lock.unlock();
    r = onfinish.wait();
    client_lock.lock();
    put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
    if (r < 0)
      goto done;
  }

  // if we get here, write was successful, update client metadata
success:
  update_write_io_size(size);
  // time
  lat = ceph_clock_now();
  lat -= start;

  ++nr_write_request;
  update_io_stat_write(lat);

  if (fpos) {
    lock_fh_pos(f);
    f->pos = fpos;
    unlock_fh_pos(f);
  }
  totalwritten = size;
  r = (int64_t)totalwritten;

  // extend file?
  if (totalwritten + offset > in->size) {
    in->size = totalwritten + offset;
    in->mark_caps_dirty(CEPH_CAP_FILE_WR);

    if (is_quota_bytes_approaching(in, f->actor_perms)) {
      check_caps(in, CHECK_CAPS_NODELAY);
    } else if (is_max_size_approaching(in)) {
      check_caps(in, 0);
    }

    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
  } else {
    ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
  }

  // mtime
  in->mtime = in->ctime = ceph_clock_now();
  in->change_attr++;
  in->mark_caps_dirty(CEPH_CAP_FILE_WR);

done:

  // If we started an inline-data conversion above, wait for it here (lock
  // dropped while blocked).
  if (nullptr != onuninline) {
    client_lock.unlock();
    int uninline_ret = onuninline->wait();
    client_lock.lock();

    if (uninline_ret >= 0 || uninline_ret == -CEPHFS_ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = uninline_ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
10905
10906int Client::_flush(Fh *f)
10907{
10908 Inode *in = f->inode.get();
10909 int err = f->take_async_err();
10910 if (err != 0) {
10911 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
10912 << cpp_strerror(err) << dendl;
10913 } else {
10914 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
10915 }
10916
10917 return err;
10918}
10919
10920int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
10921{
10922 struct ceph_statx stx;
10923 stx.stx_size = length;
10924 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
10925}
10926
/**
 * Truncate the file referred to by descriptor `fd` to `length` bytes.
 * Requires a writeable, non-O_PATH handle; implemented as a size-only
 * setattr on the handle's inode.
 *
 * @return 0 on success or negative CEPHFS error
 */
int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << length << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  // POSIX: ftruncate on a handle not opened for writing fails.
  if ((f->mode & CEPH_FILE_MODE_WR) == 0)
    return -CEPHFS_EBADF;
  // Only st_size is consumed under CEPH_SETATTR_SIZE.
  struct stat attr;
  attr.st_size = length;
  return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
}
10951
/**
 * fsync(2) entry point for descriptor `fd`.
 *
 * @param syncdataonly true for fdatasync semantics (skip metadata flush)
 * @return 0 on success, else negative error; any async write error stored
 *         on the handle is surfaced (and cleared) exactly once here.
 */
int Client::fsync(int fd, bool syncdataonly)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "fsync" << std::endl;
  tout(cct) << fd << std::endl;
  tout(cct) << syncdataonly << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
#if defined(__linux__) && defined(O_PATH)
  if (f->flags & O_PATH)
    return -CEPHFS_EBADF;
#endif
  int r = _fsync(f, syncdataonly);
  if (r == 0) {
    // The IOs in this fsync were okay, but maybe something happened
    // in the background that we should be reporting?
    r = f->take_async_err();
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly
		  << ") = 0, async_err = " << r << dendl;
  } else {
    // Assume that an error we encountered during fsync, even reported
    // synchronously, would also have applied the error to the Fh, and we
    // should clear it here to avoid returning the same error again on next
    // call.
    ldout(cct, 5) << "fsync(" << fd << ", " << syncdataonly << ") = "
		  << r << dendl;
    f->take_async_err();
  }
  return r;
}
10988
/**
 * Flush an inode's dirty data (and, unless syncdataonly, its dirty caps and
 * unsafe MDS requests) and wait for everything to be stable.
 *
 * Precondition: client_lock held; dropped while waiting on the object
 * cacher flush completion.
 *
 * @return 0 on success, else negative error from the writeback
 */
int Client::_fsync(Inode *in, bool syncdataonly)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  int r = 0;
  std::unique_ptr<C_SaferCond> object_cacher_completion = nullptr;
  ceph_tid_t flush_tid = 0;
  InodeRef tmp_ref;
  utime_t lat;
  utime_t start = ceph_clock_now();

  ldout(cct, 8) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;

  if (cct->_conf->client_oc) {
    object_cacher_completion.reset(new C_SaferCond("Client::_fsync::lock"));
    tmp_ref = in; // take a reference; C_SaferCond doesn't and _flush won't either
    _flush(in, object_cacher_completion.get());
    ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
  }

  // Metadata: push dirty caps to the MDS and remember the flush tid so we
  // can wait for its ack below.
  if (!syncdataonly && in->dirty_caps) {
    check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
    if (in->flushing_caps)
      flush_tid = last_flush_tid;
  } else ldout(cct, 10) << "no metadata needs to commit" << dendl;

  // Wait for any not-yet-safe MDS operations on this inode to be journaled.
  if (!syncdataonly && !in->unsafe_ops.empty()) {
    flush_mdlog_sync(in);

    MetaRequest *req = in->unsafe_ops.back();
    ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() <<  dendl;

    req->get();
    wait_on_list(req->waitfor_safe);
    put_request(req);
  }

  if (nullptr != object_cacher_completion) { // wait on a real reply instead of guessing
    client_lock.unlock();
    ldout(cct, 15) << "waiting on data to flush" << dendl;
    r = object_cacher_completion->wait();
    client_lock.lock();
    ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
  } else {
    // FIXME: this can starve
    while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
      ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
		     << " uncommitted, waiting" << dendl;
      wait_on_list(in->waitfor_commit);
    }
  }

  if (!r) {
    if (flush_tid > 0)
      wait_sync_caps(in, flush_tid);

    ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
  } else {
    ldout(cct, 8) << "ino " << in->ino << " failed to commit to disk! "
		  << cpp_strerror(-r) << dendl;
  }

  lat = ceph_clock_now();
  lat -= start;
  logger->tinc(l_c_fsync, lat);

  return r;
}
11057
11058int Client::_fsync(Fh *f, bool syncdataonly)
11059{
1adf2230 11060 ldout(cct, 8) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
7c673cae
FG
11061 return _fsync(f->inode.get(), syncdataonly);
11062}
11063
11064int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
11065{
f67539c2
TL
11066 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11067 if (!mref_reader.is_state_satisfied())
11068 return -CEPHFS_ENOTCONN;
11069
7c673cae
FG
11070 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
11071 tout(cct) << fd << std::endl;
11072
f67539c2 11073 std::scoped_lock lock(client_lock);
7c673cae
FG
11074 Fh *f = get_filehandle(fd);
11075 if (!f)
f67539c2 11076 return -CEPHFS_EBADF;
7c673cae
FG
11077 int r = _getattr(f->inode, mask, perms);
11078 if (r < 0)
11079 return r;
11080 fill_stat(f->inode, stbuf, NULL);
1adf2230 11081 ldout(cct, 5) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
7c673cae
FG
11082 return r;
11083}
11084
/**
 * statx-style stat on an open descriptor. `want`/`flags` are translated to
 * a cap mask; a getattr round-trip is issued only when that mask is
 * non-empty, otherwise cached attributes are used as-is.
 *
 * @return 0 on success or negative CEPHFS error
 */
int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
		   unsigned int want, unsigned int flags)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
  tout(cct) << fd << std::endl;

  std::scoped_lock lock(client_lock);
  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;

  unsigned mask = statx_to_mask(flags, want);

  int r = 0;
  if (mask) {
    r = _getattr(f->inode, mask, perms);
    if (r < 0) {
      ldout(cct, 3) << "fstatx exit on error!" << dendl;
      return r;
    }
  }

  fill_statx(f->inode, mask, stx);
  ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
  return r;
}
11115
b3b6e05e
TL
/**
 * statx relative to a directory descriptor (fstatat-style). Resolves
 * `relpath` against `dirfd` (following symlinks unless AT_SYMLINK_NOFOLLOW)
 * and fills `stx`.
 *
 * NOTE(review): unlike fstatx, _getattr is called even when mask == 0 —
 * presumably harmless, but confirm whether the `if (mask)` guard was
 * intentionally omitted here.
 *
 * @return 0 on success or negative CEPHFS error
 */
int Client::statxat(int dirfd, const char *relpath,
                    struct ceph_statx *stx, const UserPerm& perms,
                    unsigned int want, unsigned int flags) {
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied()) {
    return -CEPHFS_ENOTCONN;
  }

  tout(cct) << __func__ << " flags " << hex << flags << " want " << want << dec << std::endl;
  tout(cct) << dirfd << std::endl;
  tout(cct) << relpath << std::endl;

  unsigned mask = statx_to_mask(flags, want);

  InodeRef dirinode;
  std::scoped_lock lock(client_lock);
  int r = get_fd_inode(dirfd, &dirinode);
  if (r < 0) {
    return r;
  }

  InodeRef in;
  filepath path(relpath);
  r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask, dirinode);
  if (r < 0) {
    return r;
  }
  r = _getattr(in, mask, perms);
  if (r < 0) {
    ldout(cct, 3) << __func__ << " exit on error!" << dendl;
    return r;
  }

  fill_statx(in, mask, stx);
  ldout(cct, 3) << __func__ << " dirfd" << dirfd << ", r= " << r << dendl;
  return r;
}
11153
7c673cae
FG
11154// not written yet, but i want to link!
11155
/**
 * Change the client's working directory to `relpath` and return the new
 * absolute cwd path in `new_cwd`.
 *
 * @return 0 on success, -CEPHFS_ENOTDIR if the target is not a directory,
 *         or a path_walk error
 */
int Client::chdir(const char *relpath, std::string &new_cwd,
		  const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << "chdir" << std::endl;
  tout(cct) << relpath << std::endl;

  filepath path(relpath);
  InodeRef in;

  std::scoped_lock lock(client_lock);
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;

  if (!(in.get()->is_dir()))
    return -CEPHFS_ENOTDIR;

  // swap() avoids a ref-count churn compared to plain assignment.
  if (cwd != in)
    cwd.swap(in);
  ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;

  _getcwd(new_cwd, perms);
  return 0;
}
11184
/**
 * Reconstruct the absolute path of the current working directory by
 * walking dentries from cwd up to the mount root, asking the MDS
 * (LOOKUPNAME) for any parent link not cached locally.
 *
 * On an unlinked cwd/ancestor, `dir` is left unmodified.
 * Precondition: caller holds client_lock (see getcwd()).
 */
void Client::_getcwd(string& dir, const UserPerm& perms)
{
  filepath path;
  ldout(cct, 10) << __func__ << " " << *cwd << dendl;

  Inode *in = cwd.get();
  while (in != root.get()) {
    ceph_assert(in->dentries.size() < 2); // dirs can't be hard-linked

    // A cwd or ancestor is unlinked
    if (in->dentries.empty()) {
      return;
    }

    Dentry *dn = in->get_first_parent();


    if (!dn) {
      // look it up
      ldout(cct, 10) << __func__ << " looking up parent for " << *in << dendl;
      MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
      filepath path(in->ino);
      req->set_filepath(path);
      req->set_inode(in);
      int res = make_request(req, perms);
      if (res < 0)
	break;

      // start over
      path = filepath();
      in = cwd.get();
      continue;
    }
    path.push_front_dentry(dn->name);
    in = dn->dir->parent_inode;
  }
  dir = "/";
  dir += path.get_path();
}
11224
b5b8bbf5
FG
11225void Client::getcwd(string& dir, const UserPerm& perms)
11226{
f67539c2
TL
11227 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11228 if (!mref_reader.is_state_satisfied())
11229 return;
11230
11231 std::scoped_lock l(client_lock);
11232
11233 _getcwd(dir, perms);
b5b8bbf5
FG
11234}
11235
7c673cae
FG
/**
 * statvfs(2)-style filesystem statistics. Space figures come either from
 * the quota on the mount's quota root (when client_quota_df is set and a
 * byte quota exists) or from the RADOS cluster totals; file counts come
 * from the root inode's recursive stats.
 *
 * The `path` argument is currently unused — stats are global per mount.
 * client_lock is dropped while waiting for the OSD statfs reply.
 *
 * @return 0 on success or the negative error from the OSD statfs call
 */
int Client::statfs(const char *path, struct statvfs *stbuf,
		   const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  tout(cct) << __func__ << std::endl;
  unsigned long int total_files_on_fs;

  ceph_statfs stats;
  C_SaferCond cond;

  std::unique_lock lock(client_lock);
  // With a single data pool we can report that pool's usage precisely;
  // otherwise fall back to whole-cluster statistics.
  const vector<int64_t> &data_pools = mdsmap->get_data_pools();
  if (data_pools.size() == 1) {
    objecter->get_fs_stats(stats, data_pools[0], &cond);
  } else {
    objecter->get_fs_stats(stats, std::optional<int64_t>(), &cond);
  }

  lock.unlock();
  int rval = cond.wait();
  lock.lock();

  ceph_assert(root);
  total_files_on_fs = root->rstat.rfiles + root->rstat.rsubdirs;

  if (rval < 0) {
    ldout(cct, 1) << "underlying call to statfs returned error: "
                  << cpp_strerror(rval)
                  << dendl;
    return rval;
  }

  memset(stbuf, 0, sizeof(*stbuf));

  /*
   * we're going to set a block size of 4MB so we can represent larger
   * FSes without overflowing. Additionally convert the space
   * measurements from KB to bytes while making them in terms of
   * blocks. We use 4MB only because it is big enough, and because it
   * actually *is* the (ceph) default block size.
   */
  const int CEPH_BLOCK_SHIFT = 22;
  stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
  stbuf->f_files = total_files_on_fs;
  stbuf->f_ffree = -1;
  stbuf->f_favail = -1;
  stbuf->f_fsid = -1; // ??
  stbuf->f_flag = 0; // ??
  stbuf->f_namemax = NAME_MAX;

  // Usually quota_root will == root_ancestor, but if the mount root has no
  // quota but we can see a parent of it that does have a quota, we'll
  // respect that one instead.
  ceph_assert(root != nullptr);
  InodeRef quota_root = root->quota.is_enabled(QUOTA_MAX_BYTES) ? root : get_quota_root(root.get(), perms, QUOTA_MAX_BYTES);

  // get_quota_root should always give us something if client quotas are
  // enabled
  ceph_assert(cct->_conf.get_val<bool>("client_quota") == false || quota_root != nullptr);

  /* If bytes quota is set on a directory and conf option "client quota df"
   * is also set, available space = quota limit - used space. Else,
   * available space = total space - used space. */
  if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {

    // Skip the getattr if any sessions are stale, as we don't want to
    // block `df` if this client has e.g. been evicted, or if the MDS cluster
    // is unhealthy.
    if (!_any_stale_sessions()) {
      int r = _getattr(quota_root, 0, perms, true);
      if (r != 0) {
        // Ignore return value: error getting latest inode metadata is not a good
        // reason to break "df".
        lderr(cct) << "Error in getattr on quota root 0x"
                   << std::hex << quota_root->ino << std::dec
                   << " statfs result may be outdated" << dendl;
      }
    }

    // Special case: if there is a size quota set on the Inode acting
    // as the root for this client mount, then report the quota status
    // as the filesystem statistics.
    const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
    const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
    // It is possible for a quota to be exceeded: arithmetic here must
    // handle case where used > total.
    const fsblkcnt_t free = total > used ? total - used : 0;

    stbuf->f_blocks = total;
    stbuf->f_bfree = free;
    stbuf->f_bavail = free;
  } else {
    // General case: report the cluster statistics returned from RADOS. Because
    // multiple pools may be used without one filesystem namespace via
    // layouts, this is the most correct thing we can do.
    stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
    stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
  }

  return rval;
}
11342
/**
 * Core helper for all POSIX (fcntl) and BSD (flock) file-lock operations.
 *
 * Builds a CEPH_MDS_OP_{GET,SET}FILELOCK MetaRequest, sends it to the MDS,
 * and on success mirrors the result into the local lock-state tracking on
 * the inode (and, unless @removing, on the Fh).
 *
 * @param in        inode the lock applies to
 * @param fh        file handle issuing the request (supplies actor_perms)
 * @param lock_type CEPH_LOCK_FCNTL or CEPH_LOCK_FLOCK
 * @param op        CEPH_MDS_OP_GETFILELOCK or CEPH_MDS_OP_SETFILELOCK
 * @param sleep     nonzero to block waiting for a conflicting lock
 * @param fl        POSIX flock describing range/type; updated on GET
 * @param owner     lock-owner token (top bit is forced on, see below)
 * @param removing  true when called from _release_filelocks: skip Fh-side
 *                  bookkeeping because the Fh is being torn down
 * @return 0 on success, negative CEPHFS_* error code otherwise
 */
int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
			 struct flock *fl, uint64_t owner, bool removing)
{
  ldout(cct, 10) << __func__ << " ino " << in->ino
		 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
		 << " type " << fl->l_type << " owner " << owner
		 << " " << fl->l_start << "~" << fl->l_len << dendl;

  // Lock state was lost (e.g. after blocklisting); refuse new lock ops.
  if (in->flags & I_ERROR_FILELOCK)
    return -CEPHFS_EIO;

  int lock_cmd;
  if (F_RDLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_SHARED;
  else if (F_WRLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_EXCL;
  else if (F_UNLCK == fl->l_type)
    lock_cmd = CEPH_LOCK_UNLOCK;
  else
    return -CEPHFS_EIO;

  // Only a blocking SETFILELOCK that actually acquires a lock may wait.
  if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
    sleep = 0;

  /*
   * Set the most significant bit, so that MDS knows the 'owner'
   * is sufficient to identify the owner of lock. (old code uses
   * both 'owner' and 'pid')
   */
  owner |= (1ULL << 63);

  MetaRequest *req = new MetaRequest(op);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_inode(in);

  req->head.args.filelock_change.rule = lock_type;
  req->head.args.filelock_change.type = lock_cmd;
  req->head.args.filelock_change.owner = owner;
  req->head.args.filelock_change.pid = fl->l_pid;
  req->head.args.filelock_change.start = fl->l_start;
  req->head.args.filelock_change.length = fl->l_len;
  req->head.args.filelock_change.wait = sleep;

  int ret;
  bufferlist bl;

  if (sleep && switch_interrupt_cb) {
    // enable interrupt: take an extra req ref so the interrupt path can
    // reference the request while we block in make_request()
    switch_interrupt_cb(callback_handle, req->get());
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
    // disable interrupt
    switch_interrupt_cb(callback_handle, NULL);
    if (ret == 0 && req->aborted()) {
      // effect of this lock request has been revoked by the 'lock intr' request
      ret = req->get_abort_code();
    }
    put_request(req);  // drop the extra ref taken above
  } else {
    ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
  }

  if (ret == 0) {
    if (op == CEPH_MDS_OP_GETFILELOCK) {
      // decode the conflicting (or F_UNLCK) lock returned by the MDS
      ceph_filelock filelock;
      auto p = bl.cbegin();
      decode(filelock, p);

      if (CEPH_LOCK_SHARED == filelock.type)
	fl->l_type = F_RDLCK;
      else if (CEPH_LOCK_EXCL == filelock.type)
	fl->l_type = F_WRLCK;
      else
	fl->l_type = F_UNLCK;

      fl->l_whence = SEEK_SET;
      fl->l_start = filelock.start;
      fl->l_len = filelock.length;
      fl->l_pid = filelock.pid;
    } else if (op == CEPH_MDS_OP_SETFILELOCK) {
      // mirror the change into the inode-level lock state (lazily created)
      ceph_lock_state_t *lock_state;
      if (lock_type == CEPH_LOCK_FCNTL) {
	if (!in->fcntl_locks)
	  in->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	lock_state = in->fcntl_locks.get();
      } else if (lock_type == CEPH_LOCK_FLOCK) {
	if (!in->flock_locks)
	  in->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	lock_state = in->flock_locks.get();
      } else {
	ceph_abort();
	return -CEPHFS_EINVAL;
      }
      _update_lock_state(fl, owner, lock_state);

      // also track per-Fh, unless the Fh is being released
      if (!removing) {
	if (lock_type == CEPH_LOCK_FCNTL) {
	  if (!fh->fcntl_locks)
	    fh->fcntl_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL));
	  lock_state = fh->fcntl_locks.get();
	} else {
	  if (!fh->flock_locks)
	    fh->flock_locks.reset(new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK));
	  lock_state = fh->flock_locks.get();
	}
	_update_lock_state(fl, owner, lock_state);
      }
    } else
      ceph_abort();
  }
  return ret;
}
11456
/**
 * Interrupt a blocking file-lock request (@req).
 *
 * Marks @req aborted with CEPHFS_EINTR so it will not be re-sent; if it is
 * already in flight at an MDS, issues a companion *_INTR unlock request so
 * the MDS cancels the pending wait.
 *
 * @return 0 on success, negative CEPHFS_* error code otherwise
 */
int Client::_interrupt_filelock(MetaRequest *req)
{
  // Set abort code, but do not kick. The abort code prevents the request
  // from being re-sent.
  req->abort(-CEPHFS_EINTR);
  if (req->mds < 0)
    return 0; // haven't sent the request

  Inode *in = req->inode();

  // Translate the original lock rule into its interrupt counterpart.
  int lock_type;
  if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
    lock_type = CEPH_LOCK_FLOCK_INTR;
  else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
    lock_type = CEPH_LOCK_FCNTL_INTR;
  else {
    ceph_abort();
    return -CEPHFS_EINVAL;
  }

  // Clone the original lock arguments, but ask for an interrupting unlock.
  MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
  filepath path;
  in->make_nosnap_relative_path(path);
  intr_req->set_filepath(path);
  intr_req->set_inode(in);
  intr_req->head.args.filelock_change = req->head.args.filelock_change;
  intr_req->head.args.filelock_change.rule = lock_type;
  intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;

  // Use the credentials of the original request.
  UserPerm perms(req->get_uid(), req->get_gid());
  return make_request(intr_req, perms, NULL, NULL, -1);
}
11489
11490void Client::_encode_filelocks(Inode *in, bufferlist& bl)
11491{
11492 if (!in->fcntl_locks && !in->flock_locks)
11493 return;
11494
11495 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
11fdf7f2 11496 encode(nr_fcntl_locks, bl);
7c673cae 11497 if (nr_fcntl_locks) {
11fdf7f2 11498 auto &lock_state = in->fcntl_locks;
20effc67 11499 for(auto p = lock_state->held_locks.begin();
7c673cae
FG
11500 p != lock_state->held_locks.end();
11501 ++p)
11fdf7f2 11502 encode(p->second, bl);
7c673cae
FG
11503 }
11504
11505 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
11fdf7f2 11506 encode(nr_flock_locks, bl);
7c673cae 11507 if (nr_flock_locks) {
11fdf7f2 11508 auto &lock_state = in->flock_locks;
20effc67 11509 for(auto p = lock_state->held_locks.begin();
7c673cae
FG
11510 p != lock_state->held_locks.end();
11511 ++p)
11fdf7f2 11512 encode(p->second, bl);
7c673cae
FG
11513 }
11514
11fdf7f2 11515 ldout(cct, 10) << __func__ << " ino " << in->ino << ", " << nr_fcntl_locks
7c673cae
FG
11516 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
11517}
11518
/**
 * Drop every lock held through file handle @fh.
 *
 * Normally each lock is released via an unlock request to the MDS; when the
 * inode is flagged I_ERROR_FILELOCK (lock state lost), the locks are only
 * removed from local bookkeeping.
 */
void Client::_release_filelocks(Fh *fh)
{
  if (!fh->fcntl_locks && !fh->flock_locks)
    return;

  Inode *in = fh->inode.get();
  ldout(cct, 10) << __func__ << " " << fh << " ino " << in->ino << dendl;

  list<ceph_filelock> activated_locks;

  list<pair<int, ceph_filelock> > to_release;

  if (fh->fcntl_locks) {
    auto &lock_state = fh->fcntl_locks;
    // post-increment before possible removal: q may be erased by remove_lock
    for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
      auto q = p++;
      if (in->flags & I_ERROR_FILELOCK) {
	// state already invalid: just drop local record, no MDS round trip
	lock_state->remove_lock(q->second, activated_locks);
      } else {
	to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, q->second));
      }
    }
    lock_state.reset();
  }
  if (fh->flock_locks) {
    auto &lock_state = fh->flock_locks;
    for(auto p = lock_state->held_locks.begin(); p != lock_state->held_locks.end(); ) {
      auto q = p++;
      if (in->flags & I_ERROR_FILELOCK) {
	lock_state->remove_lock(q->second, activated_locks);
      } else {
	to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, q->second));
      }
    }
    lock_state.reset();
  }

  // Once no filelocks remain anywhere on the inode, the error flag can clear.
  if ((in->flags & I_ERROR_FILELOCK) && !in->has_any_filelocks())
    in->flags &= ~I_ERROR_FILELOCK;

  if (to_release.empty())
    return;

  // Send an F_UNLCK for each surviving lock; removing=true skips Fh-side
  // bookkeeping since this Fh is going away.
  struct flock fl;
  memset(&fl, 0, sizeof(fl));
  fl.l_whence = SEEK_SET;
  fl.l_type = F_UNLCK;

  for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
       p != to_release.end();
       ++p) {
    fl.l_start = p->second.start;
    fl.l_len = p->second.length;
    fl.l_pid = p->second.pid;
    _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
		 p->second.owner, true);
  }
}
11577
11578void Client::_update_lock_state(struct flock *fl, uint64_t owner,
11579 ceph_lock_state_t *lock_state)
11580{
11581 int lock_cmd;
11582 if (F_RDLCK == fl->l_type)
11583 lock_cmd = CEPH_LOCK_SHARED;
11584 else if (F_WRLCK == fl->l_type)
11585 lock_cmd = CEPH_LOCK_EXCL;
11586 else
11587 lock_cmd = CEPH_LOCK_UNLOCK;;
11588
11589 ceph_filelock filelock;
11590 filelock.start = fl->l_start;
11591 filelock.length = fl->l_len;
11592 filelock.client = 0;
11593 // see comment in _do_filelock()
11594 filelock.owner = owner | (1ULL << 63);
11595 filelock.pid = fl->l_pid;
11596 filelock.type = lock_cmd;
11597
11598 if (filelock.type == CEPH_LOCK_UNLOCK) {
11599 list<ceph_filelock> activated_locks;
11600 lock_state->remove_lock(filelock, activated_locks);
11601 } else {
11602 bool r = lock_state->add_lock(filelock, false, false, NULL);
11fdf7f2 11603 ceph_assert(r);
7c673cae
FG
11604 }
11605}
11606
11607int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
11608{
11609 Inode *in = fh->inode.get();
11610 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
11611 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
11612 return ret;
11613}
11614
11615int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
11616{
11617 Inode *in = fh->inode.get();
11618 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
11619 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
11620 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
11621 return ret;
11622}
11623
11624int Client::_flock(Fh *fh, int cmd, uint64_t owner)
11625{
11626 Inode *in = fh->inode.get();
11627 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
11628
11629 int sleep = !(cmd & LOCK_NB);
11630 cmd &= ~LOCK_NB;
11631
11632 int type;
11633 switch (cmd) {
11634 case LOCK_SH:
11635 type = F_RDLCK;
11636 break;
11637 case LOCK_EX:
11638 type = F_WRLCK;
11639 break;
11640 case LOCK_UN:
11641 type = F_UNLCK;
11642 break;
11643 default:
f67539c2 11644 return -CEPHFS_EINVAL;
7c673cae
FG
11645 }
11646
11647 struct flock fl;
11648 memset(&fl, 0, sizeof(fl));
11649 fl.l_type = type;
11650 fl.l_whence = SEEK_SET;
11651
11652 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
11653 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
11654 return ret;
11655}
11656
f67539c2
TL
11657int Client::get_snap_info(const char *path, const UserPerm &perms, SnapInfo *snap_info) {
11658 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11659 if (!mref_reader.is_state_satisfied()) {
11660 return -CEPHFS_ENOTCONN;
11661 }
11662
20effc67 11663 std::scoped_lock lock(client_lock);
f67539c2
TL
11664 InodeRef in;
11665 int r = Client::path_walk(path, &in, perms, true);
11666 if (r < 0) {
11667 return r;
11668 }
11669
11670 if (in->snapid == CEPH_NOSNAP) {
11671 return -CEPHFS_EINVAL;
11672 }
11673
11674 snap_info->id = in->snapid;
11675 snap_info->metadata = in->snap_metadata;
11676 return 0;
11677}
11678
11679int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
11680{
11681 /* Since the only thing this does is wrap a call to statfs, and
11682 statfs takes a lock, it doesn't seem we have a need to split it
11683 out. */
7c673cae
FG
11684 return statfs(0, stbuf, perms);
11685}
11686
20effc67 11687void Client::_ll_register_callbacks(struct ceph_client_callback_args *args)
7c673cae
FG
11688{
11689 if (!args)
11690 return;
20effc67 11691
11fdf7f2 11692 ldout(cct, 10) << __func__ << " cb " << args->handle
7c673cae
FG
11693 << " invalidate_ino_cb " << args->ino_cb
11694 << " invalidate_dentry_cb " << args->dentry_cb
7c673cae
FG
11695 << " switch_interrupt_cb " << args->switch_intr_cb
11696 << " remount_cb " << args->remount_cb
11697 << dendl;
11698 callback_handle = args->handle;
11699 if (args->ino_cb) {
11700 ino_invalidate_cb = args->ino_cb;
11701 async_ino_invalidator.start();
11702 }
11703 if (args->dentry_cb) {
11704 dentry_invalidate_cb = args->dentry_cb;
11705 async_dentry_invalidator.start();
11706 }
11707 if (args->switch_intr_cb) {
11708 switch_interrupt_cb = args->switch_intr_cb;
11709 interrupt_finisher.start();
11710 }
11711 if (args->remount_cb) {
11712 remount_cb = args->remount_cb;
11713 remount_finisher.start();
11714 }
e306af50
TL
11715 if (args->ino_release_cb) {
11716 ino_release_cb = args->ino_release_cb;
11717 async_ino_releasor.start();
11718 }
11719 if (args->umask_cb)
11720 umask_cb = args->umask_cb;
7c673cae
FG
11721}
11722
20effc67
TL
11723// This is deprecated, use ll_register_callbacks2() instead.
11724void Client::ll_register_callbacks(struct ceph_client_callback_args *args)
11725{
11726 ceph_assert(!is_mounting() && !is_mounted() && !is_unmounting());
11727
11728 _ll_register_callbacks(args);
11729}
11730
11731int Client::ll_register_callbacks2(struct ceph_client_callback_args *args)
11732{
11733 if (is_mounting() || is_mounted() || is_unmounting())
11734 return -CEPHFS_EBUSY;
11735
11736 _ll_register_callbacks(args);
11737 return 0;
11738}
11739
/**
 * Decide how dentry invalidation will be performed and sanity-check it.
 *
 * @param can_invalidate true if the upcall interface supports per-dentry
 *                       invalidation
 * @return {error code, flag} pair; the flag is produced by _do_remount()
 *         on the remount path and stays false otherwise.
 */
std::pair<int, bool> Client::test_dentry_handling(bool can_invalidate)
{
  std::pair <int, bool> r(0, false);

  RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
  if (!iref_reader.is_state_satisfied())
    return std::make_pair(-CEPHFS_ENOTCONN, false);

  can_invalidate_dentries = can_invalidate;

  /*
   * Force to use the old and slow method to invalidate the dcache
   * if the euid is non-root, or the remount may fail with return
   * code 1 or 32.
   */
  uid_t euid = geteuid();
  ldout(cct, 10) << "euid: " << euid << dendl;
  if (euid != 0) {
    can_invalidate_dentries = true;
  }

  if (can_invalidate_dentries) {
    // per-dentry invalidation path; the callback must have been registered
    ceph_assert(dentry_invalidate_cb);
    ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
  } else {
    // fall back to remounting the filesystem to flush the dcache
    ceph_assert(remount_cb);
    ldout(cct, 1) << "using remount_cb" << dendl;
    r = _do_remount(false);
  }

  return r;
}
11772
/**
 * Flush all dirty state to the cluster: buffered file data (when the object
 * cacher is enabled), dirty caps, the MDS log, and any unsafe requests.
 *
 * Caller must hold client_lock; it is dropped temporarily while waiting for
 * the object cacher flush to complete.
 *
 * @return always 0 (flush failures are not reported here)
 */
int Client::_sync_fs()
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  ldout(cct, 10) << __func__ << dendl;

  // flush file data
  std::unique_ptr<C_SaferCond> cond = nullptr;
  if (cct->_conf->client_oc) {
    cond.reset(new C_SaferCond("Client::_sync_fs:lock"));
    objectcacher->flush_all(cond.get());
  }

  // flush caps
  flush_caps_sync();
  ceph_tid_t flush_tid = last_flush_tid;

  // flush the mdlog before waiting for unsafe requests.
  flush_mdlog_sync();

  // wait for unsafe mds requests
  wait_unsafe_requests();

  wait_sync_caps(flush_tid);

  // Wait for the data flush last, with client_lock released so cap/MDS
  // traffic can continue while the objecter completes.
  if (nullptr != cond) {
    client_lock.unlock();
    ldout(cct, 15) << __func__ << " waiting on data to flush" << dendl;
    cond->wait();
    ldout(cct, 15) << __func__ << " flush finished" << dendl;
    client_lock.lock();
  }

  return 0;
}
11808
11809int Client::sync_fs()
11810{
f67539c2
TL
11811 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11812 if (!mref_reader.is_state_satisfied())
11813 return -CEPHFS_ENOTCONN;
181888fb 11814
f67539c2 11815 std::scoped_lock l(client_lock);
181888fb 11816
7c673cae
FG
11817 return _sync_fs();
11818}
11819
11820int64_t Client::drop_caches()
11821{
f67539c2 11822 std::scoped_lock l(client_lock);
7c673cae
FG
11823 return objectcacher->release_all();
11824}
11825
11fdf7f2
TL
11826int Client::_lazyio(Fh *fh, int enable)
11827{
11828 Inode *in = fh->inode.get();
11829 ldout(cct, 20) << __func__ << " " << *in << " " << !!enable << dendl;
11830
11831 if (!!(fh->mode & CEPH_FILE_MODE_LAZY) == !!enable)
11832 return 0;
11833
11834 int orig_mode = fh->mode;
11835 if (enable) {
11836 fh->mode |= CEPH_FILE_MODE_LAZY;
11837 in->get_open_ref(fh->mode);
11838 in->put_open_ref(orig_mode);
11839 check_caps(in, CHECK_CAPS_NODELAY);
11840 } else {
11841 fh->mode &= ~CEPH_FILE_MODE_LAZY;
11842 in->get_open_ref(fh->mode);
11843 in->put_open_ref(orig_mode);
11844 check_caps(in, 0);
11845 }
11846
11847 return 0;
11848}
11849
11850int Client::lazyio(int fd, int enable)
11851{
f67539c2 11852 std::scoped_lock l(client_lock);
11fdf7f2
TL
11853 Fh *f = get_filehandle(fd);
11854 if (!f)
f67539c2 11855 return -CEPHFS_EBADF;
11fdf7f2
TL
11856
11857 return _lazyio(f, enable);
11858}
11859
11860int Client::ll_lazyio(Fh *fh, int enable)
11861{
11fdf7f2
TL
11862 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << !!enable << dendl;
11863 tout(cct) << __func__ << std::endl;
11864
f67539c2 11865 std::scoped_lock lock(client_lock);
11fdf7f2
TL
11866 return _lazyio(fh, enable);
11867}
7c673cae 11868
/**
 * Propagate locally buffered lazy-IO writes for @fd to the cluster.
 *
 * @param offset/count  requested range; currently ignored — the whole file
 *                      is flushed via _fsync() (see "for now" below)
 * @return 0 on success, -CEPHFS_EBADF on an unknown descriptor
 */
int Client::lazyio_propagate(int fd, loff_t offset, size_t count)
{
  std::scoped_lock l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_propagate(" << fd
                << ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;

  // for now
  // NOTE(review): _fsync's return value is discarded, so flush errors are
  // not reported to the caller — confirm this best-effort behavior is intended.
  _fsync(f, true);

  return 0;
}
11884
/**
 * Synchronize this client's view of @fd with the cluster for lazy IO:
 * flush local dirty data, then (if cached state was released) refresh the
 * file size from the MDS.
 *
 * @param offset/count  requested range; currently ignored — the whole file
 *                      is flushed
 * @return 0 on success, -CEPHFS_EBADF on an unknown descriptor, or a
 *         _getattr() error
 */
int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
{
  std::scoped_lock l(client_lock);
  ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
                << ", " << offset << ", " << count << ")" << dendl;

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
  Inode *in = f->inode.get();

  _fsync(f, true);
  // If we dropped cached state, re-fetch the size so subsequent reads see
  // other clients' writes.
  if (_release(in)) {
    int r =_getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
    if (r < 0)
      return r;
  }
  return 0;
}
11904
11905
11906// =============================
11907// snaps
11908
f67539c2
TL
/**
 * Create snapshot @name of the directory at @relpath.
 *
 * Implemented as a mkdir of @name inside the directory's virtual ".snap"
 * dir; @metadata is attached to the snapshot.
 *
 * @return 0 on success or a negative CEPHFS_* error (ENOTCONN when not
 *         mounted, path_walk/may_create/_mkdir errors otherwise)
 */
int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm,
                   mode_t mode, const std::map<std::string, std::string> &metadata)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock l(client_lock);

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perm);
  if (r < 0)
    return r;
  // enforce permissions client-side only when configured to do so
  if (cct->_conf->client_permissions) {
    r = may_create(in.get(), perm);
    if (r < 0)
      return r;
  }
  Inode *snapdir = open_snapdir(in.get());
  return _mkdir(snapdir, name, mode, perm, nullptr, metadata);
}
181888fb 11931
/**
 * Remove snapshot @name of the directory at @relpath (an rmdir inside the
 * virtual ".snap" directory).
 *
 * @param check_perms when true, permission checking also covers the named
 *                    snapshot entry rather than the snapdir alone
 * @return 0 on success or a negative CEPHFS_* error
 */
int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms, bool check_perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock l(client_lock);

  filepath path(relpath);
  InodeRef in;
  int r = path_walk(path, &in, perms);
  if (r < 0)
    return r;
  Inode *snapdir = open_snapdir(in.get());
  if (cct->_conf->client_permissions) {
    r = may_delete(snapdir, check_perms ? name : NULL, perms);
    if (r < 0)
      return r;
  }
  return _rmdir(snapdir, name, perms);
}
11953
11954// =============================
11955// expose caps
11956
f67539c2
TL
11957int Client::get_caps_issued(int fd)
11958{
11959 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11960 if (!mref_reader.is_state_satisfied())
11961 return -CEPHFS_ENOTCONN;
7c673cae 11962
f67539c2 11963 std::scoped_lock lock(client_lock);
181888fb 11964
7c673cae
FG
11965 Fh *f = get_filehandle(fd);
11966 if (!f)
f67539c2 11967 return -CEPHFS_EBADF;
7c673cae
FG
11968
11969 return f->inode->caps_issued();
11970}
11971
11972int Client::get_caps_issued(const char *path, const UserPerm& perms)
11973{
f67539c2
TL
11974 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
11975 if (!mref_reader.is_state_satisfied())
11976 return -CEPHFS_ENOTCONN;
181888fb 11977
f67539c2 11978 std::scoped_lock lock(client_lock);
181888fb 11979
7c673cae
FG
11980 filepath p(path);
11981 InodeRef in;
11982 int r = path_walk(p, &in, perms, true);
11983 if (r < 0)
11984 return r;
11985 return in->caps_issued();
11986}
11987
11988// =========================================
11989// low level
11990
39ae355f
TL
/**
 * Populate a virtual ".snap" directory inode @in from its parent dir @diri.
 *
 * Most attributes mirror the parent; times/change_attr come from the
 * parent's snaprealm so the snapdir reflects snapshot activity.
 */
void Client::refresh_snapdir_attrs(Inode *in, Inode *diri) {
  ldout(cct, 10) << __func__ << ": snapdir inode=" << *in
                 << ", inode=" << *diri << dendl;
  in->ino = diri->ino;
  in->snapid = CEPH_SNAPDIR;
  in->mode = diri->mode;
  in->uid = diri->uid;
  in->gid = diri->gid;
  in->nlink = 1;
  // NOTE(review): dereferences diri->snaprealm unconditionally — assumes the
  // parent directory always has a snaprealm here; confirm with callers.
  in->mtime = diri->snaprealm->last_modified;
  in->ctime = in->mtime;
  in->change_attr = diri->snaprealm->change_attr;
  in->btime = diri->btime;
  in->atime = diri->atime;
  in->size = diri->size;

  in->dirfragtree.clear();
  in->snapdir_parent = diri;
  // copy posix acls to snapshotted inode
  in->xattrs.clear();
  for (auto &[xattr_key, xattr_value] : diri->xattrs) {
    // only "system." xattrs (ACLs) are propagated
    if (xattr_key.rfind("system.", 0) == 0) {
      in->xattrs[xattr_key] = xattr_value;
    }
  }
}
12017
7c673cae
FG
12018Inode *Client::open_snapdir(Inode *diri)
12019{
12020 Inode *in;
12021 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
12022 if (!inode_map.count(vino)) {
12023 in = new Inode(this, vino, &diri->layout);
39ae355f 12024 refresh_snapdir_attrs(in, diri);
7c673cae
FG
12025 diri->flags |= I_SNAPDIR_OPEN;
12026 inode_map[vino] = in;
12027 if (use_faked_inos())
12028 _assign_faked_ino(in);
12029 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
12030 } else {
12031 in = inode_map[vino];
12032 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
12033 }
12034 return in;
12035}
12036
/**
 * Low-level lookup of @name under @parent.
 *
 * On success fills *attr, takes an ll reference on the result and returns it
 * via *out. On failure attr->st_ino is zeroed and *out is NULL.
 *
 * @return 0 on success or a negative CEPHFS_* error
 */
int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
		      Inode **out, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);

  int r = 0;
  // optional client-side permission check; "." and ".." are always allowed
  if (!fuse_default_permissions) {
    if (strcmp(name, ".") && strcmp(name, "..")) {
      r = may_lookup(parent, perms);
      if (r < 0)
	return r;
    }
  }

  string dname(name);
  InodeRef in;

  r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
  if (r < 0) {
    attr->st_ino = 0;
    goto out;
  }

  ceph_assert(in);
  fill_stat(in, attr);
  _ll_get(in.get());  // caller owns an ll reference on success

 out:
  ldout(cct, 3) << __func__ << " " << vparent << " " << name
	    << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
  tout(cct) << attr->st_ino << std::endl;
  // on the error path `in` is empty, so *out becomes NULL
  *out = in.get();
  return r;
}
12080
f67539c2
TL
/**
 * Look up an inode by vinodeno (ino + snapid), taking an ll reference on
 * the result.
 *
 * CEPH_SNAPDIR is handled specially: the live directory inode is looked up
 * first, then its virtual snapdir is opened and returned.
 *
 * @return 0 on success, -CEPHFS_ENOTCONN/-CEPHFS_ESTALE or a lookup error
 */
int Client::ll_lookup_vino(
    vinodeno_t vino,
    const UserPerm& perms,
    Inode **inode)
{
  ceph_assert(inode != NULL);
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  if (is_reserved_vino(vino))
    return -CEPHFS_ESTALE;

  std::scoped_lock lock(client_lock);
  ldout(cct, 3) << __func__ << " " << vino << dendl;

  // Check the cache first
  unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
  if (p != inode_map.end()) {
    *inode = p->second;
    _ll_get(*inode);
    return 0;
  }

  uint64_t snapid = vino.snapid;

  // for snapdir, find the non-snapped dir inode
  if (snapid == CEPH_SNAPDIR)
    vino.snapid = CEPH_NOSNAP;

  int r = _lookup_vino(vino, perms, inode);
  if (r)
    return r;
  ceph_assert(*inode != NULL);

  if (snapid == CEPH_SNAPDIR) {
    Inode *tmp = *inode;

    // open the snapdir and put the inode ref (swap the reference from the
    // live dir to its snapdir)
    *inode = open_snapdir(tmp);
    _ll_forget(tmp, 1);
    _ll_get(*inode);
  }
  return 0;
}
12126
f67539c2
TL
12127int Client::ll_lookup_inode(
12128 struct inodeno_t ino,
12129 const UserPerm& perms,
12130 Inode **inode)
12131{
12132 vinodeno_t vino(ino, CEPH_NOSNAP);
12133 return ll_lookup_vino(vino, perms, inode);
12134}
12135
7c673cae
FG
/**
 * statx-flavored low-level lookup of @name under @parent.
 *
 * Like ll_lookup(), but fills a ceph_statx limited to the @want/@flags mask.
 * On success *out holds an inode with an ll reference; on failure stx is
 * zeroed (ino and mask) and *out is NULL.
 */
int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
		       struct ceph_statx *stx, unsigned want, unsigned flags,
		       const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vparent = _get_vino(parent);
  ldout(cct, 3) << __func__ << " " << vparent << " " << name << dendl;
  tout(cct) << "ll_lookupx" << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);

  int r = 0;
  // optional client-side permission check (no "."/".." exemption here)
  if (!fuse_default_permissions) {
    r = may_lookup(parent, perms);
    if (r < 0)
      return r;
  }

  string dname(name);
  InodeRef in;

  unsigned mask = statx_to_mask(flags, want);
  r = _lookup(parent, dname, mask, &in, perms);
  if (r < 0) {
    stx->stx_ino = 0;
    stx->stx_mask = 0;
  } else {
    ceph_assert(in);
    fill_statx(in, mask, stx);
    _ll_get(in.get());  // caller owns an ll reference on success
  }

  ldout(cct, 3) << __func__ << " " << vparent << " " << name
	    << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
  tout(cct) << stx->stx_ino << std::endl;
  // on the error path `in` is empty, so *out becomes NULL
  *out = in.get();
  return r;
}
12178
12179int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
12180 unsigned int want, unsigned int flags, const UserPerm& perms)
12181{
f67539c2
TL
12182 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
12183 if (!mref_reader.is_state_satisfied())
12184 return -CEPHFS_ENOTCONN;
181888fb 12185
7c673cae
FG
12186 filepath fp(name, 0);
12187 InodeRef in;
12188 int rc;
12189 unsigned mask = statx_to_mask(flags, want);
12190
11fdf7f2
TL
12191 ldout(cct, 3) << __func__ << " " << name << dendl;
12192 tout(cct) << __func__ << std::endl;
7c673cae
FG
12193 tout(cct) << name << std::endl;
12194
f67539c2 12195 std::scoped_lock lock(client_lock);
7c673cae
FG
12196 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
12197 if (rc < 0) {
12198 /* zero out mask, just in case... */
12199 stx->stx_mask = 0;
12200 stx->stx_ino = 0;
12201 *out = NULL;
12202 return rc;
12203 } else {
11fdf7f2 12204 ceph_assert(in);
7c673cae
FG
12205 fill_statx(in, mask, stx);
12206 _ll_get(in.get());
12207 *out = in.get();
12208 return 0;
12209 }
12210}
12211
/**
 * Take an ll (low-level API) reference on @in.
 *
 * The first ll reference also pins supporting state: an internal inode ref,
 * the parent dentry for directories, and the per-snapid reference count.
 */
void Client::_ll_get(Inode *in)
{
  if (in->ll_ref == 0) {
    // first ll ref: take an internal inode reference too
    in->iget();
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->get(); // pin dentry
    }
    // track how many ll users each snapshot has
    if (in->snapid != CEPH_NOSNAP)
      ll_snap_ref[in->snapid]++;
  }
  in->ll_get();
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
}
12226
/**
 * Drop @num ll references from @in; the inverse of _ll_get().
 *
 * When the last ll reference goes away, the dentry pin, snapid refcount and
 * the internal inode reference taken by _ll_get() are released.
 *
 * @return remaining ll reference count (0 when fully released)
 */
int Client::_ll_put(Inode *in, uint64_t num)
{
  in->ll_put(num);
  ldout(cct, 20) << __func__ << " " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
  if (in->ll_ref == 0) {
    if (in->is_dir() && !in->dentries.empty()) {
      ceph_assert(in->dentries.size() == 1); // dirs can't be hard-linked
      in->get_first_parent()->put(); // unpin dentry
    }
    // release this inode's share of the per-snapid refcount
    if (in->snapid != CEPH_NOSNAP) {
      auto p = ll_snap_ref.find(in->snapid);
      ceph_assert(p != ll_snap_ref.end());
      ceph_assert(p->second > 0);
      if (--p->second == 0)
	ll_snap_ref.erase(p);
    }
    put_inode(in);  // may free the inode
    return 0;
  } else {
    return in->ll_ref;
  }
}
12249
/**
 * Drop every outstanding ll reference on every inode (used at shutdown).
 *
 * Each inode is held alive in `to_be_put` until the loop finishes, because
 * _ll_put() may otherwise free it and invalidate inode_map entries mid-walk.
 */
void Client::_ll_drop_pins()
{
  ldout(cct, 10) << __func__ << dendl;
  std::set<InodeRef> to_be_put; //this set will be deconstructed item by item when exit
  // `next` is captured before _ll_put() so erasure of the current entry from
  // inode_map cannot invalidate the iteration
  ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
       it != inode_map.end();
       it = next) {
    Inode *in = it->second;
    next = it;
    ++next;
    if (in->ll_ref){
      to_be_put.insert(in);
      _ll_put(in, in->ll_ref);
    }
  }
}
12267
/**
 * Drop @count ll references from @in (FUSE "forget" semantics).
 *
 * Over-forgetting is tolerated with a warning: the count is clamped to the
 * references actually held. Forgets on the root inode, or after unmount,
 * are ignored.
 *
 * @return true if the inode's ll reference count reached zero
 */
bool Client::_ll_forget(Inode *in, uint64_t count)
{
  inodeno_t ino = in->ino;

  ldout(cct, 8) << __func__ << " " << ino << " " << count << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << ino.val << std::endl;
  tout(cct) << count << std::endl;

  // Ignore forget if we're no longer mounted
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return true;

  if (ino == 1) return true; // ignore forget on root.

  bool last = false;
  if (in->ll_ref < count) {
    // kernel asked to forget more refs than we hold; drop what we have
    ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
		  << ", which only has ll_ref=" << in->ll_ref << dendl;
    _ll_put(in, in->ll_ref);
    last = true;
  } else {
    if (_ll_put(in, count) == 0)
      last = true;
  }

  return last;
}
12297
494da23a 12298bool Client::ll_forget(Inode *in, uint64_t count)
1adf2230 12299{
f67539c2 12300 std::scoped_lock lock(client_lock);
1adf2230
AA
12301 return _ll_forget(in, count);
12302}
12303
7c673cae
FG
12304bool Client::ll_put(Inode *in)
12305{
12306 /* ll_forget already takes the lock */
12307 return ll_forget(in, 1);
12308}
12309
11fdf7f2
TL
12310int Client::ll_get_snap_ref(snapid_t snap)
12311{
f67539c2 12312 std::scoped_lock lock(client_lock);
11fdf7f2
TL
12313 auto p = ll_snap_ref.find(snap);
12314 if (p != ll_snap_ref.end())
12315 return p->second;
12316 return 0;
12317}
12318
7c673cae
FG
12319snapid_t Client::ll_get_snapid(Inode *in)
12320{
f67539c2 12321 std::scoped_lock lock(client_lock);
7c673cae
FG
12322 return in->snapid;
12323}
12324
// Look up a cached inode by its (possibly faked) ino number, taking an ll
// reference on it. Returns NULL when not mounted or the inode is not cached.
Inode *Client::ll_get_inode(ino_t ino)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return NULL;

  std::scoped_lock lock(client_lock);

  // Translate the externally-visible ino back to the real vinodeno.
  vinodeno_t vino = _map_faked_ino(ino);
  unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
  if (p == inode_map.end())
    return NULL;
  Inode *in = p->second;
  _ll_get(in);  // pin for the caller
  return in;
}

// Same lookup, but keyed by the full vinodeno (ino + snapid). Reserved
// vinos are never handed out.
Inode *Client::ll_get_inode(vinodeno_t vino)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return NULL;

  if (is_reserved_vino(vino))
    return NULL;

  std::scoped_lock lock(client_lock);

  unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
  if (p == inode_map.end())
    return NULL;
  Inode *in = p->second;
  _ll_get(in);  // pin for the caller
  return in;
}
12360
12361int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
12362{
12363 vinodeno_t vino = _get_vino(in);
12364
11fdf7f2
TL
12365 ldout(cct, 8) << __func__ << " " << vino << dendl;
12366 tout(cct) << __func__ << std::endl;
7c673cae
FG
12367 tout(cct) << vino.ino.val << std::endl;
12368
12369 if (vino.snapid < CEPH_NOSNAP)
12370 return 0;
12371 else
12372 return _getattr(in, caps, perms);
12373}
12374
// ll getattr filling a classic struct stat. Returns 0 or a negative
// CEPHFS_* error.
int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);

  if (res == 0)
    fill_stat(in, attr);
  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
  return res;
}

// statx-style getattr: only refreshes from the MDS when the wanted fields
// are not already covered by issued caps.
int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
			unsigned int flags, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  int res = 0;
  unsigned mask = statx_to_mask(flags, want);

  // Skip the round-trip entirely if our caps already cover the mask.
  if (mask && !in->caps_issued_mask(mask, true))
    res = _ll_getattr(in, mask, perms);

  if (res == 0)
    fill_statx(in, mask, stx);
  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
  return res;
}
12411
// Internal setattr helper shared by ll_setattr/ll_setattrx. Performs the
// optional client-side permission check, then delegates to __setattrx.
// On success *inp refers to the (possibly replaced) target inode.
int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			 const UserPerm& perms, InodeRef *inp)
{
  vinodeno_t vino = _get_vino(in);

  ldout(cct, 8) << __func__ << " " << vino << " mask " << hex << mask << dec
		<< dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << stx->stx_mode << std::endl;
  tout(cct) << stx->stx_uid << std::endl;
  tout(cct) << stx->stx_gid << std::endl;
  tout(cct) << stx->stx_size << std::endl;
  tout(cct) << stx->stx_mtime << std::endl;
  tout(cct) << stx->stx_atime << std::endl;
  tout(cct) << stx->stx_btime << std::endl;
  tout(cct) << mask << std::endl;

  // When the hosting filesystem doesn't do permission checks for us,
  // enforce them here.
  if (!fuse_default_permissions) {
    int res = may_setattr(in, stx, mask, perms);
    if (res < 0)
      return res;
  }

  // The *_NOW variants were already resolved by may_setattr/stat fill;
  // strip them before the actual setattr.
  mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);

  return __setattrx(in, stx, mask, perms, inp);
}
12440
// ll setattr with statx input; on success the updated attributes are
// written back into *stx.
int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
			const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  InodeRef target(in);
  int res = _ll_setattrx(in, stx, mask, perms, &target);
  if (res == 0) {
    // A setattr must not change which inode we are talking about.
    ceph_assert(in == target.get());
    fill_statx(in, in->caps_issued(), stx);
  }

  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
  return res;
}

// Classic struct-stat variant: converts to statx and shares the same path.
int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
		       const UserPerm& perms)
{
  struct ceph_statx stx;
  stat_to_statx(attr, &stx);

  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  InodeRef target(in);
  int res = _ll_setattrx(in, &stx, mask, perms, &target);
  if (res == 0) {
    ceph_assert(in == target.get());
    fill_stat(in, attr);  // reflect the new attributes back to the caller
  }

  ldout(cct, 3) << __func__ << " " << _get_vino(in) << " = " << res << dendl;
  return res;
}
12483
12484
12485// ----------
12486// xattrs
12487
// Path-based getxattr, following the final symlink (path_walk followsym=true).
int Client::getxattr(const char *path, const char *name, void *value, size_t size,
		     const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
  if (r < 0)
    return r;
  return _getxattr(in, name, value, size, perms);
}

// lstat-style variant: does NOT follow a trailing symlink.
int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
		      const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
  if (r < 0)
    return r;
  return _getxattr(in, name, value, size, perms);
}

// fd-based variant; CEPHFS_EBADF when the fd is unknown.
int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
		      const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
  return _getxattr(f->inode, name, value, size, perms);
}
12534
// Path-based listxattr, following the final symlink.
int Client::listxattr(const char *path, char *list, size_t size,
		      const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
  if (r < 0)
    return r;
  return Client::_listxattr(in.get(), list, size, perms);
}

// llistxattr: same as listxattr but does not follow a trailing symlink.
int Client::llistxattr(const char *path, char *list, size_t size,
		       const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
  if (r < 0)
    return r;
  return Client::_listxattr(in.get(), list, size, perms);
}

// fd-based variant; CEPHFS_EBADF when the fd is unknown.
int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
  return Client::_listxattr(f->inode.get(), list, size, perms);
}
12580
// Path-based removexattr, following the final symlink.
int Client::removexattr(const char *path, const char *name,
			const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  return _removexattr(in, name, perms);
}

// lremovexattr: same but does not follow a trailing symlink.
int Client::lremovexattr(const char *path, const char *name,
			 const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  return _removexattr(in, name, perms);
}

// fd-based variant; CEPHFS_EBADF when the fd is unknown.
int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  std::scoped_lock lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
  return _removexattr(f->inode, name, perms);
}
12626
// Path-based setxattr, following the final symlink. Note: the osdmap wait
// must happen BEFORE taking client_lock, since it may block on the objecter.
int Client::setxattr(const char *path, const char *name, const void *value,
		     size_t size, int flags, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  _setxattr_maybe_wait_for_osdmap(name, value, size);

  std::scoped_lock lock(client_lock);

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, true);
  if (r < 0)
    return r;
  return _setxattr(in, name, value, size, flags, perms);
}

// lsetxattr: same but does not follow a trailing symlink.
int Client::lsetxattr(const char *path, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  _setxattr_maybe_wait_for_osdmap(name, value, size);

  std::scoped_lock lock(client_lock);

  InodeRef in;
  int r = Client::path_walk(path, &in, perms, false);
  if (r < 0)
    return r;
  return _setxattr(in, name, value, size, flags, perms);
}

// fd-based variant; CEPHFS_EBADF when the fd is unknown.
int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
		      int flags, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  _setxattr_maybe_wait_for_osdmap(name, value, size);

  std::scoped_lock lock(client_lock);

  Fh *f = get_filehandle(fd);
  if (!f)
    return -CEPHFS_EBADF;
  return _setxattr(f->inode, name, value, size, flags, perms);
}
12679
12680int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
12681 const UserPerm& perms)
12682{
12683 int r;
1d09f67e 12684 const VXattr *vxattr = nullptr;
7c673cae 12685
1d09f67e 12686 vxattr = _match_vxattr(in, name);
7c673cae 12687 if (vxattr) {
f67539c2 12688 r = -CEPHFS_ENODATA;
7c673cae
FG
12689
12690 // Do a force getattr to get the latest quota before returning
12691 // a value to userspace.
28e407b8
AA
12692 int flags = 0;
12693 if (vxattr->flags & VXATTR_RSTAT) {
12694 flags |= CEPH_STAT_RSTAT;
12695 }
adb31ebb
TL
12696 if (vxattr->flags & VXATTR_DIRSTAT) {
12697 flags |= CEPH_CAP_FILE_SHARED;
12698 }
f67539c2 12699 r = _getattr(in, flags | CEPH_STAT_CAP_XATTR, perms, true);
7c673cae
FG
12700 if (r != 0) {
12701 // Error from getattr!
12702 return r;
12703 }
12704
12705 // call pointer-to-member function
12706 char buf[256];
12707 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
12708 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
12709 } else {
f67539c2 12710 r = -CEPHFS_ENODATA;
7c673cae
FG
12711 }
12712
12713 if (size != 0) {
12714 if (r > (int)size) {
f67539c2 12715 r = -CEPHFS_ERANGE;
7c673cae
FG
12716 } else if (r > 0) {
12717 memcpy(value, buf, r);
12718 }
12719 }
12720 goto out;
12721 }
12722
1d09f67e
TL
12723 if (!strncmp(name, "ceph.", 5)) {
12724 r = _getvxattr(in, perms, name, size, value, MDS_RANK_NONE);
12725 goto out;
12726 }
12727
7c673cae 12728 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
f67539c2 12729 r = -CEPHFS_EOPNOTSUPP;
7c673cae
FG
12730 goto out;
12731 }
12732
12733 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
12734 if (r == 0) {
12735 string n(name);
f67539c2 12736 r = -CEPHFS_ENODATA;
1d09f67e 12737 if (in->xattrs.count(n)) {
7c673cae
FG
12738 r = in->xattrs[n].length();
12739 if (r > 0 && size != 0) {
12740 if (size >= (unsigned)r)
12741 memcpy(value, in->xattrs[n].c_str(), r);
12742 else
f67539c2 12743 r = -CEPHFS_ERANGE;
7c673cae
FG
12744 }
12745 }
12746 }
12747 out:
1adf2230 12748 ldout(cct, 8) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
7c673cae
FG
12749 return r;
12750}
12751
12752int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
12753 const UserPerm& perms)
12754{
12755 if (cct->_conf->client_permissions) {
12756 int r = xattr_permission(in.get(), name, MAY_READ, perms);
12757 if (r < 0)
12758 return r;
12759 }
12760 return _getxattr(in.get(), name, value, size, perms);
12761}
12762
// ll getxattr entry point: state check, tracing, optional permission check,
// then the shared _getxattr path.
int Client::ll_getxattr(Inode *in, const char *name, void *value,
			size_t size, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);
  // Only check permissions ourselves when the FUSE layer isn't doing it.
  if (!fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_READ, perms);
    if (r < 0)
      return r;
  }

  return _getxattr(in, name, value, size, perms);
}
12786
// Fill 'name' with the NUL-separated list of xattr names (excluding the
// "ceph.*" virtual namespace). With size == 0, only the total length is
// computed. Returns the byte count, or -CEPHFS_ERANGE if it doesn't fit.
int Client::_listxattr(Inode *in, char *name, size_t size,
		       const UserPerm& perms)
{
  bool len_only = (size == 0);
  // Refresh the xattr map if we never fetched it for this inode.
  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
  if (r != 0) {
    goto out;
  }

  r = 0;
  for ([[maybe_unused]] const auto &[xattr_name, xattr_value_bl] : in->xattrs) {
    // "ceph.*" names are virtual and are not listed.
    if (xattr_name.rfind("ceph.", 0) == 0) {
      continue;
    }

    size_t this_len = xattr_name.length() + 1;  // include the NUL terminator
    r += this_len;
    if (len_only)
      continue;

    if (this_len > size) {
      r = -CEPHFS_ERANGE;
      goto out;
    }

    memcpy(name, xattr_name.c_str(), this_len);
    name += this_len;
    size -= this_len;
  }
out:
  ldout(cct, 8) << __func__ << "(" << in->ino << ", " << size << ") = " << r << dendl;
  return r;
}
12820
// ll listxattr entry point: state check, tracing, then _listxattr under
// client_lock.
int Client::ll_listxattr(Inode *in, char *names, size_t size,
			 const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << __func__ << " " << vino << " size " << size << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << size << std::endl;

  std::scoped_lock lock(client_lock);
  return _listxattr(in, names, size, perms);
}
12838
// Send the actual SETXATTR request to the MDS. A NULL value means "remove";
// XATTR_CREATE/XATTR_REPLACE are translated to their CEPH_XATTR_* flags.
int Client::_do_setxattr(Inode *in, const char *name, const void *value,
			 size_t size, int flags, const UserPerm& perms)
{

  int xattr_flags = 0;
  if (!value)
    xattr_flags |= CEPH_XATTR_REMOVE;
  if (flags & XATTR_CREATE)
    xattr_flags |= CEPH_XATTR_CREATE;
  if (flags & XATTR_REPLACE)
    xattr_flags |= CEPH_XATTR_REPLACE;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_string2(name);
  req->set_inode(in);
  req->head.args.setxattr.flags = xattr_flags;

  // Ship the value as the request payload. Callers guarantee value != NULL
  // whenever size > 0 (see _setxattr).
  bufferlist bl;
  ceph_assert(value || size == 0);
  bl.append((const char*)value, size);
  req->set_data(bl);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 3) << __func__ << "(" << in->ino << ", \"" << name << "\") = " <<
    res << dendl;
  return res;
}
12871
// Core setxattr logic: rejects snapshots, validates the namespace, handles
// POSIX ACL xattrs and writable virtual xattrs specially, and otherwise
// forwards to _do_setxattr. Returns 0 or a negative CEPHFS_* error.
int Client::_setxattr(Inode *in, const char *name, const void *value,
		      size_t size, int flags, const UserPerm& perms)
{
  // Snapshots are read-only.
  if (in->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }

  // Normalize: empty value -> non-NULL empty string; NULL with a nonzero
  // size is a caller bug.
  if (size == 0) {
    value = "";
  } else if (value == NULL) {
      return -CEPHFS_EINVAL;
  }

  // "system.*" names are only meaningful when POSIX ACLs are enabled.
  bool posix_acl_xattr = false;
  if (acl_type == POSIX_ACL)
    posix_acl_xattr = !strncmp(name, "system.", 7);

  // Only the namespaces the kernel client also supports are accepted.
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5) &&
      !posix_acl_xattr)
    return -CEPHFS_EOPNOTSUPP;

  bool check_realm = false;

  if (posix_acl_xattr) {
    if (!strcmp(name, ACL_EA_ACCESS)) {
      // An access ACL that is equivalent to a plain mode is applied as a
      // chmod and the xattr itself is dropped (value = NULL).
      mode_t new_mode = in->mode;
      if (value) {
	int ret = posix_acl_equiv_mode(value, size, &new_mode);
	if (ret < 0)
	  return ret;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
	if (new_mode != in->mode) {
	  struct ceph_statx stx;
	  stx.stx_mode = new_mode;
	  ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, nullptr);
	  if (ret < 0)
	    return ret;
	}
      }
    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
      // Default ACLs only make sense on directories.
      if (value) {
	if (!S_ISDIR(in->mode))
	  return -CEPHFS_EACCES;
	int ret = posix_acl_check(value, size);
	if (ret < 0)
	  return -CEPHFS_EINVAL;
	if (ret == 0) {
	  value = NULL;
	  size = 0;
	}
      }
    } else {
      return -CEPHFS_EOPNOTSUPP;
    }
  } else {
    const VXattr *vxattr = _match_vxattr(in, name);
    if (vxattr) {
      if (vxattr->readonly)
	return -CEPHFS_EOPNOTSUPP;
      // Some virtual xattrs (e.g. fscrypt) have a dedicated setter.
      if (vxattr->setxattr_cb)
	return (this->*(vxattr->setxattr_cb))(in, value, size, perms);
      // Setting a quota should create a snaprealm on the inode; verify below.
      if (vxattr->name.compare(0, 10, "ceph.quota") == 0 && value)
	check_realm = true;
    }
  }

  int ret = _do_setxattr(in, name, value, size, flags, perms);
  if (ret >= 0 && check_realm) {
    // check if snaprealm was created for quota inode
    if (in->quota.is_enabled() &&
	!(in->snaprealm && in->snaprealm->ino == in->ino))
      ret = -CEPHFS_EOPNOTSUPP;
  }

  return ret;
}
12954
12955int Client::_setxattr(InodeRef &in, const char *name, const void *value,
12956 size_t size, int flags, const UserPerm& perms)
12957{
12958 if (cct->_conf->client_permissions) {
12959 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
12960 if (r < 0)
12961 return r;
12962 }
12963 return _setxattr(in.get(), name, value, size, flags, perms);
12964}
12965
// Validate the pool referenced by a "layout"/"layout.pool" xattr value
// against the given osdmap. Returns 0 if no pool is referenced or it
// exists, -CEPHFS_EINVAL on parse failure, -CEPHFS_ENOENT if unknown.
int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
{
  string tmp;
  if (name == "layout") {
    // Full layout string: parse "key=value ..." pairs and pull out "pool".
    string::iterator begin = value.begin();
    string::iterator end = value.end();
    keys_and_values<string::iterator> p;    // create instance of parser
    std::map<string, string> m;             // map to receive results
    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
      return -CEPHFS_EINVAL;
    }
    if (begin != end)
      return -CEPHFS_EINVAL;
    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
      if (q->first == "pool") {
	tmp = q->second;
	break;
      }
    }
  } else if (name == "layout.pool") {
    tmp = value;
  }

  if (tmp.length()) {
    // The pool may be given numerically or by name.
    // NOTE(review): lexical_cast<unsigned> narrows the numeric form to
    // 32 bits before widening to int64_t — presumably fine for real pool
    // ids; confirm against OSDMap pool-id width.
    int64_t pool;
    try {
      pool = boost::lexical_cast<unsigned>(tmp);
      if (!osdmap->have_pg_pool(pool))
	return -CEPHFS_ENOENT;
    } catch (boost::bad_lexical_cast const&) {
      pool = osdmap->lookup_pg_pool_name(tmp);
      if (pool < 0) {
	return -CEPHFS_ENOENT;
      }
    }
  }

  return 0;
}
13005
// If a layout xattr references a data pool we don't know about, fetch the
// latest osdmap before issuing the MDS request. Must be called WITHOUT
// client_lock held, since it can block on the objecter.
void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
{
  // For setting pool of layout, MetaRequest need osdmap epoch.
  // There is a race which create a new data pool but client and mds both don't have.
  // Make client got the latest osdmap which make mds quickly judge whether get newer osdmap.
  ldout(cct, 15) << __func__ << ": name = " << name << dendl;
  if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
      strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
    string rest(strstr(name, "layout"));
    string v((const char*)value, size);
    int r = objecter->with_osdmap([&](const OSDMap& o) {
      return _setxattr_check_data_pool(rest, v, &o);
    });

    // Unknown pool: it may exist in a newer osdmap epoch, so block until
    // we have the latest one.
    if (r == -CEPHFS_ENOENT) {
      bs::error_code ec;
      ldout(cct, 20) << __func__ << ": waiting for latest osdmap" << dendl;
      objecter->wait_for_latest_osdmap(ca::use_blocked[ec]);
      ldout(cct, 20) << __func__ << ": got latest osdmap: " << ec << dendl;
    }
  }
}
13028
// ll setxattr entry point: state check, osdmap wait (before locking),
// tracing, optional permission check, then the shared _setxattr path.
int Client::ll_setxattr(Inode *in, const char *name, const void *value,
			size_t size, int flags, const UserPerm& perms)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  _setxattr_maybe_wait_for_osdmap(name, value, size);

  vinodeno_t vino = _get_vino(in);

  ldout(cct, 3) << __func__ << " " << vino << " " << name << " size " << size << dendl;
  tout(cct) << __func__ << std::endl;
  tout(cct) << vino.ino.val << std::endl;
  tout(cct) << name << std::endl;

  std::scoped_lock lock(client_lock);
  // Only check permissions ourselves when the FUSE layer isn't doing it.
  if (!fuse_default_permissions) {
    int r = xattr_permission(in, name, MAY_WRITE, perms);
    if (r < 0)
      return r;
  }
  return _setxattr(in, name, value, size, flags, perms);
}
13053
// Core removexattr: rejects snapshots, unsupported namespaces and read-only
// virtual xattrs, then issues an RMXATTR request to the MDS.
int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
{
  // Snapshots are read-only.
  if (in->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }

  // same xattrs supported by kernel client
  if (strncmp(name, "user.", 5) &&
      strncmp(name, "system.", 7) &&
      strncmp(name, "security.", 9) &&
      strncmp(name, "trusted.", 8) &&
      strncmp(name, "ceph.", 5))
    return -CEPHFS_EOPNOTSUPP;

  // Read-only virtual xattrs cannot be removed.
  const VXattr *vxattr = _match_vxattr(in, name);
  if (vxattr && vxattr->readonly)
    return -CEPHFS_EOPNOTSUPP;

  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
  filepath path;
  in->make_nosnap_relative_path(path);
  req->set_filepath(path);
  req->set_filepath2(name);
  req->set_inode(in);

  int res = make_request(req, perms);

  trim_cache();
  ldout(cct, 8) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
  return res;
}
13085
13086int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
13087{
13088 if (cct->_conf->client_permissions) {
13089 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
13090 if (r < 0)
13091 return r;
13092 }
13093 return _removexattr(in.get(), name, perms);
13094}
13095
13096int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
13097{
f67539c2
TL
13098 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13099 if (!mref_reader.is_state_satisfied())
13100 return -CEPHFS_ENOTCONN;
181888fb 13101
7c673cae
FG
13102 vinodeno_t vino = _get_vino(in);
13103
13104 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
13105 tout(cct) << "ll_removexattr" << std::endl;
13106 tout(cct) << vino.ino.val << std::endl;
13107 tout(cct) << name << std::endl;
13108
f67539c2 13109 std::scoped_lock lock(client_lock);
11fdf7f2 13110 if (!fuse_default_permissions) {
7c673cae
FG
13111 int r = xattr_permission(in, name, MAY_WRITE, perms);
13112 if (r < 0)
13113 return r;
13114 }
13115
13116 return _removexattr(in, name, perms);
13117}
13118
// Virtual-xattr callbacks for the fscrypt auth blob.
bool Client::_vxattrcb_fscrypt_auth_exists(Inode *in)
{
  bool exists = !in->fscrypt_auth.empty();

  ldout(cct, 10) << "fscrypt_auth exists " << exists << dendl;
  return exists;
}

// Getter: copies the blob out when it fits; always returns the blob size
// (snprintf-style length probing when size is too small).
size_t Client::_vxattrcb_fscrypt_auth(Inode *in, char *val, size_t size)
{
  size_t count = in->fscrypt_auth.size();

  if (count <= size)
    memcpy(val, in->fscrypt_auth.data(), count);
  return count;
}

// Setter: forwards the raw bytes as the fscrypt_auth aux payload of a
// setattr request.
int Client::_vxattrcb_fscrypt_auth_set(Inode *in, const void *val, size_t size,
				       const UserPerm& perms)
{
  struct ceph_statx stx = { 0 };
  std::vector<uint8_t> aux;

  aux.resize(size);
  memcpy(aux.data(), val, size);

  return _do_setattr(in, &stx, CEPH_SETATTR_FSCRYPT_AUTH, perms, nullptr, &aux);
}

// Virtual-xattr callbacks for the fscrypt file blob (same pattern as above).
bool Client::_vxattrcb_fscrypt_file_exists(Inode *in)
{
  return !in->fscrypt_file.empty();
}

size_t Client::_vxattrcb_fscrypt_file(Inode *in, char *val, size_t size)
{
  size_t count = in->fscrypt_file.size();

  if (count <= size)
    memcpy(val, in->fscrypt_file.data(), count);
  return count;
}

int Client::_vxattrcb_fscrypt_file_set(Inode *in, const void *val, size_t size,
				       const UserPerm& perms)
{
  struct ceph_statx stx = { 0 };
  std::vector<uint8_t> aux;

  aux.resize(size);
  memcpy(aux.data(), val, size);

  return _do_setattr(in, &stx, CEPH_SETATTR_FSCRYPT_FILE, perms, nullptr, &aux);
}
13173
// Quota virtual xattrs: only exposed when a quota is set AND the inode is a
// snapshot or carries its own snaprealm.
bool Client::_vxattrcb_quota_exists(Inode *in)
{
  return in->quota.is_enabled() &&
	 (in->snapid != CEPH_NOSNAP ||
	  (in->snaprealm && in->snaprealm->ino == in->ino));
}
// Combined "max_bytes=... max_files=..." rendering (snprintf length semantics).
size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
{
  return snprintf(val, size,
                  "max_bytes=%lld max_files=%lld",
                  (long long int)in->quota.max_bytes,
                  (long long int)in->quota.max_files);
}
size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
}
size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
}
13195
// The layout vxattr exists whenever the inode's layout differs from a
// default-constructed file_layout_t.
bool Client::_vxattrcb_layout_exists(Inode *in)
{
  return in->layout != file_layout_t();
}
13200size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
13201{
13202 int r = snprintf(val, size,
11fdf7f2 13203 "stripe_unit=%llu stripe_count=%llu object_size=%llu pool=",
7c673cae
FG
13204 (unsigned long long)in->layout.stripe_unit,
13205 (unsigned long long)in->layout.stripe_count,
13206 (unsigned long long)in->layout.object_size);
13207 objecter->with_osdmap([&](const OSDMap& o) {
13208 if (o.have_pg_pool(in->layout.pool_id))
13209 r += snprintf(val + r, size - r, "%s",
13210 o.get_pool_name(in->layout.pool_id).c_str());
13211 else
13212 r += snprintf(val + r, size - r, "%" PRIu64,
13213 (uint64_t)in->layout.pool_id);
13214 });
13215 if (in->layout.pool_ns.length())
13216 r += snprintf(val + r, size - r, " pool_namespace=%s",
13217 in->layout.pool_ns.c_str());
13218 return r;
13219}
// Individual layout-field getters; all use snprintf length semantics.
size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_unit);
}
size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.stripe_count);
}
size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->layout.object_size);
}
// Pool name if present in the current osdmap, otherwise the numeric id.
size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
{
  size_t r;
  objecter->with_osdmap([&](const OSDMap& o) {
    if (o.have_pg_pool(in->layout.pool_id))
      r = snprintf(val, size, "%s", o.get_pool_name(
		   in->layout.pool_id).c_str());
    else
      r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
  });
  return r;
}
size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
}
// Directory statistics virtual xattrs. dirstat covers the direct children;
// rstat is the recursive subtree accounting. All use snprintf semantics.
size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
}
size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nfiles);
}
size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->dirstat.nsubdirs);
}
size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
}
size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rfiles);
}
size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsubdirs);
}
size_t Client::_vxattrcb_dir_rsnaps(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rsnaps);
}
size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%llu", (unsigned long long)in->rstat.rbytes);
}
// Recursive ctime rendered as "<sec>.<nsec padded to 9 digits>".
size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
{
  return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
		  (long)in->rstat.rctime.nsec());
}
11fdf7f2
TL
13285bool Client::_vxattrcb_dir_pin_exists(Inode *in)
13286{
f67539c2 13287 return in->dir_pin != -CEPHFS_ENODATA;
11fdf7f2
TL
13288}
13289size_t Client::_vxattrcb_dir_pin(Inode *in, char *val, size_t size)
13290{
13291 return snprintf(val, size, "%ld", (long)in->dir_pin);
13292}
7c673cae 13293
81eedcae
TL
13294bool Client::_vxattrcb_snap_btime_exists(Inode *in)
13295{
13296 return !in->snap_btime.is_zero();
13297}
13298
13299size_t Client::_vxattrcb_snap_btime(Inode *in, char *val, size_t size)
13300{
13301 return snprintf(val, size, "%llu.%09lu",
13302 (long long unsigned)in->snap_btime.sec(),
13303 (long unsigned)in->snap_btime.nsec());
13304}
13305
20effc67
TL
13306size_t Client::_vxattrcb_caps(Inode *in, char *val, size_t size)
13307{
13308 int issued;
13309
13310 in->caps_issued(&issued);
13311 return snprintf(val, size, "%s/0x%x", ccap_string(issued).c_str(), issued);
13312}
13313
f67539c2
TL
13314bool Client::_vxattrcb_mirror_info_exists(Inode *in)
13315{
13316 // checking one of the xattrs would suffice
13317 return in->xattrs.count("ceph.mirror.info.cluster_id") != 0;
13318}
13319
13320size_t Client::_vxattrcb_mirror_info(Inode *in, char *val, size_t size)
13321{
13322 return snprintf(val, size, "cluster_id=%.*s fs_id=%.*s",
13323 in->xattrs["ceph.mirror.info.cluster_id"].length(),
13324 in->xattrs["ceph.mirror.info.cluster_id"].c_str(),
13325 in->xattrs["ceph.mirror.info.fs_id"].length(),
13326 in->xattrs["ceph.mirror.info.fs_id"].c_str());
13327}
13328
adb31ebb
TL
13329size_t Client::_vxattrcb_cluster_fsid(Inode *in, char *val, size_t size)
13330{
13331 return snprintf(val, size, "%s", monclient->get_fsid().to_string().c_str());
13332}
13333
13334size_t Client::_vxattrcb_client_id(Inode *in, char *val, size_t size)
13335{
13336 auto name = messenger->get_myname();
20effc67 13337 return snprintf(val, size, "%s%" PRId64, name.type_str(), name.num());
adb31ebb
TL
13338}
13339
7c673cae
FG
13340#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
13341#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
13342
adb31ebb 13343#define XATTR_NAME_CEPH(_type, _name, _flags) \
28e407b8
AA
13344{ \
13345 name: CEPH_XATTR_NAME(_type, _name), \
13346 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
13347 readonly: true, \
28e407b8
AA
13348 exists_cb: NULL, \
13349 flags: _flags, \
7c673cae
FG
13350}
13351#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
13352{ \
13353 name: CEPH_XATTR_NAME2(_type, _name, _field), \
13354 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
13355 readonly: false, \
7c673cae 13356 exists_cb: &Client::_vxattrcb_layout_exists, \
28e407b8 13357 flags: 0, \
7c673cae
FG
13358}
13359#define XATTR_QUOTA_FIELD(_type, _name) \
13360{ \
13361 name: CEPH_XATTR_NAME(_type, _name), \
13362 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
13363 readonly: false, \
7c673cae 13364 exists_cb: &Client::_vxattrcb_quota_exists, \
28e407b8 13365 flags: 0, \
7c673cae
FG
13366}
13367
13368const Client::VXattr Client::_dir_vxattrs[] = {
13369 {
13370 name: "ceph.dir.layout",
13371 getxattr_cb: &Client::_vxattrcb_layout,
13372 readonly: false,
7c673cae 13373 exists_cb: &Client::_vxattrcb_layout_exists,
28e407b8 13374 flags: 0,
7c673cae 13375 },
1d09f67e
TL
13376 // FIXME
13377 // Delete the following dir layout field definitions for release "S"
7c673cae
FG
13378 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
13379 XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
13380 XATTR_LAYOUT_FIELD(dir, layout, object_size),
13381 XATTR_LAYOUT_FIELD(dir, layout, pool),
13382 XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
adb31ebb
TL
13383 XATTR_NAME_CEPH(dir, entries, VXATTR_DIRSTAT),
13384 XATTR_NAME_CEPH(dir, files, VXATTR_DIRSTAT),
13385 XATTR_NAME_CEPH(dir, subdirs, VXATTR_DIRSTAT),
13386 XATTR_NAME_CEPH(dir, rentries, VXATTR_RSTAT),
13387 XATTR_NAME_CEPH(dir, rfiles, VXATTR_RSTAT),
13388 XATTR_NAME_CEPH(dir, rsubdirs, VXATTR_RSTAT),
f67539c2 13389 XATTR_NAME_CEPH(dir, rsnaps, VXATTR_RSTAT),
adb31ebb
TL
13390 XATTR_NAME_CEPH(dir, rbytes, VXATTR_RSTAT),
13391 XATTR_NAME_CEPH(dir, rctime, VXATTR_RSTAT),
7c673cae
FG
13392 {
13393 name: "ceph.quota",
13394 getxattr_cb: &Client::_vxattrcb_quota,
13395 readonly: false,
7c673cae 13396 exists_cb: &Client::_vxattrcb_quota_exists,
28e407b8 13397 flags: 0,
7c673cae
FG
13398 },
13399 XATTR_QUOTA_FIELD(quota, max_bytes),
13400 XATTR_QUOTA_FIELD(quota, max_files),
1d09f67e
TL
13401 // FIXME
13402 // Delete the following dir pin field definitions for release "S"
11fdf7f2
TL
13403 {
13404 name: "ceph.dir.pin",
13405 getxattr_cb: &Client::_vxattrcb_dir_pin,
13406 readonly: false,
11fdf7f2
TL
13407 exists_cb: &Client::_vxattrcb_dir_pin_exists,
13408 flags: 0,
13409 },
81eedcae
TL
13410 {
13411 name: "ceph.snap.btime",
13412 getxattr_cb: &Client::_vxattrcb_snap_btime,
13413 readonly: true,
81eedcae
TL
13414 exists_cb: &Client::_vxattrcb_snap_btime_exists,
13415 flags: 0,
13416 },
f67539c2
TL
13417 {
13418 name: "ceph.mirror.info",
13419 getxattr_cb: &Client::_vxattrcb_mirror_info,
13420 readonly: false,
13421 exists_cb: &Client::_vxattrcb_mirror_info_exists,
13422 flags: 0,
13423 },
20effc67
TL
13424 {
13425 name: "ceph.caps",
13426 getxattr_cb: &Client::_vxattrcb_caps,
13427 readonly: true,
13428 exists_cb: NULL,
13429 flags: 0,
13430 },
7c673cae
FG
13431 { name: "" } /* Required table terminator */
13432};
13433
13434const Client::VXattr Client::_file_vxattrs[] = {
13435 {
13436 name: "ceph.file.layout",
13437 getxattr_cb: &Client::_vxattrcb_layout,
13438 readonly: false,
7c673cae 13439 exists_cb: &Client::_vxattrcb_layout_exists,
28e407b8 13440 flags: 0,
7c673cae
FG
13441 },
13442 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
13443 XATTR_LAYOUT_FIELD(file, layout, stripe_count),
13444 XATTR_LAYOUT_FIELD(file, layout, object_size),
13445 XATTR_LAYOUT_FIELD(file, layout, pool),
13446 XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
81eedcae
TL
13447 {
13448 name: "ceph.snap.btime",
13449 getxattr_cb: &Client::_vxattrcb_snap_btime,
13450 readonly: true,
81eedcae
TL
13451 exists_cb: &Client::_vxattrcb_snap_btime_exists,
13452 flags: 0,
13453 },
20effc67
TL
13454 {
13455 name: "ceph.caps",
13456 getxattr_cb: &Client::_vxattrcb_caps,
13457 readonly: true,
13458 exists_cb: NULL,
13459 flags: 0,
13460 },
7c673cae
FG
13461 { name: "" } /* Required table terminator */
13462};
13463
adb31ebb
TL
13464const Client::VXattr Client::_common_vxattrs[] = {
13465 {
13466 name: "ceph.cluster_fsid",
13467 getxattr_cb: &Client::_vxattrcb_cluster_fsid,
13468 readonly: true,
13469 exists_cb: nullptr,
13470 flags: 0,
13471 },
13472 {
13473 name: "ceph.client_id",
13474 getxattr_cb: &Client::_vxattrcb_client_id,
13475 readonly: true,
13476 exists_cb: nullptr,
13477 flags: 0,
13478 },
1e59de90
TL
13479 {
13480 name: "ceph.fscrypt.auth",
13481 getxattr_cb: &Client::_vxattrcb_fscrypt_auth,
13482 setxattr_cb: &Client::_vxattrcb_fscrypt_auth_set,
13483 readonly: false,
13484 exists_cb: &Client::_vxattrcb_fscrypt_auth_exists,
13485 flags: 0,
13486 },
13487 {
13488 name: "ceph.fscrypt.file",
13489 getxattr_cb: &Client::_vxattrcb_fscrypt_file,
13490 setxattr_cb: &Client::_vxattrcb_fscrypt_file_set,
13491 readonly: false,
13492 exists_cb: &Client::_vxattrcb_fscrypt_file_exists,
13493 flags: 0,
13494 },
adb31ebb
TL
13495 { name: "" } /* Required table terminator */
13496};
13497
7c673cae
FG
13498const Client::VXattr *Client::_get_vxattrs(Inode *in)
13499{
13500 if (in->is_dir())
13501 return _dir_vxattrs;
13502 else if (in->is_file())
13503 return _file_vxattrs;
13504 return NULL;
13505}
13506
13507const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
13508{
13509 if (strncmp(name, "ceph.", 5) == 0) {
13510 const VXattr *vxattr = _get_vxattrs(in);
13511 if (vxattr) {
13512 while (!vxattr->name.empty()) {
13513 if (vxattr->name == name)
13514 return vxattr;
13515 vxattr++;
13516 }
13517 }
adb31ebb
TL
13518
13519 // for common vxattrs
13520 vxattr = _common_vxattrs;
13521 while (!vxattr->name.empty()) {
13522 if (vxattr->name == name)
13523 return vxattr;
13524 vxattr++;
13525 }
7c673cae 13526 }
adb31ebb 13527
7c673cae
FG
13528 return NULL;
13529}
13530
7c673cae
FG
13531int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
13532{
f67539c2
TL
13533 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13534 if (!mref_reader.is_state_satisfied())
13535 return -CEPHFS_ENOTCONN;
181888fb 13536
7c673cae
FG
13537 vinodeno_t vino = _get_vino(in);
13538
13539 ldout(cct, 3) << "ll_readlink " << vino << dendl;
13540 tout(cct) << "ll_readlink" << std::endl;
13541 tout(cct) << vino.ino.val << std::endl;
13542
f67539c2 13543 std::scoped_lock lock(client_lock);
11fdf7f2
TL
13544 for (auto dn : in->dentries) {
13545 touch_dn(dn);
7c673cae
FG
13546 }
13547
13548 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
13549 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
13550 return r;
13551}
13552
13553int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
13554 const UserPerm& perms, InodeRef *inp)
13555{
1adf2230 13556 ldout(cct, 8) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
7c673cae
FG
13557 << mode << dec << ", " << rdev << ", uid " << perms.uid()
13558 << ", gid " << perms.gid() << ")" << dendl;
13559
13560 if (strlen(name) > NAME_MAX)
f67539c2 13561 return -CEPHFS_ENAMETOOLONG;
7c673cae
FG
13562
13563 if (dir->snapid != CEPH_NOSNAP) {
f67539c2 13564 return -CEPHFS_EROFS;
7c673cae
FG
13565 }
13566 if (is_quota_files_exceeded(dir, perms)) {
f67539c2 13567 return -CEPHFS_EDQUOT;
7c673cae
FG
13568 }
13569
13570 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);
13571
13572 filepath path;
13573 dir->make_nosnap_relative_path(path);
13574 path.push_dentry(name);
13575 req->set_filepath(path);
13576 req->set_inode(dir);
13577 req->head.args.mknod.rdev = rdev;
13578 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13579 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13580
13581 bufferlist xattrs_bl;
13582 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
1e59de90
TL
13583 if (res < 0) {
13584 put_request(req);
13585 return res;
13586 }
7c673cae
FG
13587 req->head.args.mknod.mode = mode;
13588 if (xattrs_bl.length() > 0)
13589 req->set_data(xattrs_bl);
13590
1e59de90 13591 Dentry *de = get_or_create(dir, name);
7c673cae
FG
13592 req->set_dentry(de);
13593
13594 res = make_request(req, perms, inp);
13595
13596 trim_cache();
13597
1adf2230 13598 ldout(cct, 8) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
7c673cae 13599 return res;
7c673cae
FG
13600}
13601
13602int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
13603 dev_t rdev, struct stat *attr, Inode **out,
13604 const UserPerm& perms)
13605{
f67539c2
TL
13606 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13607 if (!mref_reader.is_state_satisfied())
13608 return -CEPHFS_ENOTCONN;
181888fb 13609
7c673cae
FG
13610 vinodeno_t vparent = _get_vino(parent);
13611
13612 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
13613 tout(cct) << "ll_mknod" << std::endl;
13614 tout(cct) << vparent.ino.val << std::endl;
13615 tout(cct) << name << std::endl;
13616 tout(cct) << mode << std::endl;
13617 tout(cct) << rdev << std::endl;
13618
f67539c2 13619 std::scoped_lock lock(client_lock);
11fdf7f2 13620 if (!fuse_default_permissions) {
7c673cae
FG
13621 int r = may_create(parent, perms);
13622 if (r < 0)
13623 return r;
13624 }
13625
13626 InodeRef in;
13627 int r = _mknod(parent, name, mode, rdev, perms, &in);
13628 if (r == 0) {
13629 fill_stat(in, attr);
13630 _ll_get(in.get());
13631 }
13632 tout(cct) << attr->st_ino << std::endl;
13633 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
13634 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13635 *out = in.get();
13636 return r;
13637}
13638
13639int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
13640 dev_t rdev, Inode **out,
13641 struct ceph_statx *stx, unsigned want, unsigned flags,
13642 const UserPerm& perms)
13643{
f67539c2
TL
13644 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13645 if (!mref_reader.is_state_satisfied())
13646 return -CEPHFS_ENOTCONN;
7c673cae 13647
f67539c2 13648 unsigned caps = statx_to_mask(flags, want);
181888fb 13649
7c673cae
FG
13650 vinodeno_t vparent = _get_vino(parent);
13651
13652 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
13653 tout(cct) << "ll_mknodx" << std::endl;
13654 tout(cct) << vparent.ino.val << std::endl;
13655 tout(cct) << name << std::endl;
13656 tout(cct) << mode << std::endl;
13657 tout(cct) << rdev << std::endl;
13658
f67539c2
TL
13659 std::scoped_lock lock(client_lock);
13660
11fdf7f2 13661 if (!fuse_default_permissions) {
7c673cae
FG
13662 int r = may_create(parent, perms);
13663 if (r < 0)
13664 return r;
13665 }
13666
13667 InodeRef in;
13668 int r = _mknod(parent, name, mode, rdev, perms, &in);
13669 if (r == 0) {
13670 fill_statx(in, caps, stx);
13671 _ll_get(in.get());
13672 }
13673 tout(cct) << stx->stx_ino << std::endl;
13674 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
13675 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13676 *out = in.get();
13677 return r;
13678}
13679
13680int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
13681 InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
13682 int object_size, const char *data_pool, bool *created,
f67539c2 13683 const UserPerm& perms, std::string alternate_name)
7c673cae 13684{
1adf2230 13685 ldout(cct, 8) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
7c673cae
FG
13686 mode << dec << ")" << dendl;
13687
13688 if (strlen(name) > NAME_MAX)
f67539c2 13689 return -CEPHFS_ENAMETOOLONG;
7c673cae 13690 if (dir->snapid != CEPH_NOSNAP) {
f67539c2 13691 return -CEPHFS_EROFS;
7c673cae
FG
13692 }
13693 if (is_quota_files_exceeded(dir, perms)) {
f67539c2 13694 return -CEPHFS_EDQUOT;
7c673cae
FG
13695 }
13696
13697 // use normalized flags to generate cmode
11fdf7f2
TL
13698 int cflags = ceph_flags_sys2wire(flags);
13699 if (cct->_conf.get_val<bool>("client_force_lazyio"))
13700 cflags |= CEPH_O_LAZY;
13701
13702 int cmode = ceph_flags_to_mode(cflags);
7c673cae
FG
13703
13704 int64_t pool_id = -1;
13705 if (data_pool && *data_pool) {
13706 pool_id = objecter->with_osdmap(
13707 std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
13708 if (pool_id < 0)
f67539c2 13709 return -CEPHFS_EINVAL;
7c673cae 13710 if (pool_id > 0xffffffffll)
f67539c2 13711 return -CEPHFS_ERANGE; // bummer!
7c673cae
FG
13712 }
13713
13714 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);
13715
13716 filepath path;
13717 dir->make_nosnap_relative_path(path);
13718 path.push_dentry(name);
13719 req->set_filepath(path);
f67539c2 13720 req->set_alternate_name(std::move(alternate_name));
7c673cae 13721 req->set_inode(dir);
11fdf7f2 13722 req->head.args.open.flags = cflags | CEPH_O_CREAT;
7c673cae
FG
13723
13724 req->head.args.open.stripe_unit = stripe_unit;
13725 req->head.args.open.stripe_count = stripe_count;
13726 req->head.args.open.object_size = object_size;
13727 if (cct->_conf->client_debug_getattr_caps)
13728 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
13729 else
13730 req->head.args.open.mask = 0;
13731 req->head.args.open.pool = pool_id;
13732 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13733 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13734
13735 mode |= S_IFREG;
13736 bufferlist xattrs_bl;
13737 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
1e59de90
TL
13738 if (res < 0) {
13739 put_request(req);
13740 return res;
13741 }
7c673cae
FG
13742 req->head.args.open.mode = mode;
13743 if (xattrs_bl.length() > 0)
13744 req->set_data(xattrs_bl);
13745
1e59de90 13746 Dentry *de = get_or_create(dir, name);
7c673cae
FG
13747 req->set_dentry(de);
13748
13749 res = make_request(req, perms, inp, created);
13750 if (res < 0) {
13751 goto reply_error;
13752 }
13753
13754 /* If the caller passed a value in fhp, do the open */
13755 if(fhp) {
13756 (*inp)->get_open_ref(cmode);
13757 *fhp = _create_fh(inp->get(), flags, cmode, perms);
13758 }
13759
13760 reply_error:
13761 trim_cache();
13762
1adf2230 13763 ldout(cct, 8) << "create(" << path << ", 0" << oct << mode << dec
7c673cae
FG
13764 << " layout " << stripe_unit
13765 << ' ' << stripe_count
13766 << ' ' << object_size
13767 <<") = " << res << dendl;
13768 return res;
7c673cae
FG
13769}
13770
7c673cae 13771int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
f67539c2
TL
13772 InodeRef *inp, const std::map<std::string, std::string> &metadata,
13773 std::string alternate_name)
7c673cae 13774{
1adf2230 13775 ldout(cct, 8) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
7c673cae
FG
13776 << mode << dec << ", uid " << perm.uid()
13777 << ", gid " << perm.gid() << ")" << dendl;
13778
13779 if (strlen(name) > NAME_MAX)
f67539c2 13780 return -CEPHFS_ENAMETOOLONG;
7c673cae
FG
13781
13782 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
f67539c2 13783 return -CEPHFS_EROFS;
7c673cae
FG
13784 }
13785 if (is_quota_files_exceeded(dir, perm)) {
f67539c2 13786 return -CEPHFS_EDQUOT;
7c673cae 13787 }
f67539c2
TL
13788
13789 bool is_snap_op = dir->snapid == CEPH_SNAPDIR;
13790 MetaRequest *req = new MetaRequest(is_snap_op ?
7c673cae
FG
13791 CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);
13792
13793 filepath path;
13794 dir->make_nosnap_relative_path(path);
13795 path.push_dentry(name);
13796 req->set_filepath(path);
13797 req->set_inode(dir);
13798 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13799 req->dentry_unless = CEPH_CAP_FILE_EXCL;
f67539c2 13800 req->set_alternate_name(std::move(alternate_name));
7c673cae
FG
13801
13802 mode |= S_IFDIR;
f67539c2
TL
13803 bufferlist bl;
13804 int res = _posix_acl_create(dir, &mode, bl, perm);
1e59de90
TL
13805 if (res < 0) {
13806 put_request(req);
13807 return res;
13808 }
7c673cae 13809 req->head.args.mkdir.mode = mode;
f67539c2
TL
13810 if (is_snap_op) {
13811 SnapPayload payload;
13812 // clear the bufferlist that may have been populated by the call
13813 // to _posix_acl_create(). MDS mksnap does not make use of it.
13814 // So, reuse it to pass metadata payload.
13815 bl.clear();
13816 payload.metadata = metadata;
13817 encode(payload, bl);
13818 }
13819 if (bl.length() > 0) {
13820 req->set_data(bl);
13821 }
7c673cae 13822
1e59de90 13823 Dentry *de = get_or_create(dir, name);
7c673cae 13824 req->set_dentry(de);
1e59de90 13825
7c673cae
FG
13826 ldout(cct, 10) << "_mkdir: making request" << dendl;
13827 res = make_request(req, perm, inp);
13828 ldout(cct, 10) << "_mkdir result is " << res << dendl;
13829
13830 trim_cache();
13831
1adf2230 13832 ldout(cct, 8) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
7c673cae 13833 return res;
7c673cae
FG
13834}
13835
13836int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
13837 struct stat *attr, Inode **out, const UserPerm& perm)
13838{
f67539c2
TL
13839 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13840 if (!mref_reader.is_state_satisfied())
13841 return -CEPHFS_ENOTCONN;
181888fb 13842
7c673cae
FG
13843 vinodeno_t vparent = _get_vino(parent);
13844
13845 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
13846 tout(cct) << "ll_mkdir" << std::endl;
13847 tout(cct) << vparent.ino.val << std::endl;
13848 tout(cct) << name << std::endl;
13849 tout(cct) << mode << std::endl;
13850
f67539c2
TL
13851 std::scoped_lock lock(client_lock);
13852
11fdf7f2 13853 if (!fuse_default_permissions) {
7c673cae
FG
13854 int r = may_create(parent, perm);
13855 if (r < 0)
13856 return r;
13857 }
13858
13859 InodeRef in;
13860 int r = _mkdir(parent, name, mode, perm, &in);
13861 if (r == 0) {
13862 fill_stat(in, attr);
13863 _ll_get(in.get());
13864 }
13865 tout(cct) << attr->st_ino << std::endl;
13866 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
13867 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13868 *out = in.get();
13869 return r;
13870}
13871
13872int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
13873 struct ceph_statx *stx, unsigned want, unsigned flags,
13874 const UserPerm& perms)
13875{
f67539c2
TL
13876 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13877 if (!mref_reader.is_state_satisfied())
13878 return -CEPHFS_ENOTCONN;
181888fb 13879
7c673cae
FG
13880 vinodeno_t vparent = _get_vino(parent);
13881
13882 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
13883 tout(cct) << "ll_mkdirx" << std::endl;
13884 tout(cct) << vparent.ino.val << std::endl;
13885 tout(cct) << name << std::endl;
13886 tout(cct) << mode << std::endl;
13887
f67539c2
TL
13888 std::scoped_lock lock(client_lock);
13889
11fdf7f2 13890 if (!fuse_default_permissions) {
7c673cae
FG
13891 int r = may_create(parent, perms);
13892 if (r < 0)
13893 return r;
13894 }
13895
13896 InodeRef in;
13897 int r = _mkdir(parent, name, mode, perms, &in);
13898 if (r == 0) {
13899 fill_statx(in, statx_to_mask(flags, want), stx);
13900 _ll_get(in.get());
13901 } else {
13902 stx->stx_ino = 0;
13903 stx->stx_mask = 0;
13904 }
13905 tout(cct) << stx->stx_ino << std::endl;
13906 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
13907 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
13908 *out = in.get();
13909 return r;
13910}
13911
13912int Client::_symlink(Inode *dir, const char *name, const char *target,
f67539c2 13913 const UserPerm& perms, std::string alternate_name, InodeRef *inp)
7c673cae 13914{
1adf2230 13915 ldout(cct, 8) << "_symlink(" << dir->ino << " " << name << ", " << target
7c673cae
FG
13916 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
13917 << dendl;
13918
13919 if (strlen(name) > NAME_MAX)
f67539c2 13920 return -CEPHFS_ENAMETOOLONG;
7c673cae
FG
13921
13922 if (dir->snapid != CEPH_NOSNAP) {
f67539c2 13923 return -CEPHFS_EROFS;
7c673cae
FG
13924 }
13925 if (is_quota_files_exceeded(dir, perms)) {
f67539c2 13926 return -CEPHFS_EDQUOT;
7c673cae
FG
13927 }
13928
13929 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
13930
13931 filepath path;
13932 dir->make_nosnap_relative_path(path);
13933 path.push_dentry(name);
13934 req->set_filepath(path);
f67539c2 13935 req->set_alternate_name(std::move(alternate_name));
7c673cae
FG
13936 req->set_inode(dir);
13937 req->set_string2(target);
13938 req->dentry_drop = CEPH_CAP_FILE_SHARED;
13939 req->dentry_unless = CEPH_CAP_FILE_EXCL;
13940
1e59de90 13941 Dentry *de = get_or_create(dir, name);
7c673cae
FG
13942 req->set_dentry(de);
13943
1e59de90 13944 int res = make_request(req, perms, inp);
7c673cae
FG
13945
13946 trim_cache();
1adf2230 13947 ldout(cct, 8) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
7c673cae
FG
13948 res << dendl;
13949 return res;
7c673cae
FG
13950}
13951
13952int Client::ll_symlink(Inode *parent, const char *name, const char *value,
13953 struct stat *attr, Inode **out, const UserPerm& perms)
13954{
f67539c2
TL
13955 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13956 if (!mref_reader.is_state_satisfied())
13957 return -CEPHFS_ENOTCONN;
181888fb 13958
7c673cae
FG
13959 vinodeno_t vparent = _get_vino(parent);
13960
13961 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
13962 << dendl;
13963 tout(cct) << "ll_symlink" << std::endl;
13964 tout(cct) << vparent.ino.val << std::endl;
13965 tout(cct) << name << std::endl;
13966 tout(cct) << value << std::endl;
13967
f67539c2
TL
13968 std::scoped_lock lock(client_lock);
13969
11fdf7f2 13970 if (!fuse_default_permissions) {
7c673cae
FG
13971 int r = may_create(parent, perms);
13972 if (r < 0)
13973 return r;
13974 }
13975
13976 InodeRef in;
f67539c2 13977 int r = _symlink(parent, name, value, perms, "", &in);
7c673cae
FG
13978 if (r == 0) {
13979 fill_stat(in, attr);
13980 _ll_get(in.get());
13981 }
13982 tout(cct) << attr->st_ino << std::endl;
13983 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
13984 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
13985 *out = in.get();
13986 return r;
13987}
13988
13989int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
13990 Inode **out, struct ceph_statx *stx, unsigned want,
13991 unsigned flags, const UserPerm& perms)
13992{
f67539c2
TL
13993 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
13994 if (!mref_reader.is_state_satisfied())
13995 return -CEPHFS_ENOTCONN;
181888fb 13996
7c673cae
FG
13997 vinodeno_t vparent = _get_vino(parent);
13998
13999 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
14000 << dendl;
14001 tout(cct) << "ll_symlinkx" << std::endl;
14002 tout(cct) << vparent.ino.val << std::endl;
14003 tout(cct) << name << std::endl;
14004 tout(cct) << value << std::endl;
14005
f67539c2
TL
14006 std::scoped_lock lock(client_lock);
14007
11fdf7f2 14008 if (!fuse_default_permissions) {
7c673cae
FG
14009 int r = may_create(parent, perms);
14010 if (r < 0)
14011 return r;
14012 }
14013
14014 InodeRef in;
f67539c2 14015 int r = _symlink(parent, name, value, perms, "", &in);
7c673cae
FG
14016 if (r == 0) {
14017 fill_statx(in, statx_to_mask(flags, want), stx);
14018 _ll_get(in.get());
14019 }
14020 tout(cct) << stx->stx_ino << std::endl;
14021 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
14022 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
14023 *out = in.get();
14024 return r;
14025}
14026
14027int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
14028{
1adf2230 14029 ldout(cct, 8) << "_unlink(" << dir->ino << " " << name
7c673cae
FG
14030 << " uid " << perm.uid() << " gid " << perm.gid()
14031 << ")" << dendl;
14032
14033 if (dir->snapid != CEPH_NOSNAP) {
f67539c2 14034 return -CEPHFS_EROFS;
7c673cae
FG
14035 }
14036
14037 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
14038
14039 filepath path;
14040 dir->make_nosnap_relative_path(path);
14041 path.push_dentry(name);
14042 req->set_filepath(path);
14043
14044 InodeRef otherin;
b32b8144 14045 Inode *in;
1e59de90 14046 Dentry *de = get_or_create(dir, name);
7c673cae
FG
14047 req->set_dentry(de);
14048 req->dentry_drop = CEPH_CAP_FILE_SHARED;
14049 req->dentry_unless = CEPH_CAP_FILE_EXCL;
14050
1e59de90
TL
14051 int res = _lookup(dir, name, 0, &otherin, perm);
14052 if (res < 0) {
14053 put_request(req);
14054 return res;
14055 }
b32b8144
FG
14056
14057 in = otherin.get();
14058 req->set_other_inode(in);
14059 in->break_all_delegs();
7c673cae
FG
14060 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
14061
14062 req->set_inode(dir);
14063
14064 res = make_request(req, perm);
14065
14066 trim_cache();
1adf2230 14067 ldout(cct, 8) << "unlink(" << path << ") = " << res << dendl;
7c673cae 14068 return res;
7c673cae
FG
14069}
14070
14071int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
14072{
f67539c2
TL
14073 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14074 if (!mref_reader.is_state_satisfied())
14075 return -CEPHFS_ENOTCONN;
181888fb 14076
7c673cae
FG
14077 vinodeno_t vino = _get_vino(in);
14078
14079 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
14080 tout(cct) << "ll_unlink" << std::endl;
14081 tout(cct) << vino.ino.val << std::endl;
14082 tout(cct) << name << std::endl;
14083
f67539c2
TL
14084 std::scoped_lock lock(client_lock);
14085
11fdf7f2 14086 if (!fuse_default_permissions) {
7c673cae
FG
14087 int r = may_delete(in, name, perm);
14088 if (r < 0)
14089 return r;
14090 }
14091 return _unlink(in, name, perm);
14092}
14093
14094int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
14095{
1adf2230 14096 ldout(cct, 8) << "_rmdir(" << dir->ino << " " << name << " uid "
7c673cae
FG
14097 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
14098
14099 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
f67539c2 14100 return -CEPHFS_EROFS;
7c673cae 14101 }
1e59de90 14102
b32b8144
FG
14103 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP : CEPH_MDS_OP_RMDIR;
14104 MetaRequest *req = new MetaRequest(op);
7c673cae
FG
14105 filepath path;
14106 dir->make_nosnap_relative_path(path);
14107 path.push_dentry(name);
14108 req->set_filepath(path);
11fdf7f2 14109 req->set_inode(dir);
7c673cae
FG
14110
14111 req->dentry_drop = CEPH_CAP_FILE_SHARED;
14112 req->dentry_unless = CEPH_CAP_FILE_EXCL;
14113 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
14114
14115 InodeRef in;
14116
1e59de90
TL
14117 Dentry *de = get_or_create(dir, name);
14118 if (op == CEPH_MDS_OP_RMDIR)
b32b8144
FG
14119 req->set_dentry(de);
14120 else
14121 de->get();
14122
1e59de90
TL
14123 int res = _lookup(dir, name, 0, &in, perms);
14124 if (res < 0) {
14125 put_request(req);
14126 return res;
14127 }
11fdf7f2
TL
14128
14129 if (op == CEPH_MDS_OP_RMSNAP) {
7c673cae 14130 unlink(de, true, true);
b32b8144 14131 de->put();
7c673cae 14132 }
11fdf7f2 14133 req->set_other_inode(in.get());
7c673cae
FG
14134
14135 res = make_request(req, perms);
14136
14137 trim_cache();
1adf2230 14138 ldout(cct, 8) << "rmdir(" << path << ") = " << res << dendl;
7c673cae 14139 return res;
7c673cae
FG
14140}
14141
14142int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
14143{
f67539c2
TL
14144 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14145 if (!mref_reader.is_state_satisfied())
14146 return -CEPHFS_ENOTCONN;
181888fb 14147
7c673cae
FG
14148 vinodeno_t vino = _get_vino(in);
14149
14150 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
14151 tout(cct) << "ll_rmdir" << std::endl;
14152 tout(cct) << vino.ino.val << std::endl;
14153 tout(cct) << name << std::endl;
14154
f67539c2
TL
14155 std::scoped_lock lock(client_lock);
14156
11fdf7f2 14157 if (!fuse_default_permissions) {
7c673cae
FG
14158 int r = may_delete(in, name, perms);
14159 if (r < 0)
14160 return r;
14161 }
14162
14163 return _rmdir(in, name, perms);
14164}
14165
/*
 * Rename @fromname in @fromdir to @toname in @todir on behalf of @perm.
 *
 * Both directories must be in the same snapshot context; a rename inside
 * the .snap pseudo-directory of a single dir becomes RENAMESNAP, any
 * other snapshotted rename is refused (-CEPHFS_EROFS).  With quotas on,
 * crossing quota roots is refused with -CEPHFS_EXDEV so userspace falls
 * back to copy+unlink.  @alternate_name is forwarded to the MDS with the
 * new dentry.  Returns 0 or a negative CEPHFS error.
 */
int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm, std::string alternate_name)
{
  ldout(cct, 8) << "_rename(" << fromdir->ino << " " << fromname << " to "
		<< todir->ino << " " << toname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")"
		<< dendl;

  // cannot rename across different snapshot contexts
  if (fromdir->snapid != todir->snapid)
    return -CEPHFS_EXDEV;

  int op = CEPH_MDS_OP_RENAME;
  if (fromdir->snapid != CEPH_NOSNAP) {
    // only renaming a snapshot within one .snap dir is allowed
    if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
      op = CEPH_MDS_OP_RENAMESNAP;
    else
      return -CEPHFS_EROFS;
  }
  if (cct->_conf.get_val<bool>("client_quota") && fromdir != todir) {
    // refuse to rename across quota roots; caller falls back to copy
    Inode *fromdir_root =
      fromdir->quota.is_enabled(QUOTA_MAX_FILES) ? fromdir : get_quota_root(fromdir, perm, QUOTA_MAX_FILES);
    Inode *todir_root =
      todir->quota.is_enabled(QUOTA_MAX_FILES) ? todir : get_quota_root(todir, perm, QUOTA_MAX_FILES);
    if (fromdir_root != todir_root) {
      return -CEPHFS_EXDEV;
    }
  }

  InodeRef target;
  MetaRequest *req = new MetaRequest(op);

  filepath from;
  fromdir->make_nosnap_relative_path(from);
  from.push_dentry(fromname);
  filepath to;
  todir->make_nosnap_relative_path(to);
  to.push_dentry(toname);
  req->set_filepath(to);
  req->set_filepath2(from);
  req->set_alternate_name(std::move(alternate_name));

  Dentry *oldde = get_or_create(fromdir, fromname);
  Dentry *de = get_or_create(todir, toname);

  int res;
  if (op == CEPH_MDS_OP_RENAME) {
    // normal rename: set up cap drops on both dentries so the MDS can
    // revoke shared caps unless we hold them exclusively
    req->set_old_dentry(oldde);
    req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
    req->old_dentry_unless = CEPH_CAP_FILE_EXCL;

    req->set_dentry(de);
    req->dentry_drop = CEPH_CAP_FILE_SHARED;
    req->dentry_unless = CEPH_CAP_FILE_EXCL;

    InodeRef oldin, otherin;
    res = _lookup(fromdir, fromname, 0, &oldin, perm);
    if (res < 0)
      goto fail;

    // the source inode loses any delegations before it is moved
    Inode *oldinode = oldin.get();
    oldinode->break_all_delegs();
    req->set_old_inode(oldinode);
    req->old_inode_drop = CEPH_CAP_LINK_SHARED;

    // if the destination name exists it will be clobbered; break its
    // delegations too.  ENOENT is fine, any other error aborts.
    res = _lookup(todir, toname, 0, &otherin, perm);
    switch (res) {
    case 0:
      {
	Inode *in = otherin.get();
	req->set_other_inode(in);
	in->break_all_delegs();
      }
      req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
      break;
    case -CEPHFS_ENOENT:
      break;
    default:
      goto fail;
    }

    req->set_inode(todir);
  } else {
    // renamesnap reply contains no tracedn, so we need to invalidate
    // dentry manually
    unlink(oldde, true, true);
    unlink(de, true, true);

    req->set_inode(todir);
  }

  res = make_request(req, perm, &target);
  ldout(cct, 10) << "rename result is " << res << dendl;

  // renamed item from our cache

  trim_cache();
  ldout(cct, 8) << "_rename(" << from << ", " << to << ") = " << res << dendl;
  return res;

 fail:
  put_request(req);
  return res;
}
14268
14269int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
14270 const char *newname, const UserPerm& perm)
14271{
f67539c2
TL
14272 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14273 if (!mref_reader.is_state_satisfied())
14274 return -CEPHFS_ENOTCONN;
181888fb 14275
7c673cae
FG
14276 vinodeno_t vparent = _get_vino(parent);
14277 vinodeno_t vnewparent = _get_vino(newparent);
14278
14279 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
14280 << vnewparent << " " << newname << dendl;
14281 tout(cct) << "ll_rename" << std::endl;
14282 tout(cct) << vparent.ino.val << std::endl;
14283 tout(cct) << name << std::endl;
14284 tout(cct) << vnewparent.ino.val << std::endl;
14285 tout(cct) << newname << std::endl;
14286
f67539c2
TL
14287 std::scoped_lock lock(client_lock);
14288
11fdf7f2 14289 if (!fuse_default_permissions) {
7c673cae
FG
14290 int r = may_delete(parent, name, perm);
14291 if (r < 0)
14292 return r;
14293 r = may_delete(newparent, newname, perm);
f67539c2 14294 if (r < 0 && r != -CEPHFS_ENOENT)
7c673cae
FG
14295 return r;
14296 }
14297
f67539c2 14298 return _rename(parent, name, newparent, newname, perm, "");
7c673cae
FG
14299}
14300
/*
 * Create a hard link to @in named @newname inside directory @dir.
 *
 * Fails with -CEPHFS_ENAMETOOLONG for over-long names, -CEPHFS_EROFS
 * when either inode lives in a snapshot, and -CEPHFS_EDQUOT when the
 * directory's file-count quota is exhausted.  @alternate_name is
 * forwarded to the MDS alongside the new dentry; on success *inp (if
 * requested by make_request) references the linked inode.
 */
int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, std::string alternate_name, InodeRef *inp)
{
  ldout(cct, 8) << "_link(" << in->ino << " to " << dir->ino << " " << newname
		<< " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;

  if (strlen(newname) > NAME_MAX)
    return -CEPHFS_ENAMETOOLONG;

  // snapshots are read-only; no new links inside them
  if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
    return -CEPHFS_EROFS;
  }
  if (is_quota_files_exceeded(dir, perm)) {
    return -CEPHFS_EDQUOT;
  }

  // adding a link invalidates any delegations held on the inode
  in->break_all_delegs();
  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);

  // filepath = new name relative to the target directory,
  // filepath2 = the existing inode being linked
  filepath path(newname, dir->ino);
  req->set_filepath(path);
  req->set_alternate_name(std::move(alternate_name));
  filepath existing(in->ino);
  req->set_filepath2(existing);

  req->set_inode(dir);
  req->inode_drop = CEPH_CAP_FILE_SHARED;
  req->inode_unless = CEPH_CAP_FILE_EXCL;

  Dentry *de = get_or_create(dir, newname);
  req->set_dentry(de);

  int res = make_request(req, perm, inp);
  ldout(cct, 10) << "link result is " << res << dendl;

  trim_cache();
  ldout(cct, 8) << "link(" << existing << ", " << path << ") = " << res << dendl;
  return res;
}
14339
14340int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
14341 const UserPerm& perm)
14342{
f67539c2
TL
14343 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14344 if (!mref_reader.is_state_satisfied())
14345 return -CEPHFS_ENOTCONN;
181888fb 14346
7c673cae
FG
14347 vinodeno_t vino = _get_vino(in);
14348 vinodeno_t vnewparent = _get_vino(newparent);
14349
31f18b77 14350 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
7c673cae
FG
14351 newname << dendl;
14352 tout(cct) << "ll_link" << std::endl;
14353 tout(cct) << vino.ino.val << std::endl;
14354 tout(cct) << vnewparent << std::endl;
14355 tout(cct) << newname << std::endl;
14356
7c673cae
FG
14357 InodeRef target;
14358
f67539c2
TL
14359 std::scoped_lock lock(client_lock);
14360
11fdf7f2 14361 if (!fuse_default_permissions) {
7c673cae 14362 if (S_ISDIR(in->mode))
f67539c2 14363 return -CEPHFS_EPERM;
7c673cae 14364
11fdf7f2 14365 int r = may_hardlink(in, perm);
7c673cae
FG
14366 if (r < 0)
14367 return r;
14368
14369 r = may_create(newparent, perm);
14370 if (r < 0)
14371 return r;
14372 }
14373
f67539c2 14374 return _link(in, newparent, newname, perm, "", &target);
7c673cae
FG
14375}
14376
14377int Client::ll_num_osds(void)
14378{
f67539c2 14379 std::scoped_lock lock(client_lock);
7c673cae
FG
14380 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
14381}
14382
/*
 * Fetch OSD @osd's first address and store its IPv4 address (converted
 * to host byte order) in *addr.  Returns 0 on success, -1 if the OSD
 * does not exist in the current osdmap.
 *
 * NOTE(review): this unconditionally reads in4_addr() from the first
 * address — assumes a v4-reachable OSD; behavior on a v6-only address
 * should be confirmed with callers.
 */
int Client::ll_osdaddr(int osd, uint32_t *addr)
{
  std::scoped_lock lock(client_lock);

  entity_addr_t g;
  // probe the osdmap under the objecter's own locking and copy the
  // address out so we don't hold the map reference afterwards
  bool exists = objecter->with_osdmap([&](const OSDMap& o) {
      if (!o.exists(osd))
	return false;
      g = o.get_addrs(osd).front();
      return true;
    });
  if (!exists)
    return -1;
  uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;  // network byte order
  *addr = ntohl(nb_addr);
  return 0;
}
181888fb 14400
7c673cae
FG
14401uint32_t Client::ll_stripe_unit(Inode *in)
14402{
f67539c2 14403 std::scoped_lock lock(client_lock);
7c673cae
FG
14404 return in->layout.stripe_unit;
14405}
14406
14407uint64_t Client::ll_snap_seq(Inode *in)
14408{
f67539c2 14409 std::scoped_lock lock(client_lock);
7c673cae
FG
14410 return in->snaprealm->seq;
14411}
14412
14413int Client::ll_file_layout(Inode *in, file_layout_t *layout)
14414{
f67539c2 14415 std::scoped_lock lock(client_lock);
7c673cae
FG
14416 *layout = in->layout;
14417 return 0;
14418}
14419
14420int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
14421{
14422 return ll_file_layout(fh->inode.get(), layout);
14423}
14424
/* Currently we cannot take advantage of redundancy in reads, since we
   would have to go through all possible placement groups (a
   potentially quite large number determined by a hash), and use CRUSH
   to calculate the appropriate set of OSDs for each placement group,
   then index into that. An array with one entry per OSD is much more
   tractable and works for demonstration purposes. */

/*
 * Map logical block @blockno of @in (striped per @layout) to the
 * primary OSD currently serving it, as reported by the osdmap.
 *
 * NOTE(review): stripes_per_object = object_size / stripe_unit with no
 * guard against stripe_unit == 0 — assumes layouts were validated when
 * set; confirm before exposing to untrusted layouts.
 */
int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
			      file_layout_t* layout)
{
  std::scoped_lock lock(client_lock);

  inodeno_t ino = in->ino;
  uint32_t object_size = layout->object_size;
  uint32_t su = layout->stripe_unit;
  uint32_t stripe_count = layout->stripe_count;
  uint64_t stripes_per_object = object_size / su;
  uint64_t stripeno = 0, stripepos = 0;

  // stripe_count == 0 leaves stripeno/stripepos at 0 (degenerate layout)
  if(stripe_count) {
    stripeno = blockno / stripe_count; // which horizontal stripe (Y)
    stripepos = blockno % stripe_count; // which object in the object set (X)
  }
  uint64_t objectsetno = stripeno / stripes_per_object; // which object set
  uint64_t objectno = objectsetno * stripe_count + stripepos; // object id

  // resolve the object to a PG, then to its acting set's primary
  object_t oid = file_object_t(ino, objectno);
  return objecter->with_osdmap([&](const OSDMap& o) {
      ceph_object_layout olayout =
	o.file_to_object_layout(oid, *layout);
      pg_t pg = (pg_t)olayout.ol_pgid;
      vector<int> osds;
      int primary;
      o.pg_to_acting_osds(pg, &osds, &primary);
      return primary;
    });
}
14462
14463/* Return the offset of the block, internal to the object */
14464
14465uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
14466{
f67539c2 14467 std::scoped_lock lock(client_lock);
7c673cae
FG
14468 file_layout_t *layout=&(in->layout);
14469 uint32_t object_size = layout->object_size;
14470 uint32_t su = layout->stripe_unit;
14471 uint64_t stripes_per_object = object_size / su;
14472
14473 return (blockno % stripes_per_object) * su;
14474}
14475
14476int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
14477 const UserPerm& perms)
14478{
f67539c2
TL
14479 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14480 if (!mref_reader.is_state_satisfied())
14481 return -CEPHFS_ENOTCONN;
181888fb 14482
7c673cae
FG
14483 vinodeno_t vino = _get_vino(in);
14484
14485 ldout(cct, 3) << "ll_opendir " << vino << dendl;
14486 tout(cct) << "ll_opendir" << std::endl;
14487 tout(cct) << vino.ino.val << std::endl;
14488
f67539c2
TL
14489 std::scoped_lock lock(client_lock);
14490
11fdf7f2 14491 if (!fuse_default_permissions) {
7c673cae
FG
14492 int r = may_open(in, flags, perms);
14493 if (r < 0)
14494 return r;
14495 }
14496
14497 int r = _opendir(in, dirpp, perms);
f67539c2 14498 tout(cct) << (uintptr_t)*dirpp << std::endl;
7c673cae
FG
14499
14500 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
14501 << dendl;
14502 return r;
14503}
14504
14505int Client::ll_releasedir(dir_result_t *dirp)
14506{
f67539c2
TL
14507 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14508 if (!mref_reader.is_state_satisfied())
14509 return -CEPHFS_ENOTCONN;
14510
7c673cae
FG
14511 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
14512 tout(cct) << "ll_releasedir" << std::endl;
f67539c2 14513 tout(cct) << (uintptr_t)dirp << std::endl;
181888fb 14514
f67539c2 14515 std::scoped_lock lock(client_lock);
181888fb 14516
7c673cae
FG
14517 _closedir(dirp);
14518 return 0;
14519}
14520
14521int Client::ll_fsyncdir(dir_result_t *dirp)
14522{
f67539c2
TL
14523 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14524 if (!mref_reader.is_state_satisfied())
14525 return -CEPHFS_ENOTCONN;
14526
7c673cae
FG
14527 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
14528 tout(cct) << "ll_fsyncdir" << std::endl;
f67539c2 14529 tout(cct) << (uintptr_t)dirp << std::endl;
181888fb 14530
f67539c2 14531 std::scoped_lock lock(client_lock);
7c673cae
FG
14532 return _fsync(dirp->inode.get(), false);
14533}
14534
14535int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
14536{
11fdf7f2 14537 ceph_assert(!(flags & O_CREAT));
7c673cae 14538
f67539c2
TL
14539 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14540 if (!mref_reader.is_state_satisfied())
14541 return -CEPHFS_ENOTCONN;
181888fb 14542
7c673cae
FG
14543 vinodeno_t vino = _get_vino(in);
14544
14545 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
14546 tout(cct) << "ll_open" << std::endl;
14547 tout(cct) << vino.ino.val << std::endl;
14548 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
14549
f67539c2
TL
14550 std::scoped_lock lock(client_lock);
14551
7c673cae 14552 int r;
11fdf7f2 14553 if (!fuse_default_permissions) {
7c673cae
FG
14554 r = may_open(in, flags, perms);
14555 if (r < 0)
14556 goto out;
14557 }
14558
14559 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
14560
14561 out:
14562 Fh *fhptr = fhp ? *fhp : NULL;
14563 if (fhptr) {
14564 ll_unclosed_fh_set.insert(fhptr);
14565 }
f67539c2 14566 tout(cct) << (uintptr_t)fhptr << std::endl;
7c673cae
FG
14567 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
14568 " = " << r << " (" << fhptr << ")" << dendl;
14569 return r;
14570}
14571
/*
 * Shared implementation of ll_create/ll_createx.
 *
 * Looks up @name in @parent first: if it exists and O_CREAT|O_EXCL was
 * requested, fails with -CEPHFS_EEXIST; if absent and O_CREAT is set,
 * creates it via _create() (permission-checked unless delegated to
 * FUSE).  For a pre-existing file a may_open() check is applied and an
 * Fh is opened if _create() did not already supply one.  *fhp is
 * initialized to NULL and, when set on return, the handle is tracked in
 * ll_unclosed_fh_set.  The final inode number (faked if configured) and
 * handle pointer are always traced, even on failure.
 */
int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
		       int flags, InodeRef *in, int caps, Fh **fhp,
		       const UserPerm& perms)
{
  *fhp = NULL;

  vinodeno_t vparent = _get_vino(parent);

  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
    << ", gid " << perms.gid() << dendl;
  tout(cct) << "ll_create" << std::endl;
  tout(cct) << vparent.ino.val << std::endl;
  tout(cct) << name << std::endl;
  tout(cct) << mode << std::endl;
  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;

  bool created = false;
  int r = _lookup(parent, name, caps, in, perms);

  // exclusive create on an existing name must fail
  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
    return -CEPHFS_EEXIST;

  if (r == -CEPHFS_ENOENT && (flags & O_CREAT)) {
    if (!fuse_default_permissions) {
      r = may_create(parent, perms);
      if (r < 0)
	goto out;
    }
    // _create may also open a handle and set *fhp / created
    r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
		perms, "");
    if (r < 0)
      goto out;
  }

  if (r < 0)
    goto out;

  ceph_assert(*in);

  ldout(cct, 20) << "_ll_create created = " << created << dendl;
  if (!created) {
    // pre-existing file: re-check open permission, then open a handle
    // unless _lookup/_create already produced one
    if (!fuse_default_permissions) {
      r = may_open(in->get(), flags, perms);
      if (r < 0) {
	if (*fhp) {
	  int release_r = _release_fh(*fhp);
	  ceph_assert(release_r == 0); // during create, no async data ops should have happened
	}
	goto out;
      }
    }
    if (*fhp == NULL) {
      r = _open(in->get(), flags, mode, fhp, perms);
      if (r < 0)
	goto out;
    }
  }

out:
  // any handle that survived gets tracked for leak detection at unmount
  if (*fhp) {
    ll_unclosed_fh_set.insert(*fhp);
  }

  // Windows has no ino_t; use a fixed-width type there
  #ifdef _WIN32
  uint64_t ino = 0;
  #else
  ino_t ino = 0;
  #endif
  if (r >= 0) {
    Inode *inode = in->get();
    if (use_faked_inos())
      ino = inode->faked_ino;
    else
      ino = inode->ino;
  }

  tout(cct) << (uintptr_t)*fhp << std::endl;
  tout(cct) << ino << std::endl;
  ldout(cct, 8) << "_ll_create " << vparent << " " << name << " 0" << oct <<
    mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
    *fhp << " " << hex << ino << dec << ")" << dendl;

  return r;
}
14657
14658int Client::ll_create(Inode *parent, const char *name, mode_t mode,
14659 int flags, struct stat *attr, Inode **outp, Fh **fhp,
14660 const UserPerm& perms)
14661{
f67539c2
TL
14662 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14663 if (!mref_reader.is_state_satisfied())
14664 return -CEPHFS_ENOTCONN;
7c673cae 14665
f67539c2
TL
14666 std::scoped_lock lock(client_lock);
14667 InodeRef in;
181888fb 14668
7c673cae
FG
14669 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
14670 fhp, perms);
14671 if (r >= 0) {
11fdf7f2 14672 ceph_assert(in);
7c673cae
FG
14673
14674 // passing an Inode in outp requires an additional ref
14675 if (outp) {
14676 _ll_get(in.get());
14677 *outp = in.get();
14678 }
14679 fill_stat(in, attr);
14680 } else {
14681 attr->st_ino = 0;
14682 }
14683
14684 return r;
14685}
14686
14687int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
14688 int oflags, Inode **outp, Fh **fhp,
14689 struct ceph_statx *stx, unsigned want, unsigned lflags,
14690 const UserPerm& perms)
14691{
14692 unsigned caps = statx_to_mask(lflags, want);
f67539c2
TL
14693 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14694 if (!mref_reader.is_state_satisfied())
14695 return -CEPHFS_ENOTCONN;
7c673cae 14696
f67539c2
TL
14697 std::scoped_lock lock(client_lock);
14698 InodeRef in;
7c673cae
FG
14699
14700 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
14701 if (r >= 0) {
11fdf7f2 14702 ceph_assert(in);
7c673cae
FG
14703
14704 // passing an Inode in outp requires an additional ref
14705 if (outp) {
14706 _ll_get(in.get());
14707 *outp = in.get();
14708 }
14709 fill_statx(in, caps, stx);
14710 } else {
14711 stx->stx_ino = 0;
14712 stx->stx_mask = 0;
14713 }
14714
14715 return r;
14716}
14717
14718loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
14719{
f67539c2
TL
14720 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14721 if (!mref_reader.is_state_satisfied())
14722 return -CEPHFS_ENOTCONN;
14723
7c673cae
FG
14724 tout(cct) << "ll_lseek" << std::endl;
14725 tout(cct) << offset << std::endl;
14726 tout(cct) << whence << std::endl;
14727
f67539c2 14728 std::scoped_lock lock(client_lock);
7c673cae
FG
14729 return _lseek(fh, offset, whence);
14730}
14731
14732int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
14733{
f67539c2
TL
14734 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14735 if (!mref_reader.is_state_satisfied())
14736 return -CEPHFS_ENOTCONN;
14737
7c673cae
FG
14738 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl;
14739 tout(cct) << "ll_read" << std::endl;
f67539c2 14740 tout(cct) << (uintptr_t)fh << std::endl;
7c673cae
FG
14741 tout(cct) << off << std::endl;
14742 tout(cct) << len << std::endl;
14743
11fdf7f2
TL
14744 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14745 len = std::min(len, (loff_t)INT_MAX);
f67539c2
TL
14746 std::scoped_lock lock(client_lock);
14747
f6b5b4d7
TL
14748 int r = _read(fh, off, len, bl);
14749 ldout(cct, 3) << "ll_read " << fh << " " << off << "~" << len << " = " << r
14750 << dendl;
14751 return r;
7c673cae
FG
14752}
14753
/*
 * Read up to @length bytes at @offset within the object that backs
 * block @blockid of @in, using @layout to locate the pool.  The read
 * goes straight to the objecter (no page cache) and uses the inode's
 * snapid.  Returns bytes read on success, negative error otherwise.
 * @buf must have room for @length bytes.
 */
int Client::ll_read_block(Inode *in, uint64_t blockid,
			  char *buf,
			  uint64_t offset,
			  uint64_t length,
			  file_layout_t* layout)
{
  RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
  if (!mref_reader.is_state_satisfied())
    return -CEPHFS_ENOTCONN;

  vinodeno_t vino = _get_vino(in);
  object_t oid = file_object_t(vino.ino, blockid);
  C_SaferCond onfinish;
  bufferlist bl;

  objecter->read(oid,
		 object_locator_t(layout->pool_id),
		 offset,
		 length,
		 vino.snapid,
		 &bl,
		 CEPH_OSD_FLAG_READ,
		 &onfinish);

  // block synchronously for the OSD reply (client_lock is not held here)
  int r = onfinish.wait();
  if (r >= 0) {
    // copy whatever the OSD returned; may be shorter than @length
    bl.begin().copy(bl.length(), buf);
    r = bl.length();
  }

  return r;
}
14786
14787/* It appears that the OSD doesn't return success unless the entire
14788 buffer was written, return the write length on success. */
14789
14790int Client::ll_write_block(Inode *in, uint64_t blockid,
14791 char* buf, uint64_t offset,
14792 uint64_t length, file_layout_t* layout,
14793 uint64_t snapseq, uint32_t sync)
14794{
7c673cae 14795 vinodeno_t vino = ll_get_vino(in);
7c673cae 14796 int r = 0;
11fdf7f2 14797 std::unique_ptr<C_SaferCond> onsafe = nullptr;
f67539c2
TL
14798
14799 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14800 if (!mref_reader.is_state_satisfied())
14801 return -CEPHFS_ENOTCONN;
14802
7c673cae 14803 if (length == 0) {
f67539c2 14804 return -CEPHFS_EINVAL;
7c673cae
FG
14805 }
14806 if (true || sync) {
14807 /* if write is stable, the epilogue is waiting on
14808 * flock */
11fdf7f2 14809 onsafe.reset(new C_SaferCond("Client::ll_write_block flock"));
7c673cae
FG
14810 }
14811 object_t oid = file_object_t(vino.ino, blockid);
14812 SnapContext fakesnap;
11fdf7f2
TL
14813 ceph::bufferlist bl;
14814 if (length > 0) {
14815 bl.push_back(buffer::copy(buf, length));
14816 }
7c673cae
FG
14817
14818 ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid
14819 << dendl;
14820
14821 fakesnap.seq = snapseq;
14822
14823 /* lock just in time */
7c673cae
FG
14824 objecter->write(oid,
14825 object_locator_t(layout->pool_id),
14826 offset,
14827 length,
14828 fakesnap,
14829 bl,
14830 ceph::real_clock::now(),
14831 0,
11fdf7f2 14832 onsafe.get());
7c673cae 14833
11fdf7f2
TL
14834 if (nullptr != onsafe) {
14835 r = onsafe->wait();
7c673cae
FG
14836 }
14837
14838 if (r < 0) {
14839 return r;
14840 } else {
14841 return length;
14842 }
14843}
14844
/*
 * Stub: block-commit barriers are not implemented.  The original
 * barrier logic is preserved below in comments; the function always
 * returns 0.
 */
int Client::ll_commit_blocks(Inode *in,
			     uint64_t offset,
			     uint64_t length)
{
  /*
  BarrierContext *bctx;
  vinodeno_t vino = _get_vino(in);
  uint64_t ino = vino.ino;

  ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
		<< offset << " to " << length << dendl;

  if (length == 0) {
    return -CEPHFS_EINVAL;
  }

  std::scoped_lock lock(client_lock);
  map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
  if (p != barriers.end()) {
    barrier_interval civ(offset, offset + length);
    p->second->commit_barrier(civ);
  }
  */
  return 0;
}
14870
14871int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
14872{
7c673cae
FG
14873 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
14874 "~" << len << dendl;
14875 tout(cct) << "ll_write" << std::endl;
f67539c2 14876 tout(cct) << (uintptr_t)fh << std::endl;
7c673cae
FG
14877 tout(cct) << off << std::endl;
14878 tout(cct) << len << std::endl;
14879
f67539c2
TL
14880 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14881 if (!mref_reader.is_state_satisfied())
14882 return -CEPHFS_ENOTCONN;
181888fb 14883
11fdf7f2
TL
14884 /* We can't return bytes written larger than INT_MAX, clamp len to that */
14885 len = std::min(len, (loff_t)INT_MAX);
f67539c2
TL
14886 std::scoped_lock lock(client_lock);
14887
7c673cae
FG
14888 int r = _write(fh, off, len, data, NULL, 0);
14889 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
14890 << dendl;
14891 return r;
14892}
14893
11fdf7f2
TL
14894int64_t Client::ll_writev(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14895{
f67539c2
TL
14896 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14897 if (!mref_reader.is_state_satisfied())
14898 return -CEPHFS_ENOTCONN;
14899
20effc67
TL
14900 std::scoped_lock cl(client_lock);
14901 return _preadv_pwritev_locked(fh, iov, iovcnt, off, true, false);
11fdf7f2
TL
14902}
14903
14904int64_t Client::ll_readv(struct Fh *fh, const struct iovec *iov, int iovcnt, int64_t off)
14905{
f67539c2
TL
14906 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14907 if (!mref_reader.is_state_satisfied())
14908 return -CEPHFS_ENOTCONN;
14909
20effc67
TL
14910 std::scoped_lock cl(client_lock);
14911 return _preadv_pwritev_locked(fh, iov, iovcnt, off, false, false);
11fdf7f2
TL
14912}
14913
7c673cae
FG
14914int Client::ll_flush(Fh *fh)
14915{
f67539c2
TL
14916 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14917 if (!mref_reader.is_state_satisfied())
14918 return -CEPHFS_ENOTCONN;
14919
7c673cae
FG
14920 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
14921 tout(cct) << "ll_flush" << std::endl;
f67539c2 14922 tout(cct) << (uintptr_t)fh << std::endl;
181888fb 14923
f67539c2 14924 std::scoped_lock lock(client_lock);
7c673cae
FG
14925 return _flush(fh);
14926}
14927
14928int Client::ll_fsync(Fh *fh, bool syncdataonly)
14929{
f67539c2
TL
14930 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14931 if (!mref_reader.is_state_satisfied())
14932 return -CEPHFS_ENOTCONN;
14933
7c673cae
FG
14934 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
14935 tout(cct) << "ll_fsync" << std::endl;
f67539c2 14936 tout(cct) << (uintptr_t)fh << std::endl;
181888fb 14937
f67539c2 14938 std::scoped_lock lock(client_lock);
7c673cae
FG
14939 int r = _fsync(fh, syncdataonly);
14940 if (r) {
14941 // If we're returning an error, clear it from the FH
14942 fh->take_async_err();
14943 }
14944 return r;
14945}
14946
28e407b8
AA
14947int Client::ll_sync_inode(Inode *in, bool syncdataonly)
14948{
f67539c2
TL
14949 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
14950 if (!mref_reader.is_state_satisfied())
14951 return -CEPHFS_ENOTCONN;
14952
28e407b8
AA
14953 ldout(cct, 3) << "ll_sync_inode " << *in << " " << dendl;
14954 tout(cct) << "ll_sync_inode" << std::endl;
f67539c2 14955 tout(cct) << (uintptr_t)in << std::endl;
28e407b8 14956
f67539c2 14957 std::scoped_lock lock(client_lock);
28e407b8
AA
14958 return _fsync(in, syncdataonly);
14959}
14960
1e59de90
TL
/*
 * Compute (and, unless @defer, apply) the setattr mask needed to strip
 * the setuid/setgid bits from @in after a write performed by @perms.
 *
 * Returns 0 when nothing needs stripping: @in is not a regular file,
 * carries no suid/sgid bits, or the writer is root or the file owner.
 * Otherwise builds CEPH_SETATTR_KILL_SUID / _KILL_SGID bits; with
 * @defer=true the mask is returned for the caller to merge into its own
 * setattr, with @defer=false __setattrx() is invoked immediately and
 * its result returned.
 */
int Client::clear_suid_sgid(Inode *in, const UserPerm& perms, bool defer)
{
  ldout(cct, 20) << __func__ << " " << *in << "; " << perms << " defer "
		 << defer << dendl;

  if (!in->is_file()) {
    return 0;
  }

  // fast path: no special bits set at all
  if (likely(!(in->mode & (S_ISUID|S_ISGID)))) {
    return 0;
  }

  // root and the file's owner keep their bits
  if (perms.uid() == 0 || perms.uid() == in->uid) {
    return 0;
  }

  int mask = 0;

  // always drop the suid
  if (unlikely(in->mode & S_ISUID)) {
    mask = CEPH_SETATTR_KILL_SUID;
  }

  // remove the sgid if S_IXUGO is set or the inode is
  // is not in the caller's group list.
  if ((in->mode & S_ISGID) &&
      ((in->mode & S_IXUGO) || !perms.gid_in_groups(in->gid))) {
    mask |= CEPH_SETATTR_KILL_SGID;
  }

  ldout(cct, 20) << __func__ << " mask " << mask << dendl;
  if (defer) {
    return mask;
  }

  struct ceph_statx stx = { 0 };
  return __setattrx(in, &stx, mask, perms);
}
15000
7c673cae
FG
/*
 * Core fallocate implementation; client_lock must already be held.
 *
 * Supported @mode bits are FALLOC_FL_KEEP_SIZE and FALLOC_FL_PUNCH_HOLE
 * (and PUNCH_HOLE requires KEEP_SIZE, matching Linux semantics).
 * Punching zeroes the range — inline data is rewritten in place when we
 * hold buffer caps, otherwise the data is un-inlined and the range
 * zeroed on the OSDs.  A plain allocate only ever grows in->size.
 * Returns 0 or a negative CEPHFS error.
 */
int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
{
  ceph_assert(ceph_mutex_is_locked_by_me(client_lock));

  if (offset < 0 || length <= 0)
    return -CEPHFS_EINVAL;

  // only KEEP_SIZE and PUNCH_HOLE are understood
  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
    return -CEPHFS_EOPNOTSUPP;

  // PUNCH_HOLE must be combined with KEEP_SIZE (Linux convention)
  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
    return -CEPHFS_EOPNOTSUPP;

  Inode *in = fh->inode.get();

  // allocating on a full pool is refused; punching frees space, so it
  // is still allowed
  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
      !(mode & FALLOC_FL_PUNCH_HOLE)) {
    return -CEPHFS_ENOSPC;
  }

  if (in->snapid != CEPH_NOSNAP)
    return -CEPHFS_EROFS;

  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
    return -CEPHFS_EBADF;

  // growing the file must respect the byte quota
  uint64_t size = offset + length;
  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
      size > in->size &&
      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
    return -CEPHFS_EDQUOT;
  }

  int have;
  int r = get_caps(fh, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
  if (r < 0)
    return r;

  // a non-owner write drops suid/sgid; undo the WR cap ref on failure
  r = clear_suid_sgid(in, fh->actor_perms);
  if (r < 0) {
    put_cap_ref(in, CEPH_CAP_FILE_WR);
    return r;
  }

  std::unique_ptr<C_SaferCond> onuninline = nullptr;
  if (mode & FALLOC_FL_PUNCH_HOLE) {
    if (in->inline_version < CEPH_INLINE_NONE &&
        (have & CEPH_CAP_FILE_BUFFER)) {
      // inline data and we hold buffer caps: zero the hole locally by
      // rebuilding the inline blob (prefix + zeros + suffix)
      bufferlist bl;
      auto inline_iter = in->inline_data.cbegin();
      int len = in->inline_data.length();
      if (offset < len) {
        if (offset > 0)
          inline_iter.copy(offset, bl);
        int size = length;
        if (offset + size > len)
          size = len - offset;
        if (size > 0)
          bl.append_zero(size);
        if (offset + size < len) {
          inline_iter += size;
          inline_iter.copy(len - offset - size, bl);
        }
        in->inline_data = bl;
        in->inline_version++;
      }
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
    } else {
      // otherwise un-inline first (if needed) and zero the range on
      // the OSDs
      if (in->inline_version < CEPH_INLINE_NONE) {
        onuninline.reset(new C_SaferCond("Client::_fallocate_uninline_data flock"));
        uninline_data(in, onuninline.get());
      }

      C_SaferCond onfinish("Client::_punch_hole flock");

      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);

      _invalidate_inode_cache(in, offset, length);
      filer->zero(in->ino, &in->layout,
		  in->snaprealm->get_snap_context(),
		  offset, length,
		  ceph::real_clock::now(),
		  0, true, &onfinish);
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      // drop client_lock while waiting for the OSD zero to complete
      client_lock.unlock();
      onfinish.wait();
      client_lock.lock();
      put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
    }
  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
    // plain allocate: only the size (never the data) changes, and only
    // if the range extends the file
    uint64_t size = offset + length;
    if (size > in->size) {
      in->size = size;
      in->mtime = in->ctime = ceph_clock_now();
      in->change_attr++;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);

      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
        check_caps(in, CHECK_CAPS_NODELAY);
      } else if (is_max_size_approaching(in)) {
        check_caps(in, 0);
      }
    }
  }

  if (nullptr != onuninline) {
    // wait for the un-inline we kicked off above (lock dropped again)
    client_lock.unlock();
    int ret = onuninline->wait();
    client_lock.lock();

    // ECANCELED means someone else already un-inlined it — still done
    if (ret >= 0 || ret == -CEPHFS_ECANCELED) {
      in->inline_data.clear();
      in->inline_version = CEPH_INLINE_NONE;
      in->mark_caps_dirty(CEPH_CAP_FILE_WR);
      check_caps(in, 0);
    } else
      r = ret;
  }

  put_cap_ref(in, CEPH_CAP_FILE_WR);
  return r;
}
7c673cae 15128
11fdf7f2 15129int Client::ll_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
7c673cae 15130{
f67539c2
TL
15131 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15132 if (!mref_reader.is_state_satisfied())
15133 return -CEPHFS_ENOTCONN;
15134
11fdf7f2
TL
15135 ldout(cct, 3) << __func__ << " " << fh << " " << fh->inode->ino << " " << dendl;
15136 tout(cct) << __func__ << " " << mode << " " << offset << " " << length << std::endl;
f67539c2 15137 tout(cct) << (uintptr_t)fh << std::endl;
181888fb 15138
f67539c2 15139 std::scoped_lock lock(client_lock);
7c673cae
FG
15140 return _fallocate(fh, mode, offset, length);
15141}
15142
15143int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
15144{
f67539c2
TL
15145 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15146 if (!mref_reader.is_state_satisfied())
15147 return -CEPHFS_ENOTCONN;
7c673cae 15148
1e59de90 15149 tout(cct) << __func__ << " " << fd << mode << " " << offset << " " << length << std::endl;
181888fb 15150
f67539c2 15151 std::scoped_lock lock(client_lock);
7c673cae
FG
15152 Fh *fh = get_filehandle(fd);
15153 if (!fh)
f67539c2 15154 return -CEPHFS_EBADF;
7c673cae
FG
15155#if defined(__linux__) && defined(O_PATH)
15156 if (fh->flags & O_PATH)
f67539c2 15157 return -CEPHFS_EBADF;
7c673cae
FG
15158#endif
15159 return _fallocate(fh, mode, offset, length);
15160}
15161
15162int Client::ll_release(Fh *fh)
15163{
f67539c2
TL
15164 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15165 if (!mref_reader.is_state_satisfied())
15166 return -CEPHFS_ENOTCONN;
91327a77 15167
11fdf7f2 15168 ldout(cct, 3) << __func__ << " (fh)" << fh << " " << fh->inode->ino << " " <<
7c673cae 15169 dendl;
11fdf7f2 15170 tout(cct) << __func__ << " (fh)" << std::endl;
f67539c2
TL
15171 tout(cct) << (uintptr_t)fh << std::endl;
15172
15173 std::scoped_lock lock(client_lock);
7c673cae
FG
15174
15175 if (ll_unclosed_fh_set.count(fh))
15176 ll_unclosed_fh_set.erase(fh);
15177 return _release_fh(fh);
15178}
15179
15180int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
15181{
f67539c2
TL
15182 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15183 if (!mref_reader.is_state_satisfied())
15184 return -CEPHFS_ENOTCONN;
7c673cae
FG
15185
15186 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
f67539c2 15187 tout(cct) << "ll_getk (fh)" << (uintptr_t)fh << std::endl;
181888fb 15188
f67539c2 15189 std::scoped_lock lock(client_lock);
7c673cae
FG
15190 return _getlk(fh, fl, owner);
15191}
15192
15193int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
15194{
f67539c2
TL
15195 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15196 if (!mref_reader.is_state_satisfied())
15197 return -CEPHFS_ENOTCONN;
7c673cae 15198
11fdf7f2 15199 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
f67539c2 15200 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
181888fb 15201
f67539c2 15202 std::scoped_lock lock(client_lock);
7c673cae
FG
15203 return _setlk(fh, fl, owner, sleep);
15204}
15205
15206int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
15207{
f67539c2
TL
15208 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15209 if (!mref_reader.is_state_satisfied())
15210 return -CEPHFS_ENOTCONN;
7c673cae 15211
11fdf7f2 15212 ldout(cct, 3) << __func__ << " (fh) " << fh << " " << fh->inode->ino << dendl;
f67539c2 15213 tout(cct) << __func__ << " (fh)" << (uintptr_t)fh << std::endl;
181888fb 15214
f67539c2 15215 std::scoped_lock lock(client_lock);
7c673cae
FG
15216 return _flock(fh, cmd, owner);
15217}
15218
b32b8144
FG
15219int Client::set_deleg_timeout(uint32_t timeout)
15220{
f67539c2 15221 std::scoped_lock lock(client_lock);
b32b8144
FG
15222
15223 /*
f67539c2 15224 * The whole point is to prevent blocklisting so we must time out the
b32b8144
FG
15225 * delegation before the session autoclose timeout kicks in.
15226 */
15227 if (timeout >= mdsmap->get_session_autoclose())
f67539c2 15228 return -CEPHFS_EINVAL;
b32b8144
FG
15229
15230 deleg_timeout = timeout;
15231 return 0;
15232}
15233
15234int Client::ll_delegation(Fh *fh, unsigned cmd, ceph_deleg_cb_t cb, void *priv)
15235{
f67539c2 15236 int ret = -CEPHFS_EINVAL;
b32b8144 15237
f67539c2
TL
15238 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15239 if (!mref_reader.is_state_satisfied())
15240 return -CEPHFS_ENOTCONN;
b32b8144 15241
f67539c2 15242 std::scoped_lock lock(client_lock);
b32b8144
FG
15243
15244 Inode *inode = fh->inode.get();
15245
15246 switch(cmd) {
15247 case CEPH_DELEGATION_NONE:
15248 inode->unset_deleg(fh);
15249 ret = 0;
15250 break;
15251 default:
15252 try {
15253 ret = inode->set_deleg(fh, cmd, cb, priv);
11fdf7f2 15254 } catch (std::bad_alloc&) {
f67539c2 15255 ret = -CEPHFS_ENOMEM;
b32b8144
FG
15256 }
15257 break;
15258 }
15259 return ret;
15260}
15261
7c673cae
FG
15262class C_Client_RequestInterrupt : public Context {
15263private:
15264 Client *client;
15265 MetaRequest *req;
15266public:
15267 C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
15268 req->get();
15269 }
15270 void finish(int r) override {
f67539c2 15271 std::scoped_lock l(client->client_lock);
11fdf7f2 15272 ceph_assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
7c673cae
FG
15273 client->_interrupt_filelock(req);
15274 client->put_request(req);
15275 }
15276};
15277
15278void Client::ll_interrupt(void *d)
15279{
15280 MetaRequest *req = static_cast<MetaRequest*>(d);
11fdf7f2
TL
15281 ldout(cct, 3) << __func__ << " tid " << req->get_tid() << dendl;
15282 tout(cct) << __func__ << " tid " << req->get_tid() << std::endl;
7c673cae
FG
15283 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
15284}
15285
15286// =========================================
15287// layout
15288
15289// expose file layouts
15290
15291int Client::describe_layout(const char *relpath, file_layout_t *lp,
15292 const UserPerm& perms)
15293{
f67539c2
TL
15294 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15295 if (!mref_reader.is_state_satisfied())
15296 return -CEPHFS_ENOTCONN;
7c673cae 15297
f67539c2 15298 std::scoped_lock lock(client_lock);
181888fb 15299
7c673cae
FG
15300 filepath path(relpath);
15301 InodeRef in;
15302 int r = path_walk(path, &in, perms);
15303 if (r < 0)
15304 return r;
15305
15306 *lp = in->layout;
15307
11fdf7f2 15308 ldout(cct, 3) << __func__ << "(" << relpath << ") = 0" << dendl;
7c673cae
FG
15309 return 0;
15310}
15311
15312int Client::fdescribe_layout(int fd, file_layout_t *lp)
15313{
f67539c2
TL
15314 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15315 if (!mref_reader.is_state_satisfied())
15316 return -CEPHFS_ENOTCONN;
7c673cae 15317
f67539c2 15318 std::scoped_lock lock(client_lock);
181888fb 15319
7c673cae
FG
15320 Fh *f = get_filehandle(fd);
15321 if (!f)
f67539c2 15322 return -CEPHFS_EBADF;
7c673cae
FG
15323 Inode *in = f->inode.get();
15324
15325 *lp = in->layout;
15326
11fdf7f2 15327 ldout(cct, 3) << __func__ << "(" << fd << ") = 0" << dendl;
7c673cae
FG
15328 return 0;
15329}
15330
d2e6a577
FG
15331int64_t Client::get_default_pool_id()
15332{
f67539c2
TL
15333 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15334 if (!mref_reader.is_state_satisfied())
15335 return -CEPHFS_ENOTCONN;
181888fb 15336
f67539c2 15337 std::scoped_lock lock(client_lock);
181888fb 15338
d2e6a577
FG
15339 /* first data pool is the default */
15340 return mdsmap->get_first_data_pool();
15341}
7c673cae
FG
15342
15343// expose osdmap
15344
15345int64_t Client::get_pool_id(const char *pool_name)
15346{
f67539c2
TL
15347 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15348 if (!mref_reader.is_state_satisfied())
15349 return -CEPHFS_ENOTCONN;
181888fb 15350
f67539c2 15351 std::scoped_lock lock(client_lock);
181888fb 15352
7c673cae
FG
15353 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
15354 pool_name);
15355}
15356
15357string Client::get_pool_name(int64_t pool)
15358{
f67539c2
TL
15359 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15360 if (!mref_reader.is_state_satisfied())
181888fb
FG
15361 return string();
15362
f67539c2
TL
15363 std::scoped_lock lock(client_lock);
15364
7c673cae
FG
15365 return objecter->with_osdmap([pool](const OSDMap& o) {
15366 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
15367 });
15368}
15369
15370int Client::get_pool_replication(int64_t pool)
15371{
f67539c2
TL
15372 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15373 if (!mref_reader.is_state_satisfied())
15374 return -CEPHFS_ENOTCONN;
181888fb 15375
f67539c2 15376 std::scoped_lock lock(client_lock);
181888fb 15377
7c673cae 15378 return objecter->with_osdmap([pool](const OSDMap& o) {
f67539c2 15379 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -CEPHFS_ENOENT;
7c673cae
FG
15380 });
15381}
15382
15383int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
15384{
f67539c2
TL
15385 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15386 if (!mref_reader.is_state_satisfied())
15387 return -CEPHFS_ENOTCONN;
7c673cae 15388
f67539c2 15389 std::scoped_lock lock(client_lock);
181888fb 15390
7c673cae
FG
15391 Fh *f = get_filehandle(fd);
15392 if (!f)
f67539c2 15393 return -CEPHFS_EBADF;
7c673cae
FG
15394 Inode *in = f->inode.get();
15395
15396 vector<ObjectExtent> extents;
15397 Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
11fdf7f2 15398 ceph_assert(extents.size() == 1);
7c673cae
FG
15399
15400 objecter->with_osdmap([&](const OSDMap& o) {
15401 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
15402 o.pg_to_acting_osds(pg, osds);
15403 });
15404
15405 if (osds.empty())
f67539c2 15406 return -CEPHFS_EINVAL;
7c673cae
FG
15407
15408 /*
15409 * Return the remainder of the extent (stripe unit)
15410 *
15411 * If length = 1 is passed to Striper::file_to_extents we get a single
15412 * extent back, but its length is one so we still need to compute the length
15413 * to the end of the stripe unit.
15414 *
15415 * If length = su then we may get 1 or 2 objects back in the extents vector
15416 * which would have to be examined. Even then, the offsets are local to the
15417 * object, so matching up to the file offset is extra work.
15418 *
15419 * It seems simpler to stick with length = 1 and manually compute the
15420 * remainder.
15421 */
15422 if (len) {
15423 uint64_t su = in->layout.stripe_unit;
15424 *len = su - (off % su);
15425 }
15426
15427 return 0;
15428}
15429
15430int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
15431{
f67539c2
TL
15432 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15433 if (!mref_reader.is_state_satisfied())
15434 return -CEPHFS_ENOTCONN;
181888fb 15435
f67539c2 15436 std::scoped_lock lock(client_lock);
181888fb 15437
7c673cae 15438 if (id < 0)
f67539c2 15439 return -CEPHFS_EINVAL;
7c673cae
FG
15440 return objecter->with_osdmap([&](const OSDMap& o) {
15441 return o.crush->get_full_location_ordered(id, path);
15442 });
15443}
15444
15445int Client::get_file_stripe_address(int fd, loff_t offset,
15446 vector<entity_addr_t>& address)
15447{
f67539c2
TL
15448 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15449 if (!mref_reader.is_state_satisfied())
15450 return -CEPHFS_ENOTCONN;
7c673cae 15451
f67539c2 15452 std::scoped_lock lock(client_lock);
181888fb 15453
7c673cae
FG
15454 Fh *f = get_filehandle(fd);
15455 if (!f)
f67539c2 15456 return -CEPHFS_EBADF;
7c673cae
FG
15457 Inode *in = f->inode.get();
15458
15459 // which object?
15460 vector<ObjectExtent> extents;
15461 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
15462 in->truncate_size, extents);
11fdf7f2 15463 ceph_assert(extents.size() == 1);
7c673cae
FG
15464
15465 // now we have the object and its 'layout'
15466 return objecter->with_osdmap([&](const OSDMap& o) {
15467 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
15468 vector<int> osds;
15469 o.pg_to_acting_osds(pg, osds);
15470 if (osds.empty())
f67539c2 15471 return -CEPHFS_EINVAL;
7c673cae 15472 for (unsigned i = 0; i < osds.size(); i++) {
11fdf7f2 15473 entity_addr_t addr = o.get_addrs(osds[i]).front();
7c673cae
FG
15474 address.push_back(addr);
15475 }
15476 return 0;
15477 });
15478}
15479
15480int Client::get_osd_addr(int osd, entity_addr_t& addr)
15481{
f67539c2
TL
15482 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15483 if (!mref_reader.is_state_satisfied())
15484 return -CEPHFS_ENOTCONN;
181888fb 15485
f67539c2 15486 std::scoped_lock lock(client_lock);
181888fb 15487
7c673cae
FG
15488 return objecter->with_osdmap([&](const OSDMap& o) {
15489 if (!o.exists(osd))
f67539c2 15490 return -CEPHFS_ENOENT;
7c673cae 15491
11fdf7f2 15492 addr = o.get_addrs(osd).front();
7c673cae
FG
15493 return 0;
15494 });
15495}
15496
15497int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
15498 loff_t length, loff_t offset)
15499{
f67539c2
TL
15500 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15501 if (!mref_reader.is_state_satisfied())
15502 return -CEPHFS_ENOTCONN;
7c673cae 15503
f67539c2 15504 std::scoped_lock lock(client_lock);
181888fb 15505
7c673cae
FG
15506 Fh *f = get_filehandle(fd);
15507 if (!f)
f67539c2 15508 return -CEPHFS_EBADF;
7c673cae
FG
15509 Inode *in = f->inode.get();
15510
15511 // map to a list of extents
15512 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
15513
11fdf7f2 15514 ldout(cct, 3) << __func__ << "(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
7c673cae
FG
15515 return 0;
15516}
15517
15518
f67539c2 15519/* find an osd with the same ip. -CEPHFS_ENXIO if none. */
7c673cae
FG
15520int Client::get_local_osd()
15521{
f67539c2
TL
15522 RWRef_t mref_reader(mount_state, CLIENT_MOUNTING);
15523 if (!mref_reader.is_state_satisfied())
15524 return -CEPHFS_ENOTCONN;
181888fb 15525
f67539c2 15526 std::scoped_lock lock(client_lock);
181888fb 15527
7c673cae
FG
15528 objecter->with_osdmap([this](const OSDMap& o) {
15529 if (o.get_epoch() != local_osd_epoch) {
11fdf7f2 15530 local_osd = o.find_osd_on_ip(messenger->get_myaddrs().front());
7c673cae
FG
15531 local_osd_epoch = o.get_epoch();
15532 }
15533 });
15534 return local_osd;
15535}
15536
15537
15538
15539
15540
15541
15542// ===============================
15543
15544void Client::ms_handle_connect(Connection *con)
15545{
11fdf7f2 15546 ldout(cct, 10) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
15547}
15548
15549bool Client::ms_handle_reset(Connection *con)
15550{
11fdf7f2 15551 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
15552 return false;
15553}
15554
15555void Client::ms_handle_remote_reset(Connection *con)
15556{
f67539c2 15557 std::scoped_lock lock(client_lock);
11fdf7f2 15558 ldout(cct, 0) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
15559 switch (con->get_peer_type()) {
15560 case CEPH_ENTITY_TYPE_MDS:
15561 {
15562 // kludge to figure out which mds this is; fixme with a Connection* state
15563 mds_rank_t mds = MDS_RANK_NONE;
20effc67 15564 MetaSessionRef s = NULL;
11fdf7f2 15565 for (auto &p : mds_sessions) {
b3b6e05e 15566 if (mdsmap->have_inst(p.first) && mdsmap->get_addrs(p.first) == con->get_peer_addrs()) {
11fdf7f2 15567 mds = p.first;
20effc67 15568 s = p.second;
7c673cae
FG
15569 }
15570 }
15571 if (mds >= 0) {
20effc67 15572 ceph_assert(s != NULL);
7c673cae
FG
15573 switch (s->state) {
15574 case MetaSession::STATE_CLOSING:
15575 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
20effc67 15576 _closed_mds_session(s.get());
7c673cae
FG
15577 break;
15578
15579 case MetaSession::STATE_OPENING:
15580 {
15581 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
15582 list<Context*> waiters;
15583 waiters.swap(s->waiting_for_open);
20effc67
TL
15584 _closed_mds_session(s.get());
15585 auto news = _get_or_open_mds_session(mds);
7c673cae
FG
15586 news->waiting_for_open.swap(waiters);
15587 }
15588 break;
15589
15590 case MetaSession::STATE_OPEN:
15591 {
f67539c2 15592 objecter->maybe_request_map(); /* to check if we are blocklisted */
f6b5b4d7 15593 if (cct->_conf.get_val<bool>("client_reconnect_stale")) {
7c673cae 15594 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
20effc67 15595 _closed_mds_session(s.get());
7c673cae
FG
15596 } else {
15597 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
15598 s->state = MetaSession::STATE_STALE;
15599 }
15600 }
15601 break;
15602
15603 case MetaSession::STATE_NEW:
15604 case MetaSession::STATE_CLOSED:
15605 default:
15606 break;
15607 }
15608 }
15609 }
15610 break;
15611 }
15612}
15613
15614bool Client::ms_handle_refused(Connection *con)
15615{
11fdf7f2 15616 ldout(cct, 1) << __func__ << " on " << con->get_peer_addr() << dendl;
7c673cae
FG
15617 return false;
15618}
15619
1e59de90 15620Inode *Client::get_quota_root(Inode *in, const UserPerm& perms, quota_max_t type)
7c673cae 15621{
11fdf7f2
TL
15622 Inode *quota_in = root_ancestor;
15623 SnapRealm *realm = in->snaprealm;
2a845540
TL
15624
15625 if (!cct->_conf.get_val<bool>("client_quota"))
15626 return NULL;
15627
11fdf7f2
TL
15628 while (realm) {
15629 ldout(cct, 10) << __func__ << " realm " << realm->ino << dendl;
15630 if (realm->ino != in->ino) {
15631 auto p = inode_map.find(vinodeno_t(realm->ino, CEPH_NOSNAP));
15632 if (p == inode_map.end())
15633 break;
7c673cae 15634
1e59de90 15635 if (p->second->quota.is_enabled(type)) {
11fdf7f2
TL
15636 quota_in = p->second;
15637 break;
7c673cae 15638 }
7c673cae 15639 }
11fdf7f2 15640 realm = realm->pparent;
7c673cae 15641 }
11fdf7f2
TL
15642 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << quota_in->vino() << dendl;
15643 return quota_in;
7c673cae
FG
15644}
15645
15646/**
15647 * Traverse quota ancestors of the Inode, return true
15648 * if any of them passes the passed function
15649 */
15650bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
15651 std::function<bool (const Inode &in)> test)
15652{
2a845540
TL
15653 if (!cct->_conf.get_val<bool>("client_quota"))
15654 return false;
15655
7c673cae 15656 while (true) {
11fdf7f2 15657 ceph_assert(in != NULL);
7c673cae
FG
15658 if (test(*in)) {
15659 return true;
15660 }
15661
15662 if (in == root_ancestor) {
15663 // We're done traversing, drop out
15664 return false;
15665 } else {
15666 // Continue up the tree
15667 in = get_quota_root(in, perms);
15668 }
15669 }
15670
15671 return false;
15672}
15673
15674bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
15675{
15676 return check_quota_condition(in, perms,
15677 [](const Inode &in) {
15678 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
15679 });
15680}
15681
15682bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
11fdf7f2 15683 const UserPerm& perms)
7c673cae
FG
15684{
15685 return check_quota_condition(in, perms,
11fdf7f2 15686 [&new_bytes](const Inode &in) {
7c673cae
FG
15687 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
15688 > in.quota.max_bytes;
15689 });
15690}
15691
11fdf7f2 15692bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
7c673cae 15693{
9f95a23c
TL
15694 ceph_assert(in->size >= in->reported_size);
15695 const uint64_t size = in->size - in->reported_size;
11fdf7f2 15696 return check_quota_condition(in, perms,
9f95a23c 15697 [&size](const Inode &in) {
11fdf7f2
TL
15698 if (in.quota.max_bytes) {
15699 if (in.rstat.rbytes >= in.quota.max_bytes) {
15700 return true;
15701 }
15702
11fdf7f2 15703 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
11fdf7f2
TL
15704 return (space >> 4) < size;
15705 } else {
15706 return false;
15707 }
15708 });
7c673cae
FG
15709}
15710
15711enum {
15712 POOL_CHECKED = 1,
15713 POOL_CHECKING = 2,
15714 POOL_READ = 4,
15715 POOL_WRITE = 8,
15716};
15717
15718int Client::check_pool_perm(Inode *in, int need)
15719{
f67539c2
TL
15720 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
15721
7c673cae
FG
15722 if (!cct->_conf->client_check_pool_perm)
15723 return 0;
15724
f67539c2
TL
15725 /* Only need to do this for regular files */
15726 if (!in->is_file())
15727 return 0;
15728
7c673cae
FG
15729 int64_t pool_id = in->layout.pool_id;
15730 std::string pool_ns = in->layout.pool_ns;
15731 std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
15732 int have = 0;
15733 while (true) {
15734 auto it = pool_perms.find(perm_key);
15735 if (it == pool_perms.end())
15736 break;
15737 if (it->second == POOL_CHECKING) {
15738 // avoid concurrent checkings
15739 wait_on_list(waiting_for_pool_perm);
15740 } else {
15741 have = it->second;
11fdf7f2 15742 ceph_assert(have & POOL_CHECKED);
7c673cae
FG
15743 break;
15744 }
15745 }
15746
15747 if (!have) {
15748 if (in->snapid != CEPH_NOSNAP) {
15749 // pool permission check needs to write to the first object. But for snapshot,
20effc67 15750 // head of the first object may have already been deleted. To avoid creating
7c673cae
FG
15751 // orphan object, skip the check for now.
15752 return 0;
15753 }
15754
15755 pool_perms[perm_key] = POOL_CHECKING;
15756
15757 char oid_buf[32];
15758 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
15759 object_t oid = oid_buf;
15760
15761 SnapContext nullsnapc;
15762
15763 C_SaferCond rd_cond;
15764 ObjectOperation rd_op;
f67539c2 15765 rd_op.stat(nullptr, nullptr, nullptr);
7c673cae
FG
15766
15767 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
15768 nullsnapc, ceph::real_clock::now(), 0, &rd_cond);
15769
15770 C_SaferCond wr_cond;
15771 ObjectOperation wr_op;
15772 wr_op.create(true);
15773
15774 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
15775 nullsnapc, ceph::real_clock::now(), 0, &wr_cond);
15776
9f95a23c 15777 client_lock.unlock();
7c673cae
FG
15778 int rd_ret = rd_cond.wait();
15779 int wr_ret = wr_cond.wait();
9f95a23c 15780 client_lock.lock();
7c673cae
FG
15781
15782 bool errored = false;
15783
f67539c2 15784 if (rd_ret == 0 || rd_ret == -CEPHFS_ENOENT)
7c673cae 15785 have |= POOL_READ;
f67539c2 15786 else if (rd_ret != -CEPHFS_EPERM) {
11fdf7f2 15787 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
7c673cae
FG
15788 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
15789 errored = true;
15790 }
15791
f67539c2 15792 if (wr_ret == 0 || wr_ret == -CEPHFS_EEXIST)
7c673cae 15793 have |= POOL_WRITE;
f67539c2 15794 else if (wr_ret != -CEPHFS_EPERM) {
11fdf7f2 15795 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
7c673cae
FG
15796 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
15797 errored = true;
15798 }
15799
15800 if (errored) {
15801 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
15802 // Raise EIO because actual error code might be misleading for
15803 // userspace filesystem user.
15804 pool_perms.erase(perm_key);
15805 signal_cond_list(waiting_for_pool_perm);
f67539c2 15806 return -CEPHFS_EIO;
7c673cae
FG
15807 }
15808
15809 pool_perms[perm_key] = have | POOL_CHECKED;
15810 signal_cond_list(waiting_for_pool_perm);
15811 }
15812
15813 if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
11fdf7f2 15814 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
7c673cae 15815 << " need " << ccap_string(need) << ", but no read perm" << dendl;
f67539c2 15816 return -CEPHFS_EPERM;
7c673cae
FG
15817 }
15818 if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
11fdf7f2 15819 ldout(cct, 10) << __func__ << " on pool " << pool_id << " ns " << pool_ns
7c673cae 15820 << " need " << ccap_string(need) << ", but no write perm" << dendl;
f67539c2 15821 return -CEPHFS_EPERM;
7c673cae
FG
15822 }
15823
15824 return 0;
15825}
15826
15827int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
15828{
15829 if (acl_type == POSIX_ACL) {
15830 if (in->xattrs.count(ACL_EA_ACCESS)) {
15831 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
15832
15833 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
15834 }
15835 }
f67539c2 15836 return -CEPHFS_EAGAIN;
7c673cae
FG
15837}
15838
15839int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
15840{
15841 if (acl_type == NO_ACL)
15842 return 0;
15843
15844 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
15845 if (r < 0)
15846 goto out;
15847
15848 if (acl_type == POSIX_ACL) {
15849 if (in->xattrs.count(ACL_EA_ACCESS)) {
15850 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
15851 bufferptr acl(access_acl.c_str(), access_acl.length());
15852 r = posix_acl_access_chmod(acl, mode);
15853 if (r < 0)
15854 goto out;
15855 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
15856 } else {
15857 r = 0;
15858 }
15859 }
15860out:
15861 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
15862 return r;
15863}
15864
15865int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
15866 const UserPerm& perms)
15867{
15868 if (acl_type == NO_ACL)
15869 return 0;
15870
15871 if (S_ISLNK(*mode))
15872 return 0;
15873
15874 int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
15875 if (r < 0)
15876 goto out;
15877
15878 if (acl_type == POSIX_ACL) {
15879 if (dir->xattrs.count(ACL_EA_DEFAULT)) {
15880 map<string, bufferptr> xattrs;
15881
15882 const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
15883 bufferptr acl(default_acl.c_str(), default_acl.length());
15884 r = posix_acl_inherit_mode(acl, mode);
15885 if (r < 0)
15886 goto out;
15887
15888 if (r > 0) {
15889 r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
15890 if (r < 0)
15891 goto out;
15892 if (r > 0)
15893 xattrs[ACL_EA_ACCESS] = acl;
15894 }
15895
15896 if (S_ISDIR(*mode))
15897 xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];
15898
15899 r = xattrs.size();
15900 if (r > 0)
11fdf7f2 15901 encode(xattrs, xattrs_bl);
7c673cae
FG
15902 } else {
15903 if (umask_cb)
15904 *mode &= ~umask_cb(callback_handle);
15905 r = 0;
15906 }
15907 }
15908out:
15909 ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
15910 return r;
15911}
15912
15913void Client::set_filer_flags(int flags)
15914{
f67539c2 15915 std::scoped_lock l(client_lock);
11fdf7f2 15916 ceph_assert(flags == 0 ||
7c673cae
FG
15917 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
15918 objecter->add_global_op_flags(flags);
15919}
15920
15921void Client::clear_filer_flags(int flags)
15922{
f67539c2 15923 std::scoped_lock l(client_lock);
11fdf7f2 15924 ceph_assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
7c673cae
FG
15925 objecter->clear_global_op_flag(flags);
15926}
15927
11fdf7f2
TL
15928// called before mount
15929void Client::set_uuid(const std::string& uuid)
15930{
f67539c2
TL
15931 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15932 ceph_assert(iref_reader.is_state_satisfied());
15933
15934 std::scoped_lock l(client_lock);
20effc67 15935 ceph_assert(!uuid.empty());
11fdf7f2
TL
15936
15937 metadata["uuid"] = uuid;
15938 _close_sessions();
15939}
15940
15941// called before mount. 0 means infinite
15942void Client::set_session_timeout(unsigned timeout)
15943{
f67539c2
TL
15944 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15945 ceph_assert(iref_reader.is_state_satisfied());
15946
15947 std::scoped_lock l(client_lock);
11fdf7f2
TL
15948
15949 metadata["timeout"] = stringify(timeout);
15950}
15951
15952// called before mount
15953int Client::start_reclaim(const std::string& uuid, unsigned flags,
15954 const std::string& fs_name)
15955{
f67539c2
TL
15956 RWRef_t iref_reader(initialize_state, CLIENT_INITIALIZED);
15957 if (!iref_reader.is_state_satisfied())
15958 return -CEPHFS_ENOTCONN;
11fdf7f2
TL
15959
15960 if (uuid.empty())
f67539c2 15961 return -CEPHFS_EINVAL;
11fdf7f2 15962
f67539c2 15963 std::unique_lock l(client_lock);
11fdf7f2
TL
15964 {
15965 auto it = metadata.find("uuid");
15966 if (it != metadata.end() && it->second == uuid)
f67539c2 15967 return -CEPHFS_EINVAL;
11fdf7f2
TL
15968 }
15969
15970 int r = subscribe_mdsmap(fs_name);
15971 if (r < 0) {
15972 lderr(cct) << "mdsmap subscription failed: " << cpp_strerror(r) << dendl;
15973 return r;
15974 }
15975
15976 if (metadata.empty())
15977 populate_metadata("");
15978
15979 while (mdsmap->get_epoch() == 0)
15980 wait_on_list(waiting_for_mdsmap);
15981
15982 reclaim_errno = 0;
15983 for (unsigned mds = 0; mds < mdsmap->get_num_in_mds(); ) {
15984 if (!mdsmap->is_up(mds)) {
15985 ldout(cct, 10) << "mds." << mds << " not active, waiting for new mdsmap" << dendl;
15986 wait_on_list(waiting_for_mdsmap);
15987 continue;
15988 }
15989
20effc67 15990 MetaSessionRef session;
11fdf7f2
TL
15991 if (!have_open_session(mds)) {
15992 session = _get_or_open_mds_session(mds);
f6b5b4d7 15993 if (session->state == MetaSession::STATE_REJECTED)
f67539c2 15994 return -CEPHFS_EPERM;
11fdf7f2
TL
15995 if (session->state != MetaSession::STATE_OPENING) {
15996 // umounting?
f67539c2 15997 return -CEPHFS_EINVAL;
11fdf7f2
TL
15998 }
15999 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
16000 wait_on_context_list(session->waiting_for_open);
11fdf7f2
TL
16001 continue;
16002 }
16003
20effc67 16004 session = mds_sessions.at(mds);
11fdf7f2 16005 if (!session->mds_features.test(CEPHFS_FEATURE_RECLAIM_CLIENT))
f67539c2 16006 return -CEPHFS_EOPNOTSUPP;
11fdf7f2
TL
16007
16008 if (session->reclaim_state == MetaSession::RECLAIM_NULL ||
16009 session->reclaim_state == MetaSession::RECLAIMING) {
16010 session->reclaim_state = MetaSession::RECLAIMING;
9f95a23c 16011 auto m = make_message<MClientReclaim>(uuid, flags);
11fdf7f2
TL
16012 session->con->send_message2(std::move(m));
16013 wait_on_list(waiting_for_reclaim);
16014 } else if (session->reclaim_state == MetaSession::RECLAIM_FAIL) {
f67539c2 16015 return reclaim_errno ? : -CEPHFS_ENOTRECOVERABLE;
11fdf7f2
TL
16016 } else {
16017 mds++;
16018 }
16019 }
16020
16021 // didn't find target session in any mds
16022 if (reclaim_target_addrs.empty()) {
16023 if (flags & CEPH_RECLAIM_RESET)
f67539c2
TL
16024 return -CEPHFS_ENOENT;
16025 return -CEPHFS_ENOTRECOVERABLE;
11fdf7f2
TL
16026 }
16027
16028 if (flags & CEPH_RECLAIM_RESET)
16029 return 0;
16030
f67539c2
TL
16031 // use blocklist to check if target session was killed
16032 // (config option mds_session_blocklist_on_evict needs to be true)
16033 ldout(cct, 10) << __func__ << ": waiting for OSD epoch " << reclaim_osd_epoch << dendl;
16034 bs::error_code ec;
16035 l.unlock();
16036 objecter->wait_for_map(reclaim_osd_epoch, ca::use_blocked[ec]);
16037 l.lock();
11fdf7f2 16038
f67539c2
TL
16039 if (ec)
16040 return ceph::from_error_code(ec);
16041
16042 bool blocklisted = objecter->with_osdmap(
11fdf7f2 16043 [this](const OSDMap &osd_map) -> bool {
f67539c2 16044 return osd_map.is_blocklisted(reclaim_target_addrs);
11fdf7f2 16045 });
f67539c2
TL
16046 if (blocklisted)
16047 return -CEPHFS_ENOTRECOVERABLE;
11fdf7f2
TL
16048
16049 metadata["reclaiming_uuid"] = uuid;
16050 return 0;
16051}
16052
16053void Client::finish_reclaim()
16054{
16055 auto it = metadata.find("reclaiming_uuid");
16056 if (it == metadata.end()) {
16057 for (auto &p : mds_sessions)
20effc67 16058 p.second->reclaim_state = MetaSession::RECLAIM_NULL;
11fdf7f2
TL
16059 return;
16060 }
16061
16062 for (auto &p : mds_sessions) {
20effc67 16063 p.second->reclaim_state = MetaSession::RECLAIM_NULL;
9f95a23c 16064 auto m = make_message<MClientReclaim>("", MClientReclaim::FLAG_FINISH);
20effc67 16065 p.second->con->send_message2(std::move(m));
11fdf7f2
TL
16066 }
16067
16068 metadata["uuid"] = it->second;
16069 metadata.erase(it);
16070}
16071
16072void Client::handle_client_reclaim_reply(const MConstRef<MClientReclaimReply>& reply)
16073{
16074 mds_rank_t from = mds_rank_t(reply->get_source().num());
16075 ldout(cct, 10) << __func__ << " " << *reply << " from mds." << from << dendl;
16076
f67539c2 16077 std::scoped_lock cl(client_lock);
20effc67 16078 auto session = _get_mds_session(from, reply->get_connection().get());
11fdf7f2
TL
16079 if (!session) {
16080 ldout(cct, 10) << " discarding reclaim reply from sessionless mds." << from << dendl;
16081 return;
16082 }
16083
16084 if (reply->get_result() >= 0) {
16085 session->reclaim_state = MetaSession::RECLAIM_OK;
16086 if (reply->get_epoch() > reclaim_osd_epoch)
16087 reclaim_osd_epoch = reply->get_epoch();
16088 if (!reply->get_addrs().empty())
16089 reclaim_target_addrs = reply->get_addrs();
16090 } else {
16091 session->reclaim_state = MetaSession::RECLAIM_FAIL;
16092 reclaim_errno = reply->get_result();
16093 }
16094
16095 signal_cond_list(waiting_for_reclaim);
16096}
16097
7c673cae
FG
16098/**
16099 * This is included in cap release messages, to cause
16100 * the MDS to wait until this OSD map epoch. It is necessary
16101 * in corner cases where we cancel RADOS ops, so that
16102 * nobody else tries to do IO to the same objects in
16103 * the same epoch as the cancelled ops.
16104 */
16105void Client::set_cap_epoch_barrier(epoch_t e)
16106{
16107 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
16108 cap_epoch_barrier = e;
16109}
16110
16111const char** Client::get_tracked_conf_keys() const
16112{
16113 static const char* keys[] = {
16114 "client_cache_size",
16115 "client_cache_mid",
16116 "client_acl_type",
b32b8144
FG
16117 "client_deleg_timeout",
16118 "client_deleg_break_on_open",
f67539c2
TL
16119 "client_oc_size",
16120 "client_oc_max_objects",
16121 "client_oc_max_dirty",
16122 "client_oc_target_dirty",
16123 "client_oc_max_dirty_age",
2a845540
TL
16124 "client_caps_release_delay",
16125 "client_mount_timeout",
7c673cae
FG
16126 NULL
16127 };
16128 return keys;
16129}
16130
11fdf7f2 16131void Client::handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
16132 const std::set <std::string> &changed)
16133{
f67539c2 16134 std::scoped_lock lock(client_lock);
7c673cae 16135
181888fb 16136 if (changed.count("client_cache_mid")) {
7c673cae
FG
16137 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
16138 }
16139 if (changed.count("client_acl_type")) {
16140 acl_type = NO_ACL;
16141 if (cct->_conf->client_acl_type == "posix_acl")
16142 acl_type = POSIX_ACL;
16143 }
f67539c2
TL
16144 if (changed.count("client_oc_size")) {
16145 objectcacher->set_max_size(cct->_conf->client_oc_size);
16146 }
16147 if (changed.count("client_oc_max_objects")) {
16148 objectcacher->set_max_objects(cct->_conf->client_oc_max_objects);
16149 }
16150 if (changed.count("client_oc_max_dirty")) {
16151 objectcacher->set_max_dirty(cct->_conf->client_oc_max_dirty);
16152 }
16153 if (changed.count("client_oc_target_dirty")) {
16154 objectcacher->set_target_dirty(cct->_conf->client_oc_target_dirty);
16155 }
16156 if (changed.count("client_oc_max_dirty_age")) {
16157 objectcacher->set_max_dirty_age(cct->_conf->client_oc_max_dirty_age);
16158 }
33c7a0ef
TL
16159 if (changed.count("client_collect_and_send_global_metrics")) {
16160 _collect_and_send_global_metrics = cct->_conf.get_val<bool>(
16161 "client_collect_and_send_global_metrics");
16162 }
2a845540
TL
16163 if (changed.count("client_caps_release_delay")) {
16164 caps_release_delay = cct->_conf.get_val<std::chrono::seconds>(
16165 "client_caps_release_delay");
16166 }
16167 if (changed.count("client_mount_timeout")) {
16168 mount_timeout = cct->_conf.get_val<std::chrono::seconds>(
16169 "client_mount_timeout");
16170 }
7c673cae
FG
16171}
16172
7c673cae
FG
16173void intrusive_ptr_add_ref(Inode *in)
16174{
b3b6e05e 16175 in->iget();
7c673cae 16176}
f67539c2 16177
7c673cae
FG
16178void intrusive_ptr_release(Inode *in)
16179{
16180 in->client->put_inode(in);
16181}
16182
16183mds_rank_t Client::_get_random_up_mds() const
16184{
9f95a23c 16185 ceph_assert(ceph_mutex_is_locked_by_me(client_lock));
7c673cae
FG
16186
16187 std::set<mds_rank_t> up;
16188 mdsmap->get_up_mds_set(up);
16189
16190 if (up.empty())
16191 return MDS_RANK_NONE;
16192 std::set<mds_rank_t>::const_iterator p = up.begin();
16193 for (int n = rand() % up.size(); n; n--)
16194 ++p;
16195 return *p;
16196}
16197
16198
f67539c2
TL
16199StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc,
16200 boost::asio::io_context& ictx)
16201 : Client(m, mc, new Objecter(m->cct, m, mc, ictx))
7c673cae
FG
16202{
16203 monclient->set_messenger(m);
16204 objecter->set_client_incarnation(0);
16205}
16206
16207StandaloneClient::~StandaloneClient()
16208{
16209 delete objecter;
16210 objecter = nullptr;
16211}
16212
16213int StandaloneClient::init()
16214{
f67539c2
TL
16215 RWRef_t iref_writer(initialize_state, CLIENT_INITIALIZING, false);
16216 ceph_assert(iref_writer.is_first_writer());
16217
e306af50 16218 _pre_init();
7c673cae
FG
16219 objecter->init();
16220
9f95a23c 16221 client_lock.lock();
7c673cae
FG
16222
16223 messenger->add_dispatcher_tail(objecter);
16224 messenger->add_dispatcher_tail(this);
16225
16226 monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
16227 int r = monclient->init();
16228 if (r < 0) {
16229 // need to do cleanup because we're in an intermediate init state
f67539c2
TL
16230 {
16231 std::scoped_lock l(timer_lock);
16232 timer.shutdown();
16233 }
16234
9f95a23c 16235 client_lock.unlock();
7c673cae
FG
16236 objecter->shutdown();
16237 objectcacher->stop();
16238 monclient->shutdown();
16239 return r;
16240 }
16241 objecter->start();
16242
9f95a23c 16243 client_lock.unlock();
7c673cae 16244 _finish_init();
f67539c2 16245 iref_writer.update_state(CLIENT_INITIALIZED);
7c673cae
FG
16246
16247 return 0;
16248}
16249
16250void StandaloneClient::shutdown()
16251{
16252 Client::shutdown();
16253 objecter->shutdown();
16254 monclient->shutdown();
16255}